From d1eca1bcf0b80d3a16756ceb69f976aa99757f16 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 27 Feb 2025 16:43:45 +0100
Subject: [PATCH 1/5] Update deprecated Python 3.8 typing

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 benchmarks/backend_request_func.py | 6 +-
 benchmarks/benchmark_guided.py | 17 +-
 benchmarks/benchmark_latency.py | 6 +-
 benchmarks/benchmark_prefix_caching.py | 16 +-
 benchmarks/benchmark_prioritization.py | 8 +-
 benchmarks/benchmark_serving.py | 77 +++----
 benchmarks/benchmark_serving_guided.py | 57 ++---
 benchmarks/benchmark_throughput.py | 38 ++--
 benchmarks/benchmark_utils.py | 8 +-
 .../cutlass_benchmarks/sparse_benchmarks.py | 9 +-
 benchmarks/cutlass_benchmarks/utils.py | 8 +-
 .../cutlass_benchmarks/w8a8_benchmarks.py | 17 +-
 .../fused_kernels/layernorm_rms_benchmarks.py | 5 +-
 benchmarks/kernels/benchmark_lora.py | 60 ++---
 benchmarks/kernels/benchmark_machete.py | 25 ++-
 benchmarks/kernels/benchmark_marlin.py | 6 +-
 benchmarks/kernels/benchmark_moe.py | 18 +-
 .../kernels/benchmark_paged_attention.py | 4 +-
 benchmarks/kernels/benchmark_rmsnorm.py | 4 +-
 benchmarks/kernels/benchmark_rope.py | 4 +-
 benchmarks/kernels/graph_machete_bench.py | 3 +-
 benchmarks/kernels/utils.py | 3 +-
 .../vllm_cutlass_library_extension.py | 14 +-
 csrc/quantization/machete/generate.py | 20 +-
 docs/source/conf.py | 3 +-
 docs/source/features/reasoning_outputs.md | 4 +-
 docs/source/features/structured_outputs.md | 2 +-
 docs/source/generate_examples.py | 2 +-
 examples/offline_inference/distributed.py | 10 +-
 .../offline_inference/llm_engine_example.py | 7 +-
 .../lora_with_quantization_inference.py | 8 +-
 examples/offline_inference/mlpspeculator.py | 3 +-
 .../offline_inference/multilora_inference.py | 8 +-
 .../prithvi_geospatial_mae.py | 8 +-
 examples/offline_inference/profiling.py | 15 +-
 .../profiling_tpu/profiling.py | 3 +-
 .../vision_language_multi_image.py | 34 +--
 examples/online_serving/api_client.py | 6 +-
 .../online_serving/openai_embedding_client.py | 2 +-
 pyproject.toml | 2 -
 setup.py | 7 +-
 tests/async_engine/api_server_async_engine.py | 5 +-
 tests/async_engine/test_async_llm_engine.py | 4 +-
 tests/compile/piecewise/test_toy_llama.py | 6 +-
 tests/compile/test_basic_correctness.py | 8 +-
 tests/conftest.py | 159 +++++++------
 tests/core/block/e2e/conftest.py | 3 +-
 .../e2e/test_correctness_sliding_window.py | 11 +-
 tests/core/block/test_block_table.py | 8 +-
 tests/core/block/test_naive_block.py | 4 +-
 tests/core/block/test_prefix_caching_block.py | 16 +-
 tests/core/test_chunked_prefill_scheduler.py | 25 +--
 tests/core/test_scheduler.py | 19 +-
 tests/core/test_scheduler_encoder_decoder.py | 4 +-
 tests/core/utils.py | 21 +-
 tests/distributed/test_expert_parallel.py | 6 +-
 tests/distributed/test_pipeline_parallel.py | 8 +-
 tests/distributed/test_pynccl.py | 5 +-
 tests/distributed/test_shm_broadcast.py | 3 +-
 tests/encoder_decoder/test_e2e_correctness.py | 4 +-
 tests/engine/test_executor.py | 6 +-
 tests/engine/test_multiproc_workers.py | 6 +-
 tests/engine/test_stop_strings.py | 6 +-
 tests/entrypoints/llm/test_chat.py | 4 +-
 tests/entrypoints/llm/test_encode.py | 5 +-
 tests/entrypoints/llm/test_generate.py | 3 +-
 .../test_transcription_api_correctness.py | 3 +-
 .../test_deepseekr1_reasoning_parser.py | 4 +-
 .../openai/reasoning_parsers/utils.py | 14 +-
 tests/entrypoints/openai/test_audio.py | 16 +-
 tests/entrypoints/openai/test_basic.py | 3 +-
 tests/entrypoints/openai/test_chat.py | 8 +-
tests/entrypoints/openai/test_completion.py | 8 +- tests/entrypoints/openai/test_embedding.py | 4 +- tests/entrypoints/openai/test_pooling.py | 4 +- tests/entrypoints/openai/test_root_path.py | 4 +- tests/entrypoints/openai/test_video.py | 12 +- tests/entrypoints/openai/test_vision.py | 12 +- .../openai/test_vision_embedding.py | 4 +- .../tool_parsers/test_pythonic_tool_parser.py | 3 +- .../entrypoints/openai/tool_parsers/utils.py | 9 +- tests/kernels/quant_utils.py | 6 +- tests/kernels/test_activation.py | 3 +- tests/kernels/test_attention.py | 16 +- tests/kernels/test_blocksparse_attention.py | 12 +- tests/kernels/test_cache.py | 5 +- tests/kernels/test_cascade_flash_attn.py | 8 +- tests/kernels/test_cutlass.py | 11 +- tests/kernels/test_cutlass_2of4_sparse.py | 5 +- tests/kernels/test_encoder_decoder_attn.py | 4 +- tests/kernels/test_flash_attn.py | 16 +- tests/kernels/test_flashinfer.py | 22 +- tests/kernels/test_fused_quant_layernorm.py | 12 +- tests/kernels/test_gguf.py | 3 +- tests/kernels/test_machete_mm.py | 14 +- tests/kernels/test_mamba_mixer2.py | 3 +- tests/kernels/test_mamba_ssm_ssd.py | 8 +- tests/kernels/test_pos_encoding.py | 6 +- tests/kernels/test_triton_scaled_mm.py | 4 +- tests/kernels/utils.py | 68 +++--- tests/kv_transfer/test_send_recv.py | 3 +- tests/lora/conftest.py | 6 +- tests/lora/data/long_context_test_data.py | 4 +- tests/lora/test_add_lora.py | 9 +- tests/lora/test_baichuan.py | 6 +- tests/lora/test_chatglm3_tp.py | 6 +- tests/lora/test_gemma.py | 6 +- tests/lora/test_jamba.py | 6 +- tests/lora/test_layers.py | 48 ++-- tests/lora/test_llama_tp.py | 6 +- tests/lora/test_long_context.py | 16 +- tests/lora/test_lora_bias_e2e.py | 6 +- tests/lora/test_lora_checkpoints.py | 6 +- tests/lora/test_lora_functions.py | 5 +- tests/lora/test_lora_huggingface.py | 4 +- tests/lora/test_lora_manager.py | 7 +- tests/lora/test_minicpmv_tp.py | 6 +- tests/lora/test_mixtral.py | 6 +- tests/lora/test_phi.py | 6 +- tests/lora/test_punica_ops.py | 5 +- tests/lora/test_quant_model.py | 7 +- tests/lora/test_qwen2vl.py | 10 +- tests/lora/test_ultravox.py | 7 +- tests/lora/utils.py | 14 +- tests/metrics/test_metrics.py | 3 +- tests/mistral_tool_use/utils.py | 8 +- .../model_executor/test_enabled_custom_ops.py | 4 +- .../audio_language/test_ultravox.py | 16 +- .../models/decoder_only/language/test_gguf.py | 6 +- .../decoder_only/language/test_modelopt.py | 3 +- .../decoder_only/vision_language/test_awq.py | 6 +- .../vision_language/test_models.py | 39 ++-- .../vision_language/test_phi3v.py | 10 +- .../vision_language/test_pixtral.py | 12 +- .../vision_language/test_qwen2_vl.py | 46 ++-- .../vision_language/vlm_utils/builders.py | 7 +- .../vlm_utils/case_filtering.py | 10 +- .../vision_language/vlm_utils/core.py | 22 +- .../vision_language/vlm_utils/model_utils.py | 16 +- .../vision_language/vlm_utils/runners.py | 21 +- .../vision_language/vlm_utils/types.py | 36 +-- .../models/embedding/language/test_gritlm.py | 11 +- tests/models/embedding/utils.py | 6 +- .../vision_language/test_dse_qwen2_vl.py | 12 +- .../vision_language/test_llava_next.py | 8 +- .../embedding/vision_language/test_phi3v.py | 8 +- .../encoder_decoder/language/test_bart.py | 10 +- .../vision_language/test_florence2.py | 8 +- .../vision_language/test_mllama.py | 36 +-- .../multimodal/processing/test_h2ovl.py | 3 +- .../multimodal/processing/test_internvl.py | 3 +- tests/models/registry.py | 5 +- tests/models/test_transformers.py | 15 +- tests/models/utils.py | 21 +- tests/mq_llm_engine/utils.py | 4 +- 
.../multi_step/test_correctness_async_llm.py | 4 +- tests/multimodal/test_utils.py | 8 +- tests/neuron/test_logits_processor.py | 3 +- .../my_gemma_embedding.py | 5 +- tests/quantization/test_configs.py | 3 +- .../test_register_quantization_config.py | 8 +- tests/samplers/test_logprobs.py | 4 +- tests/samplers/test_no_bad_words.py | 16 +- tests/samplers/test_rejection_sampler.py | 11 +- tests/samplers/test_sampler.py | 44 ++-- tests/spec_decode/e2e/conftest.py | 9 +- tests/spec_decode/test_batch_expansion.py | 4 +- tests/spec_decode/test_multi_step_worker.py | 15 +- tests/spec_decode/test_scorer.py | 3 +- tests/spec_decode/test_spec_decode_worker.py | 11 +- tests/spec_decode/utils.py | 33 ++- tests/test_cache_block_hashing.py | 6 +- tests/test_inputs.py | 4 +- tests/test_logger.py | 2 +- tests/test_logits_processor.py | 3 +- tests/test_utils.py | 4 +- tests/tokenization/test_detokenize.py | 23 +- tests/tokenization/test_tokenizer_group.py | 4 +- tests/tokenization/test_tokenizer_registry.py | 32 +-- tests/tool_use/test_chat_completions.py | 6 +- tests/tool_use/test_jamba_tool_parser.py | 13 +- tests/tool_use/test_parallel_tool_calls.py | 10 +- tests/tool_use/test_tool_calls.py | 10 +- tests/tool_use/utils.py | 26 +-- tests/tracing/test_tracing.py | 5 +- tests/utils.py | 42 ++-- tests/v1/core/test_scheduler.py | 6 +- tests/v1/engine/conftest.py | 6 +- tests/v1/engine/test_async_llm.py | 10 +- tests/v1/engine/test_engine_core.py | 3 +- tests/v1/engine/test_engine_core_client.py | 10 +- tests/v1/engine/test_llm_engine.py | 8 +- tests/v1/engine/test_output_processor.py | 12 +- tests/v1/engine/utils.py | 50 ++--- .../v1/entrypoints/openai/test_completion.py | 12 +- tests/v1/sample/test_logprobs.py | 9 +- tests/v1/sample/test_rejection_sampler.py | 7 +- tests/v1/sample/test_sampler.py | 26 +-- tests/v1/sample/utils.py | 5 +- tests/v1/test_utils.py | 6 +- tests/v1/worker/test_gpu_input_batch.py | 22 +- .../vllm_test_utils/vllm_test_utils/blame.py | 3 +- .../vllm_test_utils/monitor.py | 3 +- .../test_encoder_decoder_model_runner.py | 21 +- tests/worker/test_model_input.py | 11 +- tests/worker/test_model_runner.py | 20 +- tools/profiler/print_layerwise_table.py | 3 +- tools/profiler/visualize_layerwise_profile.py | 14 +- vllm/_custom_ops.py | 52 ++--- vllm/_ipex_ops.py | 8 +- vllm/adapter_commons/layers.py | 5 +- vllm/adapter_commons/models.py | 10 +- vllm/adapter_commons/utils.py | 18 +- vllm/adapter_commons/worker_manager.py | 6 +- vllm/assets/video.py | 6 +- vllm/attention/backends/abstract.py | 31 ++- vllm/attention/backends/blocksparse_attn.py | 26 +-- vllm/attention/backends/flash_attn.py | 40 ++-- vllm/attention/backends/flashinfer.py | 52 ++--- vllm/attention/backends/flashmla.py | 18 +- vllm/attention/backends/hpu_attn.py | 20 +- vllm/attention/backends/ipex_attn.py | 30 +-- vllm/attention/backends/mla/common.py | 45 ++-- vllm/attention/backends/openvino.py | 14 +- vllm/attention/backends/pallas.py | 20 +- vllm/attention/backends/placeholder_attn.py | 26 +-- vllm/attention/backends/rocm_flash_attn.py | 30 +-- vllm/attention/backends/torch_sdpa.py | 46 ++-- vllm/attention/backends/triton_mla.py | 8 +- vllm/attention/backends/utils.py | 42 ++-- vllm/attention/backends/xformers.py | 38 ++-- vllm/attention/layer.py | 6 +- vllm/attention/ops/flashmla.py | 12 +- vllm/attention/ops/hpu_paged_attn.py | 14 +- vllm/attention/ops/ipex_attn.py | 14 +- vllm/attention/ops/paged_attn.py | 10 +- vllm/attention/selector.py | 7 +- vllm/beam_search.py | 18 +- vllm/compilation/backends.py | 33 +-- 
vllm/compilation/compiler_interface.py | 28 +-- vllm/compilation/decorators.py | 8 +- vllm/compilation/fix_functionalization.py | 15 +- vllm/compilation/fusion.py | 10 +- vllm/compilation/fx_utils.py | 3 +- vllm/compilation/multi_output_match.py | 8 +- vllm/compilation/pass_manager.py | 6 +- vllm/compilation/wrapper.py | 4 +- vllm/config.py | 141 ++++++------ vllm/connections.py | 3 +- vllm/core/block/block_table.py | 48 ++-- vllm/core/block/common.py | 37 +-- vllm/core/block/cpu_gpu_block_allocator.py | 60 ++--- vllm/core/block/interfaces.py | 64 +++--- vllm/core/block/naive_block.py | 57 ++--- vllm/core/block/prefix_caching_block.py | 78 +++---- vllm/core/block_manager.py | 27 ++- vllm/core/evictor.py | 9 +- vllm/core/interfaces.py | 14 +- vllm/core/placeholder_block_space_manager.py | 12 +- vllm/core/scheduler.py | 154 ++++++------- vllm/device_allocator/cumem.py | 10 +- vllm/distributed/communication_op.py | 4 +- .../device_communicators/cuda_wrapper.py | 8 +- .../device_communicators/custom_all_reduce.py | 8 +- .../custom_all_reduce_utils.py | 9 +- .../device_communicators/pynccl_wrapper.py | 10 +- .../device_communicators/shm_broadcast.py | 8 +- .../kv_transfer/kv_connector/base.py | 14 +- .../kv_transfer/kv_connector/factory.py | 6 +- .../kv_connector/lmcache_connector.py | 8 +- .../kv_connector/simple_connector.py | 10 +- .../kv_transfer/kv_lookup_buffer/base.py | 6 +- .../kv_lookup_buffer/simple_buffer.py | 14 +- .../kv_transfer/kv_pipe/pynccl_pipe.py | 6 +- .../kv_transfer/kv_transfer_agent.py | 8 +- vllm/distributed/parallel_state.py | 43 ++-- vllm/distributed/utils.py | 13 +- vllm/engine/arg_utils.py | 30 +-- vllm/engine/async_llm_engine.py | 28 +-- vllm/engine/async_timeout.py | 8 +- vllm/engine/llm_engine.py | 122 +++++----- vllm/engine/metrics.py | 49 ++-- vllm/engine/metrics_types.py | 42 ++-- vllm/engine/multiprocessing/__init__.py | 5 +- vllm/engine/multiprocessing/client.py | 10 +- vllm/engine/multiprocessing/engine.py | 5 +- vllm/engine/output_processor/interfaces.py | 8 +- vllm/engine/output_processor/multi_step.py | 12 +- vllm/engine/output_processor/single_step.py | 8 +- vllm/engine/output_processor/stop_checker.py | 6 +- vllm/engine/output_processor/util.py | 9 +- vllm/engine/protocol.py | 5 +- vllm/entrypoints/api_server.py | 3 +- vllm/entrypoints/chat_utils.py | 49 ++-- vllm/entrypoints/cli/openai.py | 10 +- vllm/entrypoints/cli/serve.py | 3 +- vllm/entrypoints/llm.py | 210 +++++++++--------- vllm/entrypoints/logger.py | 4 +- vllm/entrypoints/openai/api_server.py | 9 +- vllm/entrypoints/openai/cli_args.py | 7 +- vllm/entrypoints/openai/logits_processors.py | 23 +- vllm/entrypoints/openai/protocol.py | 128 +++++------ .../abs_reasoning_parsers.py | 21 +- .../deepseek_r1_reasoning_parser.py | 5 +- vllm/entrypoints/openai/run_batch.py | 9 +- vllm/entrypoints/openai/serving_chat.py | 31 ++- vllm/entrypoints/openai/serving_completion.py | 32 +-- vllm/entrypoints/openai/serving_embedding.py | 15 +- vllm/entrypoints/openai/serving_engine.py | 43 ++-- vllm/entrypoints/openai/serving_models.py | 10 +- vllm/entrypoints/openai/serving_pooling.py | 15 +- vllm/entrypoints/openai/serving_score.py | 49 ++-- .../openai/serving_tokenization.py | 4 +- .../openai/serving_transcription.py | 3 +- .../tool_parsers/abstract_tool_parser.py | 21 +- .../granite_20b_fc_tool_parser.py | 5 +- .../tool_parsers/granite_tool_parser.py | 5 +- .../openai/tool_parsers/hermes_tool_parser.py | 7 +- .../tool_parsers/internlm2_tool_parser.py | 5 +- .../openai/tool_parsers/jamba_tool_parser.py | 11 +- 
.../openai/tool_parsers/llama_tool_parser.py | 11 +- .../tool_parsers/mistral_tool_parser.py | 13 +- .../tool_parsers/pythonic_tool_parser.py | 5 +- vllm/entrypoints/openai/tool_parsers/utils.py | 6 +- vllm/entrypoints/score_utils.py | 14 +- vllm/envs.py | 8 +- vllm/executor/executor_base.py | 32 +-- vllm/executor/mp_distributed_executor.py | 16 +- vllm/executor/msgspec_utils.py | 4 +- vllm/executor/multiproc_worker_utils.py | 7 +- vllm/executor/ray_distributed_executor.py | 30 +-- vllm/executor/ray_utils.py | 18 +- vllm/executor/uniproc_executor.py | 8 +- vllm/forward_context.py | 6 +- vllm/inputs/data.py | 54 ++--- vllm/inputs/parse.py | 17 +- vllm/inputs/preprocess.py | 23 +- vllm/inputs/registry.py | 5 +- vllm/logger.py | 2 +- vllm/logits_process.py | 16 +- vllm/lora/fully_sharded_layers.py | 22 +- vllm/lora/layers.py | 66 +++--- vllm/lora/lora.py | 14 +- vllm/lora/models.py | 56 ++--- vllm/lora/ops/triton_ops/sgmv_expand.py | 8 +- vllm/lora/ops/triton_ops/sgmv_shrink.py | 8 +- vllm/lora/ops/triton_ops/utils.py | 11 +- vllm/lora/peft_helper.py | 4 +- vllm/lora/punica_wrapper/punica_base.py | 82 +++---- vllm/lora/punica_wrapper/punica_cpu.py | 46 ++-- vllm/lora/punica_wrapper/punica_gpu.py | 48 ++-- vllm/lora/punica_wrapper/punica_hpu.py | 30 +-- vllm/lora/punica_wrapper/utils.py | 20 +- vllm/lora/utils.py | 18 +- vllm/lora/worker_manager.py | 22 +- vllm/model_executor/custom_op.py | 4 +- .../guided_decoding/guided_fields.py | 10 +- .../guided_decoding/outlines_decoding.py | 4 +- .../outlines_logits_processors.py | 16 +- .../guided_decoding/xgrammar_decoding.py | 4 +- .../layers/fused_moe/__init__.py | 6 +- .../layers/fused_moe/fused_moe.py | 38 ++-- vllm/model_executor/layers/fused_moe/layer.py | 4 +- vllm/model_executor/layers/layernorm.py | 16 +- .../layers/mamba/mamba_mixer2.py | 6 +- vllm/model_executor/layers/pooler.py | 8 +- .../layers/quantization/__init__.py | 8 +- .../layers/quantization/aqlm.py | 14 +- .../model_executor/layers/quantization/awq.py | 14 +- .../layers/quantization/awq_marlin.py | 16 +- .../layers/quantization/base_config.py | 16 +- .../layers/quantization/bitsandbytes.py | 14 +- .../compressed_tensors/compressed_tensors.py | 38 ++-- .../compressed_tensors_moe.py | 6 +- .../schemes/compressed_tensors_24.py | 12 +- .../schemes/compressed_tensors_w4a16_24.py | 4 +- .../schemes/compressed_tensors_w8a16_fp8.py | 4 +- .../schemes/compressed_tensors_w8a8_fp8.py | 4 +- .../schemes/compressed_tensors_w8a8_int8.py | 6 +- .../schemes/compressed_tensors_wNa16.py | 6 +- .../compressed_tensors/triton_scaled_mm.py | 4 +- .../quantization/compressed_tensors/utils.py | 11 +- .../layers/quantization/deepspeedfp.py | 10 +- .../layers/quantization/experts_int8.py | 8 +- .../layers/quantization/fbgemm_fp8.py | 12 +- .../model_executor/layers/quantization/fp8.py | 14 +- .../layers/quantization/gguf.py | 12 +- .../layers/quantization/gptq.py | 14 +- .../layers/quantization/gptq_marlin.py | 20 +- .../layers/quantization/gptq_marlin_24.py | 10 +- .../layers/quantization/hqq_marlin.py | 12 +- .../layers/quantization/ipex_quant.py | 10 +- .../kernels/mixed_precision/MPLinearKernel.py | 10 +- .../kernels/mixed_precision/__init__.py | 8 +- .../kernels/mixed_precision/exllama.py | 4 +- .../kernels/mixed_precision/machete.py | 4 +- .../kernels/mixed_precision/marlin.py | 4 +- .../kernels/scaled_mm/ScaledMMLinearKernel.py | 6 +- .../kernels/scaled_mm/__init__.py | 8 +- .../quantization/kernels/scaled_mm/cutlass.py | 4 +- .../quantization/kernels/scaled_mm/triton.py | 4 +- 
.../quantization/kernels/scaled_mm/xla.py | 4 +- .../layers/quantization/marlin.py | 10 +- .../layers/quantization/modelopt.py | 10 +- .../layers/quantization/moe_wna16.py | 16 +- .../layers/quantization/neuron_quant.py | 8 +- .../layers/quantization/ptpc_fp8.py | 6 +- .../model_executor/layers/quantization/qqq.py | 10 +- .../layers/quantization/quark/quark.py | 48 ++-- .../layers/quantization/quark/quark_moe.py | 4 +- .../quark/schemes/quark_w8a8_fp8.py | 4 +- .../quark/schemes/quark_w8a8_int8.py | 6 +- .../layers/quantization/quark/utils.py | 5 +- .../layers/quantization/schema.py | 4 +- .../layers/quantization/tpu_int8.py | 12 +- .../layers/quantization/utils/fp8_utils.py | 22 +- .../layers/quantization/utils/gptq_utils.py | 4 +- .../quantization/utils/machete_utils.py | 8 +- .../layers/quantization/utils/marlin_utils.py | 12 +- .../quantization/utils/marlin_utils_test.py | 6 +- .../utils/marlin_utils_test_24.py | 9 +- .../utils/marlin_utils_test_qqq.py | 10 +- .../layers/quantization/utils/quant_utils.py | 17 +- .../layers/quantization/utils/w8a8_utils.py | 8 +- .../layers/rejection_sampler.py | 18 +- vllm/model_executor/layers/resampler.py | 10 +- .../model_executor/layers/rotary_embedding.py | 68 +++--- vllm/model_executor/layers/sampler.py | 65 +++--- .../layers/spec_decode_base_sampler.py | 4 +- vllm/model_executor/layers/utils.py | 3 +- .../layers/vocab_parallel_embedding.py | 15 +- vllm/model_executor/model_loader/loader.py | 54 ++--- vllm/model_executor/model_loader/neuron.py | 6 +- vllm/model_executor/model_loader/openvino.py | 4 +- .../model_executor/model_loader/tensorizer.py | 7 +- vllm/model_executor/model_loader/utils.py | 12 +- .../model_loader/weight_utils.py | 41 ++-- vllm/model_executor/models/arctic.py | 13 +- vllm/model_executor/models/aria.py | 22 +- vllm/model_executor/models/baichuan.py | 11 +- vllm/model_executor/models/bamba.py | 13 +- vllm/model_executor/models/bart.py | 7 +- vllm/model_executor/models/bert.py | 13 +- vllm/model_executor/models/blip.py | 9 +- vllm/model_executor/models/blip2.py | 10 +- vllm/model_executor/models/bloom.py | 9 +- vllm/model_executor/models/chameleon.py | 22 +- vllm/model_executor/models/chatglm.py | 11 +- vllm/model_executor/models/clip.py | 9 +- vllm/model_executor/models/commandr.py | 11 +- vllm/model_executor/models/dbrx.py | 9 +- vllm/model_executor/models/decilm.py | 8 +- vllm/model_executor/models/deepseek.py | 11 +- vllm/model_executor/models/deepseek_mtp.py | 9 +- vllm/model_executor/models/deepseek_v2.py | 13 +- vllm/model_executor/models/deepseek_vl2.py | 20 +- vllm/model_executor/models/eagle.py | 5 +- vllm/model_executor/models/exaone.py | 15 +- vllm/model_executor/models/fairseq2_llama.py | 8 +- vllm/model_executor/models/falcon.py | 9 +- vllm/model_executor/models/florence2.py | 27 +-- vllm/model_executor/models/fuyu.py | 10 +- vllm/model_executor/models/gemma.py | 11 +- vllm/model_executor/models/gemma2.py | 15 +- vllm/model_executor/models/glm4v.py | 3 +- vllm/model_executor/models/gpt2.py | 9 +- vllm/model_executor/models/gpt_bigcode.py | 9 +- vllm/model_executor/models/gpt_j.py | 9 +- vllm/model_executor/models/gpt_neox.py | 9 +- vllm/model_executor/models/granite.py | 13 +- vllm/model_executor/models/granitemoe.py | 7 +- vllm/model_executor/models/grok1.py | 15 +- vllm/model_executor/models/h2ovl.py | 3 +- .../models/idefics2_vision_model.py | 9 +- vllm/model_executor/models/idefics3.py | 14 +- vllm/model_executor/models/interfaces.py | 76 +++---- vllm/model_executor/models/interfaces_base.py | 26 +-- 
vllm/model_executor/models/intern_vit.py | 9 +- vllm/model_executor/models/internlm2.py | 19 +- vllm/model_executor/models/internlm2_ve.py | 4 +- vllm/model_executor/models/internvl.py | 10 +- vllm/model_executor/models/jais.py | 9 +- vllm/model_executor/models/jamba.py | 15 +- vllm/model_executor/models/llama.py | 21 +- vllm/model_executor/models/llava.py | 11 +- vllm/model_executor/models/llava_next.py | 17 +- .../model_executor/models/llava_next_video.py | 16 +- vllm/model_executor/models/llava_onevision.py | 26 +-- vllm/model_executor/models/mamba.py | 13 +- vllm/model_executor/models/mamba2.py | 13 +- vllm/model_executor/models/mamba_cache.py | 13 +- vllm/model_executor/models/medusa.py | 25 ++- vllm/model_executor/models/minicpm.py | 17 +- vllm/model_executor/models/minicpm3.py | 4 +- vllm/model_executor/models/minicpmo.py | 24 +- vllm/model_executor/models/minicpmv.py | 54 ++--- vllm/model_executor/models/mixtral.py | 9 +- vllm/model_executor/models/mixtral_quant.py | 9 +- vllm/model_executor/models/mllama.py | 72 +++--- vllm/model_executor/models/mlp_speculator.py | 10 +- vllm/model_executor/models/module_mapping.py | 18 +- vllm/model_executor/models/molmo.py | 46 ++-- vllm/model_executor/models/mpt.py | 9 +- vllm/model_executor/models/nemotron.py | 15 +- vllm/model_executor/models/nvlm_d.py | 3 +- vllm/model_executor/models/olmo.py | 11 +- vllm/model_executor/models/olmo2.py | 7 +- vllm/model_executor/models/olmoe.py | 11 +- vllm/model_executor/models/opt.py | 9 +- vllm/model_executor/models/orion.py | 13 +- vllm/model_executor/models/paligemma.py | 8 +- vllm/model_executor/models/persimmon.py | 9 +- vllm/model_executor/models/phi.py | 9 +- vllm/model_executor/models/phi3_small.py | 13 +- vllm/model_executor/models/phi3v.py | 14 +- vllm/model_executor/models/phimoe.py | 9 +- vllm/model_executor/models/pixtral.py | 31 +-- .../models/prithvi_geospatial_mae.py | 9 +- vllm/model_executor/models/qwen.py | 13 +- vllm/model_executor/models/qwen2.py | 19 +- vllm/model_executor/models/qwen2_5_vl.py | 20 +- vllm/model_executor/models/qwen2_audio.py | 8 +- vllm/model_executor/models/qwen2_moe.py | 11 +- vllm/model_executor/models/qwen2_rm.py | 7 +- vllm/model_executor/models/qwen2_vl.py | 22 +- vllm/model_executor/models/qwen_vl.py | 8 +- vllm/model_executor/models/registry.py | 58 ++--- vllm/model_executor/models/roberta.py | 11 +- vllm/model_executor/models/siglip.py | 11 +- vllm/model_executor/models/solar.py | 13 +- vllm/model_executor/models/stablelm.py | 11 +- vllm/model_executor/models/starcoder2.py | 9 +- vllm/model_executor/models/telechat2.py | 12 +- vllm/model_executor/models/transformers.py | 3 +- vllm/model_executor/models/ultravox.py | 8 +- vllm/model_executor/models/utils.py | 48 ++-- vllm/model_executor/models/whisper.py | 24 +- vllm/model_executor/pooling_metadata.py | 8 +- vllm/model_executor/sampling_metadata.py | 88 ++++---- vllm/model_executor/utils.py | 4 +- vllm/multimodal/base.py | 7 +- vllm/multimodal/hasher.py | 3 +- vllm/multimodal/image.py | 4 +- vllm/multimodal/registry.py | 14 +- vllm/multimodal/video.py | 4 +- vllm/outputs.py | 24 +- vllm/platforms/cuda.py | 11 +- vllm/platforms/interface.py | 4 +- vllm/platforms/rocm.py | 6 +- vllm/plugins/__init__.py | 4 +- vllm/profiler/layerwise_profile.py | 38 ++-- vllm/profiler/utils.py | 8 +- vllm/prompt_adapter/models.py | 16 +- vllm/prompt_adapter/worker_manager.py | 16 +- vllm/sampling_params.py | 53 +++-- vllm/sequence.py | 132 +++++------ vllm/spec_decode/batch_expansion.py | 43 ++-- 
vllm/spec_decode/draft_model_runner.py | 8 +- vllm/spec_decode/interfaces.py | 6 +- vllm/spec_decode/medusa_worker.py | 16 +- vllm/spec_decode/mlp_speculator_worker.py | 16 +- vllm/spec_decode/multi_step_worker.py | 39 ++-- vllm/spec_decode/ngram_worker.py | 14 +- vllm/spec_decode/proposer_worker_base.py | 10 +- .../spec_decode/smaller_tp_proposer_worker.py | 16 +- vllm/spec_decode/spec_decode_worker.py | 62 +++--- vllm/spec_decode/target_model_runner.py | 6 +- vllm/spec_decode/top1_proposer.py | 28 +-- vllm/spec_decode/util.py | 45 ++-- vllm/tracing.py | 3 +- vllm/transformers_utils/config.py | 10 +- vllm/transformers_utils/configs/arctic.py | 6 +- vllm/transformers_utils/configs/cohere2.py | 6 +- .../configs/deepseek_vl2.py | 5 +- vllm/transformers_utils/configs/exaone.py | 4 +- vllm/transformers_utils/configs/jais.py | 4 +- .../configs/mlp_speculator.py | 6 +- vllm/transformers_utils/configs/mpt.py | 18 +- vllm/transformers_utils/configs/olmo2.py | 2 +- vllm/transformers_utils/configs/solar.py | 2 +- vllm/transformers_utils/configs/ultravox.py | 10 +- vllm/transformers_utils/detokenizer.py | 6 +- vllm/transformers_utils/detokenizer_utils.py | 24 +- .../processors/deepseek_vl2.py | 29 ++- vllm/transformers_utils/tokenizer_base.py | 34 +-- .../tokenizer_group/__init__.py | 4 +- .../tokenizer_group/base_tokenizer_group.py | 6 +- .../tokenizer_group/ray_tokenizer_group.py | 6 +- .../tokenizer_group/tokenizer_group.py | 8 +- vllm/transformers_utils/tokenizers/mistral.py | 58 ++--- vllm/transformers_utils/utils.py | 4 +- vllm/usage/usage_lib.py | 14 +- vllm/utils.py | 76 +++---- vllm/v1/attention/backends/flash_attn.py | 16 +- vllm/v1/attention/backends/pallas.py | 20 +- vllm/v1/attention/backends/rocm_attn.py | 14 +- vllm/v1/core/encoder_cache_manager.py | 16 +- vllm/v1/core/kv_cache_manager.py | 31 +-- vllm/v1/core/kv_cache_utils.py | 32 +-- vllm/v1/core/scheduler.py | 45 ++-- vllm/v1/core/scheduler_output.py | 42 ++-- vllm/v1/engine/__init__.py | 16 +- vllm/v1/engine/async_llm.py | 9 +- vllm/v1/engine/core.py | 16 +- vllm/v1/engine/core_client.py | 34 +-- vllm/v1/engine/detokenizer.py | 12 +- vllm/v1/engine/llm_engine.py | 17 +- vllm/v1/engine/logprobs.py | 12 +- vllm/v1/engine/mm_input_cache.py | 18 +- vllm/v1/engine/output_processor.py | 20 +- vllm/v1/engine/parallel_sampling.py | 16 +- vllm/v1/engine/processor.py | 3 +- vllm/v1/executor/abstract.py | 10 +- vllm/v1/executor/multiproc_executor.py | 10 +- vllm/v1/kv_cache_interface.py | 7 +- vllm/v1/metrics/loggers.py | 18 +- vllm/v1/metrics/stats.py | 20 +- vllm/v1/outputs.py | 20 +- vllm/v1/request.py | 24 +- vllm/v1/sample/metadata.py | 10 +- vllm/v1/sample/ops/penalties.py | 12 +- vllm/v1/sample/ops/topk_topp_sampler.py | 10 +- vllm/v1/sample/rejection_sampler.py | 7 +- vllm/v1/stats/common.py | 18 +- vllm/v1/utils.py | 20 +- vllm/v1/worker/block_table.py | 6 +- vllm/v1/worker/gpu_input_batch.py | 62 +++--- vllm/v1/worker/gpu_model_runner.py | 36 +-- vllm/v1/worker/gpu_worker.py | 4 +- vllm/v1/worker/lora_model_runner_mixin.py | 17 +- vllm/v1/worker/tpu_model_runner.py | 26 +-- vllm/v1/worker/tpu_worker.py | 6 +- vllm/worker/cache_engine.py | 5 +- vllm/worker/cpu_enc_dec_model_runner.py | 30 +-- vllm/worker/cpu_model_runner.py | 77 ++++--- vllm/worker/cpu_pooling_model_runner.py | 24 +- vllm/worker/cpu_worker.py | 26 +-- vllm/worker/enc_dec_model_runner.py | 32 +-- vllm/worker/hpu_model_runner.py | 162 +++++++------- vllm/worker/hpu_worker.py | 22 +- vllm/worker/model_runner.py | 125 ++++++----- vllm/worker/model_runner_base.py | 39 
++-- vllm/worker/multi_step_model_runner.py | 37 ++- vllm/worker/multi_step_tpu_worker.py | 6 +- vllm/worker/multi_step_worker.py | 8 +- vllm/worker/neuron_model_runner.py | 44 ++-- vllm/worker/neuron_worker.py | 8 +- vllm/worker/openvino_model_runner.py | 38 ++-- vllm/worker/openvino_worker.py | 36 +-- vllm/worker/pooling_model_runner.py | 24 +- vllm/worker/tpu_model_runner.py | 65 +++--- vllm/worker/tpu_worker.py | 14 +- vllm/worker/worker.py | 26 +-- vllm/worker/worker_base.py | 50 ++--- vllm/worker/xpu_model_runner.py | 73 +++--- vllm/worker/xpu_worker.py | 8 +- 656 files changed, 5660 insertions(+), 5686 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 364b087b841d3..7b3371f92605d 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -6,7 +6,7 @@ import time import traceback from dataclasses import dataclass, field -from typing import List, Optional, Union +from typing import Optional, Union import aiohttp import huggingface_hub.constants @@ -39,8 +39,8 @@ class RequestFuncOutput: latency: float = 0.0 output_tokens: int = 0 ttft: float = 0.0 # Time to first token - itl: List[float] = field( - default_factory=list) # List of inter-token latencies + itl: list[float] = field( + default_factory=list) # list of inter-token latencies tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py index dc2bf0e79cbce..2e0f6c6b5d202 100644 --- a/benchmarks/benchmark_guided.py +++ b/benchmarks/benchmark_guided.py @@ -6,7 +6,6 @@ import os import random import time -from typing import List import datasets import pandas as pd @@ -39,7 +38,7 @@ class SampleRequest: completion: str = None -def run_vllm(requests: List[SampleRequest], +def run_vllm(requests: list[SampleRequest], engine_args: EngineArgs, n: int, guided_decoding_rate: float = 1.0, @@ -54,8 +53,8 @@ def run_vllm(requests: List[SampleRequest], " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: List[str] = [] - sampling_params: List[SamplingParams] = [] + prompts: list[str] = [] + sampling_params: list[SamplingParams] = [] # create a list containing random selected true or false guided_decoding_req_idx = random.sample( range(len(requests)), int(len(requests) * guided_decoding_rate)) @@ -110,7 +109,7 @@ def run_vllm(requests: List[SampleRequest], async def run_vllm_async( - requests: List[SampleRequest], + requests: list[SampleRequest], engine_args: AsyncEngineArgs, n: int, guided_decoding_rate: float = 1.0, @@ -129,8 +128,8 @@ async def run_vllm_async( " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. 
- prompts: List[str] = [] - sampling_params: List[SamplingParams] = [] + prompts: list[str] = [] + sampling_params: list[SamplingParams] = [] guided_decoding_req_idx = random.sample( range(len(requests)), int(len(requests) * guided_decoding_rate)) @@ -203,7 +202,7 @@ async def run_vllm_async( def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> List[SampleRequest]: + args: argparse.Namespace) -> list[SampleRequest]: if args.dataset == 'json': if args.json_schema_path is None: dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -287,7 +286,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, elif args.dataset == "xgrammar_bench": args.warmup = False - requests: List[SampleRequest] = [] + requests: list[SampleRequest] = [] dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train") print(f"dataset has {len(dataset)} entries") diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index c82358d14512c..d7f39f50f6ca1 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -7,7 +7,7 @@ import os import time from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Optional import numpy as np import torch @@ -22,7 +22,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: Dict[str, Any]) -> None: + results: dict[str, Any]) -> None: pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={"latency": results["latencies"]}, @@ -57,7 +57,7 @@ def main(args: argparse.Namespace): dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_prompts: List[PromptType] = [{ + dummy_prompts: list[PromptType] = [{ "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 23822856b8825..fba32520442f3 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -31,7 +31,7 @@ import json import random import time -from typing import List, Optional, Tuple +from typing import Optional from transformers import PreTrainedTokenizerBase @@ -77,9 +77,9 @@ def sample_requests_from_dataset( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, - input_length_range: Tuple[int, int], + input_length_range: tuple[int, int], fixed_output_len: Optional[int], -) -> List[Request]: +) -> list[Request]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -99,7 +99,7 @@ def sample_requests_from_dataset( assert min_len >= 0 and max_len >= min_len, "input_length_range too small" # Filter out sequences that are too long or too short - filtered_requests: List[Request] = [] + filtered_requests: list[Request] = [] for i in range(len(dataset)): if len(filtered_requests) == num_requests: @@ -122,10 +122,10 @@ def sample_requests_from_dataset( def sample_requests_from_random( num_requests: int, tokenizer: PreTrainedTokenizerBase, - input_length_range: Tuple[int, int], + input_length_range: tuple[int, int], fixed_output_len: Optional[int], prefix_len: int, -) -> List[Request]: +) -> list[Request]: requests = [] prefix_token_ids = sample_tokens(tokenizer, prefix_len) @@ -144,9 +144,9 @@ def sample_requests_from_random( return requests -def repeat_and_sort_requests(requests: List[Request], +def repeat_and_sort_requests(requests: list[Request], repeat_count: int, - sort: bool = False) -> 
List[str]: + sort: bool = False) -> list[str]: repeated_requests = requests * repeat_count if sort: repeated_requests.sort(key=lambda x: x[1]) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 24014e5b6c373..43b2c1b03323c 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -5,7 +5,7 @@ import json import random import time -from typing import List, Optional, Tuple +from typing import Optional from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -23,7 +23,7 @@ def sample_requests( num_requests: int, tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: +) -> list[tuple[str, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -40,7 +40,7 @@ def sample_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] + filtered_dataset: list[tuple[str, int, int]] = [] for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break @@ -68,7 +68,7 @@ def sample_requests( def run_vllm( - requests: List[Tuple[str, int, int]], + requests: list[tuple[str, int, int]], n: int, engine_args: EngineArgs, ) -> float: diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1bb83b082beb4..16ec0a4817a26 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -33,9 +33,10 @@ import random import time import warnings +from collections.abc import AsyncGenerator, Collection from dataclasses import dataclass from datetime import datetime -from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple +from typing import Any, Optional import numpy as np import pandas as pd @@ -73,22 +74,22 @@ class BenchmarkMetrics: mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float - percentiles_ttft_ms: List[Tuple[float, float]] + percentiles_ttft_ms: list[tuple[float, float]] mean_tpot_ms: float median_tpot_ms: float std_tpot_ms: float - percentiles_tpot_ms: List[Tuple[float, float]] + percentiles_tpot_ms: list[tuple[float, float]] mean_itl_ms: float median_itl_ms: float std_itl_ms: float - percentiles_itl_ms: List[Tuple[float, float]] + percentiles_itl_ms: list[tuple[float, float]] # E2EL stands for end-to-end latency per request. # It is the time taken on the client side from sending # a request to receiving a complete response. mean_e2el_ms: float median_e2el_ms: float std_e2el_ms: float - percentiles_e2el_ms: List[Tuple[float, float]] + percentiles_e2el_ms: list[tuple[float, float]] def sample_sharegpt_requests( @@ -96,7 +97,7 @@ def sample_sharegpt_requests( num_requests: int, tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int] = None, -) -> List[Tuple[str, int, int, None]]: +) -> list[tuple[str, int, int, None]]: # Load the dataset. 
with open(dataset_path, encoding='utf-8') as f: dataset = json.load(f) @@ -110,7 +111,7 @@ def sample_sharegpt_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] + filtered_dataset: list[tuple[str, int, int]] = [] for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break @@ -139,7 +140,7 @@ def sample_burstgpt_requests( num_requests: int, random_seed: int, tokenizer: PreTrainedTokenizerBase, -) -> List[Tuple[str, int, int, None]]: +) -> list[tuple[str, int, int, None]]: df = pd.read_csv(dataset_path) gpt4_df = df[df["Model"] == "GPT-4"] # Remove the failed requests (i.e., response length is 0) @@ -170,7 +171,7 @@ def sample_sonnet_requests( output_len: int, prefix_len: int, tokenizer: PreTrainedTokenizerBase, -) -> List[Tuple[str, str, int, int, None]]: +) -> list[tuple[str, str, int, int, None]]: assert ( input_len > prefix_len ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'." @@ -211,7 +212,7 @@ def sample_sonnet_requests( prefix_lines = poem_lines[:num_prefix_lines] # Sample the rest of lines per request. - sampled_requests: List[Tuple[str, int, int]] = [] + sampled_requests: list[tuple[str, int, int]] = [] for _ in range(num_requests): num_lines_needed = num_input_lines - num_prefix_lines sampled_lines = "".join(prefix_lines + @@ -238,8 +239,8 @@ def sample_vision_arena_requests( num_requests: int, tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int] = None, -) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: - sampled_requests: List[Tuple[str, int, int, Dict[str, +) -> list[tuple[str, str, int, Optional[dict[str, Collection[str]]]]]: + sampled_requests: list[tuple[str, int, int, dict[str, Collection[str]]]] = [] for data in dataset: if len(sampled_requests) == num_requests: @@ -285,7 +286,7 @@ def sample_hf_requests( tokenizer: PreTrainedTokenizerBase, random_seed: int, fixed_output_len: Optional[int] = None, -) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: +) -> list[tuple[str, str, int, Optional[dict[str, Collection[str]]]]]: # Special case for vision_arena dataset if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \ @@ -307,7 +308,7 @@ def sample_hf_requests( "HF Dataset must have 'conversations' column.") filter_func = lambda x: len(x["conversations"]) >= 2 filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func) - sampled_requests: List[Tuple[str, int, int, Dict[str, + sampled_requests: list[tuple[str, int, int, dict[str, Collection[str]]]] = [] for data in filtered_dataset: if len(sampled_requests) == num_requests: @@ -370,7 +371,7 @@ def sample_random_requests( num_prompts: int, range_ratio: float, tokenizer: PreTrainedTokenizerBase, -) -> List[Tuple[str, int, int]]: +) -> list[tuple[str, int, int]]: prefix_token_ids = np.random.randint(0, tokenizer.vocab_size, size=prefix_len).tolist() @@ -399,10 +400,10 @@ def sample_random_requests( async def get_request( - input_requests: List[Tuple[str, int, int]], + input_requests: list[tuple[str, int, int]], request_rate: float, burstiness: float = 1.0, -) -> AsyncGenerator[Tuple[str, int, int], None]: +) -> AsyncGenerator[tuple[str, int, int], None]: """ Asynchronously generates requests at a specified rate with OPTIONAL burstiness. 
@@ -443,23 +444,23 @@ async def get_request( def calculate_metrics( - input_requests: List[Tuple[str, int, int]], - outputs: List[RequestFuncOutput], + input_requests: list[tuple[str, int, int]], + outputs: list[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, - selected_percentile_metrics: List[str], - selected_percentiles: List[float], - goodput_config_dict: Dict[str, float], -) -> Tuple[BenchmarkMetrics, List[int]]: - actual_output_lens: List[int] = [] + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + goodput_config_dict: dict[str, float], +) -> tuple[BenchmarkMetrics, list[int]]: + actual_output_lens: list[int] = [] total_input = 0 completed = 0 good_completed = 0 - itls: List[float] = [] - tpots: List[float] = [] - all_tpots: List[float] = [] - ttfts: List[float] = [] - e2els: List[float] = [] + itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + e2els: list[float] = [] for i in range(len(outputs)): if outputs[i].success: output_len = outputs[i].output_tokens @@ -557,19 +558,19 @@ async def benchmark( model_id: str, model_name: str, tokenizer: PreTrainedTokenizerBase, - input_requests: List[Tuple[str, int, int]], + input_requests: list[tuple[str, int, int]], logprobs: Optional[int], best_of: int, request_rate: float, burstiness: float, disable_tqdm: bool, profile: bool, - selected_percentile_metrics: List[str], - selected_percentiles: List[str], + selected_percentile_metrics: list[str], + selected_percentiles: list[str], ignore_eos: bool, - goodput_config_dict: Dict[str, float], + goodput_config_dict: dict[str, float], max_concurrency: Optional[int], - lora_modules: Optional[List[str]], + lora_modules: Optional[list[str]], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -652,7 +653,7 @@ async def limited_request_func(request_func_input, pbar): pbar=pbar) benchmark_start_time = time.perf_counter() - tasks: List[asyncio.Task] = [] + tasks: list[asyncio.Task] = [] async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request req_model_id, req_model_name = model_id, model_name @@ -674,7 +675,7 @@ async def limited_request_func(request_func_input, pbar): asyncio.create_task( limited_request_func(request_func_input=request_func_input, pbar=pbar))) - outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: print("Stopping profiler...") @@ -820,7 +821,7 @@ def parse_goodput(slo_pairs): def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: Dict[str, Any], + results: dict[str, Any], file_name: str) -> None: metrics = [ "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms", @@ -974,7 +975,7 @@ def main(args: argparse.Namespace): # Save config and results to json if args.save_result: - result_json: Dict[str, Any] = {} + result_json: dict[str, Any] = {} # Setup current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py index 05eadff797876..6c132d05f1b60 100644 --- a/benchmarks/benchmark_serving_guided.py +++ b/benchmarks/benchmark_serving_guided.py @@ -30,8 +30,9 @@ import random import time import warnings +from collections.abc import AsyncGenerator from dataclasses import dataclass -from typing import AsyncGenerator, Dict, List, Optional, Tuple +from typing import Optional import datasets 
import numpy as np @@ -66,22 +67,22 @@ class BenchmarkMetrics: mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float - percentiles_ttft_ms: List[Tuple[float, float]] + percentiles_ttft_ms: list[tuple[float, float]] mean_tpot_ms: float median_tpot_ms: float std_tpot_ms: float - percentiles_tpot_ms: List[Tuple[float, float]] + percentiles_tpot_ms: list[tuple[float, float]] mean_itl_ms: float median_itl_ms: float std_itl_ms: float - percentiles_itl_ms: List[Tuple[float, float]] + percentiles_itl_ms: list[tuple[float, float]] # E2EL stands for end-to-end latency per request. # It is the time taken on the client side from sending # a request to receiving a complete response. mean_e2el_ms: float median_e2el_ms: float std_e2el_ms: float - percentiles_e2el_ms: List[Tuple[float, float]] + percentiles_e2el_ms: list[tuple[float, float]] @dataclasses.dataclass @@ -104,7 +105,7 @@ class SampleRequest: def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> List[SampleRequest]: + args: argparse.Namespace) -> list[SampleRequest]: if args.dataset == 'json': if args.json_schema_path is None: dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -187,7 +188,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, ] elif args.dataset == "xgrammar_bench": - requests: List[SampleRequest] = [] + requests: list[SampleRequest] = [] dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train") print(f"dataset has {len(dataset)} entries") @@ -214,10 +215,10 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, async def get_request( - input_requests: List[SampleRequest], + input_requests: list[SampleRequest], request_rate: float, burstiness: float = 1.0, -) -> AsyncGenerator[Tuple[int, SampleRequest], None]: +) -> AsyncGenerator[tuple[int, SampleRequest], None]: """ Asynchronously generates requests at a specified rate with OPTIONAL burstiness. 
@@ -258,23 +259,23 @@ async def get_request( def calculate_metrics( - input_requests: List[Tuple[str, int, int]], - outputs: List[RequestFuncOutput], + input_requests: list[tuple[str, int, int]], + outputs: list[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, - selected_percentile_metrics: List[str], - selected_percentiles: List[float], - goodput_config_dict: Optional[Dict[str, float]] = None, -) -> Tuple[BenchmarkMetrics, List[int]]: - actual_output_lens: List[int] = [] + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + goodput_config_dict: Optional[dict[str, float]] = None, +) -> tuple[BenchmarkMetrics, list[int]]: + actual_output_lens: list[int] = [] total_input = 0 completed = 0 good_completed = 0 - itls: List[float] = [] - tpots: List[float] = [] - all_tpots: List[float] = [] - ttfts: List[float] = [] - e2els: List[float] = [] + itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + e2els: list[float] = [] for i in range(len(outputs)): if outputs[i].success: # We use the tokenizer to count the number of output tokens for all @@ -368,18 +369,18 @@ async def benchmark( base_url: str, model_id: str, tokenizer: PreTrainedTokenizerBase, - input_requests: List[SampleRequest], + input_requests: list[SampleRequest], request_rate: float, burstiness: float, disable_tqdm: bool, profile: bool, - selected_percentile_metrics: List[str], - selected_percentiles: List[str], + selected_percentile_metrics: list[str], + selected_percentiles: list[str], ignore_eos: bool, max_concurrency: Optional[int], guided_decoding_ratio: float, guided_decoding_backend: str, - goodput_config_dict: Optional[Dict[str, float]] = None, + goodput_config_dict: Optional[dict[str, float]] = None, ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -459,8 +460,8 @@ async def limited_request_func(request_func_input, pbar): pbar=pbar) benchmark_start_time = time.perf_counter() - tasks: List[asyncio.Task] = [] - expected: List[str] = [] + tasks: list[asyncio.Task] = [] + expected: list[str] = [] async for i, request in get_request(input_requests, request_rate, burstiness): extra_body = prepare_extra_body( @@ -479,7 +480,7 @@ async def limited_request_func(request_func_input, pbar): asyncio.create_task( limited_request_func(request_func_input=request_func_input, pbar=pbar))) - outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: print("Stopping profiler...") diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 04de08fa97c9c..aabce64ff776e 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -7,7 +7,7 @@ import random import time from functools import cache -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Optional import torch import uvloop @@ -74,12 +74,12 @@ def lora_path_on_disk(lora_path: str) -> str: return get_adapter_absolute_path(lora_path) -lora_tokenizer_cache: Dict[int, AnyTokenizer] = {} +lora_tokenizer_cache: dict[int, AnyTokenizer] = {} def get_random_lora_request( args: argparse.Namespace -) -> Tuple[LoRARequest, Optional[AnyTokenizer]]: +) -> tuple[LoRARequest, Optional[AnyTokenizer]]: global lora_tokenizer_cache lora_id = random.randint(1, args.max_loras) lora_request = LoRARequest(lora_name=str(lora_id), @@ -91,7 +91,7 @@ def get_random_lora_request( def sample_requests(tokenizer: 
PreTrainedTokenizerBase, - args: argparse.Namespace) -> List[SampleRequest]: + args: argparse.Namespace) -> list[SampleRequest]: dataset_path: str = args.dataset num_requests: int = args.num_prompts @@ -109,7 +109,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[SampleRequest] = [] + filtered_dataset: list[SampleRequest] = [] for data in tqdm(dataset, total=len(filtered_dataset), desc="sampling requests"): @@ -165,7 +165,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, def run_vllm( - requests: List[SampleRequest], + requests: list[SampleRequest], n: int, engine_args: EngineArgs, ) -> float: @@ -178,8 +178,8 @@ def run_vllm( "Please ensure that max_model_len is greater than the sum of" " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: List[TextPrompt] = [] - sampling_params: List[SamplingParams] = [] + prompts: list[TextPrompt] = [] + sampling_params: list[SamplingParams] = [] for request in requests: prompts.append( TextPrompt(prompt=request.prompt, @@ -192,7 +192,7 @@ def run_vllm( ignore_eos=True, max_tokens=request.expected_output_len, )) - lora_requests: Optional[List[LoRARequest]] = None + lora_requests: Optional[list[LoRARequest]] = None if engine_args.enable_lora: lora_requests = [request.lora_request for request in requests] @@ -225,7 +225,7 @@ def run_vllm( async def run_vllm_async( - requests: List[SampleRequest], + requests: list[SampleRequest], n: int, engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, @@ -242,9 +242,9 @@ async def run_vllm_async( " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: List[TextPrompt] = [] - sampling_params: List[SamplingParams] = [] - lora_requests: List[Optional[LoRARequest]] = [] + prompts: list[TextPrompt] = [] + sampling_params: list[SamplingParams] = [] + lora_requests: list[Optional[LoRARequest]] = [] for request in requests: prompts.append( TextPrompt(prompt=request.prompt, @@ -276,7 +276,7 @@ async def run_vllm_async( def run_hf( - requests: List[SampleRequest], + requests: list[SampleRequest], model: str, tokenizer: PreTrainedTokenizerBase, n: int, @@ -292,7 +292,7 @@ def run_hf( pbar = tqdm(total=len(requests)) start = time.perf_counter() - batch: List[str] = [] + batch: list[str] = [] max_prompt_len = 0 max_output_len = 0 for i in range(len(requests)): @@ -334,7 +334,7 @@ def run_hf( def run_mii( - requests: List[SampleRequest], + requests: list[SampleRequest], model: str, tensor_parallel_size: int, output_len: int, @@ -352,7 +352,7 @@ def run_mii( def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: Dict[str, Any]) -> None: + results: dict[str, Any]) -> None: pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={ @@ -479,8 +479,8 @@ def main(args: argparse.Namespace): type=str, default=None, help="Path to the dataset. 
The dataset is expected to " - "be a json in form of List[Dict[..., conversations: " - "List[Dict[..., value: ]]]]") + "be a json in form of list[dict[..., conversations: " + "list[dict[..., value: ]]]]") parser.add_argument("--input-len", type=int, default=None, diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index ac0688ca013f2..45a0ddbd5d08d 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -4,12 +4,12 @@ import json import math import os -from typing import Any, Dict, List +from typing import Any def convert_to_pytorch_benchmark_format(args: argparse.Namespace, - metrics: Dict[str, List], - extra_info: Dict[str, Any]) -> List: + metrics: dict[str, list], + extra_info: dict[str, Any]) -> list: """ Save the benchmark results in the format used by PyTorch OSS benchmark with on metric per record @@ -64,6 +64,6 @@ def iterencode(self, o: Any, *args, **kwargs) -> Any: return super().iterencode(self.clear_inf(o), *args, **kwargs) -def write_to_json(filename: str, records: List) -> None: +def write_to_json(filename: str, records: list) -> None: with open(filename, "w") as f: json.dump(records, f, cls=InfEncoder) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index 468a1b2868f0c..9e36b0a9d3bb9 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -5,7 +5,8 @@ import itertools import pickle as pkl import time -from typing import Callable, Iterable, List, Tuple +from collections.abc import Iterable +from typing import Callable import torch import torch.utils.benchmark as TBenchmark @@ -228,7 +229,7 @@ def print_timers(timers: Iterable[TMeasurement]): def run(dtype: torch.dtype, - MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", @@ -241,7 +242,7 @@ def run(dtype: torch.dtype, # output makers def make_output(data: Iterable[TMeasurement], - MKNs: Iterable[Tuple[int, int, int]], + MKNs: Iterable[tuple[int, int, int]], base_description: str, timestamp=None): print(f"== All Results {base_description} ====") @@ -282,7 +283,7 @@ def run_model_bench(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: KNs = [] for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): KN[tp_split_dim] = KN[tp_split_dim] // tp_size diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py index bab377800729b..fe4d8fdfc0669 100644 --- a/benchmarks/cutlass_benchmarks/utils.py +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Cutlass bench utils -from typing import Iterable, Tuple +from collections.abc import Iterable import torch @@ -27,7 +27,7 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor: def make_rand_tensors(dtype: torch.dtype, m: int, n: int, - k: int) -> Tuple[torch.Tensor, torch.Tensor]: + k: int) -> tuple[torch.Tensor, torch.Tensor]: a = torch.randn((m, k), device='cuda') * 5 b = torch.randn((n, k), device='cuda').t() * 5 @@ -63,7 +63,7 @@ def prune_to_2_4(tensor): def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, - k: int) -> Tuple[torch.Tensor, torch.Tensor]: + k: int) -> 
tuple[torch.Tensor, torch.Tensor]: a = torch.randn((m, k), device='cuda') * 5 b = torch.randn((n, k), device='cuda').t() * 5 @@ -88,7 +88,7 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int) -> \ - Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: + tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: ABs = [] for _ in range(num_tensors): b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 6552b62dae881..e7b742d8bec93 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -5,7 +5,8 @@ import itertools import pickle as pkl import time -from typing import Callable, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -49,7 +50,7 @@ def bench_int8( n: int, label: str, sub_label: str, - bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: + bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: """Benchmark INT8-based kernels.""" assert dtype == torch.int8 a, b = make_rand_tensors(torch.int8, m, n, k) @@ -101,7 +102,7 @@ def bench_fp8( n: int, label: str, sub_label: str, - bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: + bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: """Benchmark FP8-based kernels.""" assert dtype == torch.float8_e4m3fn a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) @@ -180,7 +181,7 @@ def bench(dtype: torch.dtype, n: int, label: str, sub_label: str, - bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: + bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: if dtype == torch.int8: return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) if dtype == torch.float8_e4m3fn: @@ -195,8 +196,8 @@ def print_timers(timers: Iterable[TMeasurement]): def run(dtype: torch.dtype, - MKNs: Iterable[Tuple[int, int, int]], - bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: + MKNs: Iterable[tuple[int, int, int]], + bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: timers = bench(dtype, @@ -212,7 +213,7 @@ def run(dtype: torch.dtype, def make_output(data: Iterable[TMeasurement], - MKNs: Iterable[Tuple[int, int, int]], + MKNs: Iterable[tuple[int, int, int]], base_description: str, timestamp=None): print(f"== All Results {base_description} ====") @@ -248,7 +249,7 @@ def run_model_bench(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: KNs = [] for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): KN[tp_split_dim] = KN[tp_split_dim] // tp_size diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index c56cc743845e9..3da583a334480 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -2,9 +2,10 @@ import pickle as pkl import time +from collections.abc import Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, 
Iterable, List, Optional +from typing import Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -29,7 +30,7 @@ def description(self): f'x DT {self.dtype}') -def get_bench_params() -> List[bench_params_t]: +def get_bench_params() -> list[bench_params_t]: ## Test Fixtures NUM_TOKENS = [2**x for x in range(11)] HIDDEN_SIZES = list(range(1024, 8129, 1024)) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 1deb0026a6e5f..5eaeec017053c 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -9,7 +9,7 @@ from enum import Enum, auto from itertools import product from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -61,15 +61,15 @@ def make_rand_lora_weight_tensor(k: int, def make_rand_tensors( - a_shape: Tuple[int], - b_shape: Tuple[int], - c_shape: Tuple[int], + a_shape: tuple[int], + b_shape: tuple[int], + c_shape: tuple[int], a_dtype: torch.dtype, b_dtype: torch.dtype, c_dtype: torch.dtype, num_slices: int, device: str = "cuda", -) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: +) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]: """ Make LoRA input/output matrices. """ @@ -135,7 +135,7 @@ def make_token_lora_mapping(num_tokens: int, num_prompts: int, def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor, - lora_weights: List[torch.Tensor], + lora_weights: list[torch.Tensor], seq_lens_cpu: torch.Tensor, prompt_lora_mapping_cpu: torch.Tensor, scaling: float, add_inputs: Optional[bool]): @@ -204,7 +204,7 @@ def is_decode_op(self) -> bool: def is_expand_slice_fn(self) -> bool: return self in [OpType.BGMV_EXPAND_SLICE] - def num_slices(self) -> List[int]: + def num_slices(self) -> list[int]: if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]: # SGMV kernels supports slices return [1, 2, 3] @@ -215,7 +215,7 @@ def num_slices(self) -> List[int]: raise ValueError(f"Unrecognized OpType {self}") def mkn(self, batch_size: int, seq_length: int, hidden_size: int, - lora_rank: int) -> Tuple[int, int, int]: + lora_rank: int) -> tuple[int, int, int]: num_tokens = batch_size * seq_length if self.is_shrink_fn(): m = num_tokens @@ -230,7 +230,7 @@ def mkn(self, batch_size: int, seq_length: int, hidden_size: int, def matmul_dtypes( self, op_dtype: torch.dtype - ) -> Tuple[torch.dtype, torch.dtype, torch.dtype]: + ) -> tuple[torch.dtype, torch.dtype, torch.dtype]: """ return a type, b type and c type for A x B = C """ @@ -243,7 +243,7 @@ def matmul_dtypes( def matmul_shapes( self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int, num_loras: int, - num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]: + num_slices: int) -> tuple[tuple[int], tuple[int], tuple[int]]: """ Given num_slices, return the shapes of the A, B, and C matrices in A x B = C, for the op_type @@ -268,7 +268,7 @@ def matmul_shapes( def bench_fn(self) -> Callable: - def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]): + def emulate_bgmv_expand_slice(kwargs_list: list[dict[str, Any]]): for x in kwargs_list: bgmv_expand_slice(**x) @@ -285,7 +285,7 @@ def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]): raise ValueError(f"Unrecognized optype {self}") def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor, - lora_weights: List[torch.Tensor], + lora_weights: list[torch.Tensor], **kwargs) -> Callable: """Each 
benchmark operation expected the input, lora_weights and outputs in a slightly different format. Refer to self.matmul_shapes(). @@ -384,7 +384,7 @@ class BenchmarkTensors: """ # matmul tensors input: torch.Tensor - lora_weights_lst: List[torch.Tensor] + lora_weights_lst: list[torch.Tensor] output: torch.Tensor # metadata tensors seq_lens: torch.Tensor @@ -469,7 +469,7 @@ def to_device(tensor: torch.Tensor): for i in range(len(self.lora_weights_lst)): self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) - def metadata(self) -> Tuple[int, int, int]: + def metadata(self) -> tuple[int, int, int]: """ Return num_seqs, num_tokens and max_seq_len """ @@ -505,7 +505,7 @@ def convert_to_sgmv_benchmark_tensors(self): self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype) self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype) - def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]: + def as_sgmv_shrink_kwargs(self) -> dict[str, Any]: self.convert_to_sgmv_benchmark_tensors() self.sanity_check() self.to_device(self.input.device) @@ -540,7 +540,7 @@ def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]: 'scaling': 1.0, } - def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + def as_sgmv_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: self.convert_to_sgmv_benchmark_tensors() self.sanity_check() @@ -578,7 +578,7 @@ def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]: 'add_inputs': add_inputs, } - def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]: + def as_bgmv_shrink_kwargs(self) -> dict[str, Any]: assert len(self.lora_weights_lst) == 1 self.to_device(self.input.device) @@ -634,7 +634,7 @@ def as_bgmv_expand_kwargs(self, add_inputs: bool): 'add_inputs': add_inputs } - def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> dict[str, Any]: _, num_tokens, _, num_slices = self.metadata() # Sanity check shapes @@ -670,7 +670,7 @@ def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]: def bench_fn_kwargs(self, op_type: OpType, - add_inputs: Optional[bool] = None) -> Dict[str, Any]: + add_inputs: Optional[bool] = None) -> dict[str, Any]: if op_type.is_shrink_fn(): assert add_inputs is None else: @@ -734,7 +734,7 @@ def bench_optype(ctx: BenchmarkContext, assert expand_fn_add_inputs is not None # BenchmarkContext -> BenchmarkTensors - bench_tensors : List[BenchmarkTensors] = \ + bench_tensors : list[BenchmarkTensors] = \ [BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)] for bt in bench_tensors: bt.sanity_check() @@ -746,7 +746,7 @@ def bench_optype(ctx: BenchmarkContext, for bt in bench_tensors ]) - # BenchmarkTensors -> Dict (kwargs) + # BenchmarkTensors -> dict (kwargs) kwargs_list = [ bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs) for bt in bench_tensors @@ -841,7 +841,7 @@ def use_cuda_graph_recommendation() -> str: """ -def print_timers(timers: List[TMeasurement], +def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None): compare = TBenchmark.Compare(timers) compare.print() @@ -861,7 +861,7 @@ def print_timers(timers: List[TMeasurement], "small num_loras the goal should be to match the torch.mm numbers.") -def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]): +def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]): if args.cuda_graph_nops is not None: assert args.cuda_graph_nops > 0 @@ -873,7 +873,7 @@ def run(args: argparse.Namespace, bench_ctxs: 
List[BenchmarkContext]): timers = [] for bench_ctx in bench_ctxs: for seq_len in args.seq_lengths: - bench_ops: List[OpType] = [] + bench_ops: list[OpType] = [] if seq_len == 1: # bench all decode ops bench_ops = [op for op in args.op_types if op.is_decode_op()] @@ -921,10 +921,10 @@ def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]): pickle.dump(timers, f) -def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int], - args: argparse.Namespace) -> List[BenchmarkContext]: +def as_benchmark_contexts(hidden_sizes: list[int], lora_ranks: list[int], + args: argparse.Namespace) -> list[BenchmarkContext]: - ctxs: List[BenchmarkContext] = [] + ctxs: list[BenchmarkContext] = [] for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras, args.sort_by_lora_id): @@ -954,7 +954,7 @@ def run_list_bench(args: argparse.Namespace): f" LoRA Ranks {args.lora_ranks}") # Get all benchmarking contexts - bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args) run(args, bench_contexts) @@ -975,7 +975,7 @@ def run_range_bench(args: argparse.Namespace): f" LoRA Ranks {lora_ranks}") # Get all benchmarking contexts - bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args) run(args, bench_contexts) @@ -1002,7 +1002,7 @@ def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]: f" LoRA Ranks {args.lora_ranks}") # Get all benchmarking contexts - bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args) run(args, bench_contexts) diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 0301fee1a8864..3fa57bd7b2334 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -7,9 +7,10 @@ import os import pickle as pkl import time +from collections.abc import Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Iterable, List, Optional, Tuple +from typing import Callable, Optional import pandas as pd import torch @@ -102,8 +103,8 @@ def quantize_and_pack(atype: torch.dtype, return w_ref, w_q, w_s, w_zp -def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig, - group_size: Optional[int]) -> List[BenchmarkTensors]: +def create_bench_tensors(shape: tuple[int, int, int], types: TypeConfig, + group_size: Optional[int]) -> list[BenchmarkTensors]: m, n, k = shape # we want to make sure that weights don't fit into L2 cache between runs so @@ -114,7 +115,7 @@ def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig, a = rand_data((m, k), types.act_type, scale=5) - benchmark_tensors: List[BenchmarkTensors] = [] + benchmark_tensors: list[BenchmarkTensors] = [] for _ in range(num_weights): w = rand_data((k, n), types.act_type, scale=5) @@ -276,7 +277,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors, def bench_fns(label: str, sub_label: str, description: str, - fns: List[Callable]): + fns: list[Callable]): min_run_time = 1 if not NVTX_PROFILE else 0.1 res = TBenchmark.Timer( @@ -311,7 +312,7 @@ def bench(types: TypeConfig, n: 
int, label: str, sub_label: str, - sweep_schedules: bool = True) -> List[TMeasurement]: + sweep_schedules: bool = True) -> list[TMeasurement]: benchmark_tensors = create_bench_tensors((m, n, k), types, group_size) sub_label += f", L={len(benchmark_tensors)}" @@ -414,12 +415,12 @@ def bench(types: TypeConfig, # runner -def print_timers(timers: List[TMeasurement]): +def print_timers(timers: list[TMeasurement]): compare = TBenchmark.Compare(timers) compare.print() -def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: +def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]: types = TypeConfig( act_type=args.act_type, weight_type=scalar_types.uint4b8 if args.group_zero_type is None \ @@ -431,7 +432,7 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: token_scale_type=args.token_scale_type, ) - results: List[TMeasurement] = [] + results: list[TMeasurement] = [] for m, k, n in MKNs: timers = bench(types, args.group_size, @@ -449,8 +450,8 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: # output makers def make_output( - data: List[TMeasurement], - MKNs: Iterable[Tuple[int, int, int]], + data: list[TMeasurement], + MKNs: Iterable[tuple[int, int, int]], base_description: str, timestamp=None, ): @@ -497,7 +498,7 @@ def run_model_bench(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: KNs = [] for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): KN[tp_split_dim] = KN[tp_split_dim] // tp_size diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index c22e66c0b0c94..3a379e8200e2f 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import torch import torch.utils.benchmark as benchmark from benchmark_shapes import WEIGHT_SHAPES @@ -29,7 +27,7 @@ K_FULL_OPTS = [False, True] -def bench_run(results: List[benchmark.Measurement], model: str, +def bench_run(results: list[benchmark.Measurement], model: str, act_order: bool, is_k_full: bool, quant_type: ScalarType, group_size: int, size_m: int, size_k: int, size_n: int): label = "Quant Matmul" @@ -178,7 +176,7 @@ def main(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - results: List[benchmark.Measurement] = [] + results: list[benchmark.Measurement] = [] for model in args.models: for layer in WEIGHT_SHAPES[model]: diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 410750686ee10..c862dec81fccd 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -4,7 +4,7 @@ import time from datetime import datetime from itertools import product -from typing import Any, Dict, List, Tuple, TypedDict +from typing import Any, TypedDict import ray import torch @@ -132,7 +132,7 @@ def run(): start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) - latencies: List[float] = [] + latencies: list[float] = [] for i in range(num_iters): prepare(i) torch.cuda.synchronize() @@ -175,8 +175,8 @@ def get_rocm_tuning_space(use_fp16): return param_ranges -def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]: - configs: List[BenchmarkConfig] = [] +def 
get_configs_compute_bound(use_fp16) -> list[dict[str, int]]: + configs: list[BenchmarkConfig] = [] if current_platform.is_rocm(): param_ranges = get_rocm_tuning_space(use_fp16) @@ -335,7 +335,7 @@ def benchmark( dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - ) -> Tuple[Dict[str, int], float]: + ) -> tuple[dict[str, int], float]: current_platform.seed_everything(self.seed) dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, @@ -371,8 +371,8 @@ def tune( dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - search_space: List[Dict[str, int]], - ) -> Dict[str, int]: + search_space: list[dict[str, int]], + ) -> dict[str, int]: best_config = None best_time = float("inf") if current_platform.is_rocm(): @@ -434,7 +434,7 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: } -def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, +def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, topk: int, dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool) -> None: @@ -498,7 +498,7 @@ def main(args: argparse.Namespace): num_gpus = int(ray.available_resources()["GPU"]) workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] - def _distribute(method: str, inputs: List[Any]) -> List[Any]: + def _distribute(method: str, inputs: list[Any]) -> list[Any]: outputs = [] worker_idx = 0 for input_args in inputs: diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index daedaadb1a77b..d00e848243611 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -2,7 +2,7 @@ import random import time -from typing import List, Optional +from typing import Optional import torch @@ -54,7 +54,7 @@ def main( # Create the block tables. 
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables_lst: List[List[int]] = [] + block_tables_lst: list[list[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index dba153742da4f..010a38b752715 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import itertools -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch import triton @@ -22,7 +22,7 @@ def forward( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: orig_dtype = x.dtype x = x.to(torch.float32) if residual is not None: diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 8ee0212a0c11d..05d24fc4b16d4 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from itertools import accumulate -from typing import List, Optional +from typing import Optional import nvtx import torch @@ -39,7 +39,7 @@ def benchmark_rope_kernels_multi_lora( }) # non-batched RoPE takes only one scaling factor, we create multiple # instances to simulate the same behavior - non_batched_ropes: List[RotaryEmbedding] = [] + non_batched_ropes: list[RotaryEmbedding] = [] for scaling_factor in scaling_factors: non_batched_ropes.append( get_rope(head_size, rotary_dim, max_position, base, is_neox_style, diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 01d97d63d7cf0..bd62173a7b3a6 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -4,7 +4,6 @@ import pickle import re from collections import defaultdict -from typing import List import matplotlib.pyplot as plt import pandas as pd @@ -23,7 +22,7 @@ with open(args.filename, 'rb') as f: data = pickle.load(f) - raw_results: List[TMeasurement] = data["results"] + raw_results: list[TMeasurement] = data["results"] results = defaultdict(lambda: list()) for v in raw_results: diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index 7281707484921..ac64f786f1840 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import Any, Callable, Iterable, Optional +from collections.abc import Iterable +from typing import Any, Callable, Optional import torch import torch.utils.benchmark as TBenchmark diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index d5a5e2ef83dd8..d64f0d0a5c2a0 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import enum -from typing import Dict, Union +from typing import Union from cutlass_library import * @@ -21,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum): TmaWarpSpecializedCooperative = enum_auto() -VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = { **DataTypeNames, # type: ignore **{ VLLMDataType.u4b8: "u4b8", @@ 
-29,7 +29,7 @@ class MixedInputKernelScheduleType(enum.Enum): } } -VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { **DataTypeTag, # type: ignore **{ VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", @@ -37,7 +37,7 @@ class MixedInputKernelScheduleType(enum.Enum): } } -VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = { +VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = { **DataTypeSize, # type: ignore **{ VLLMDataType.u4b8: 4, @@ -45,7 +45,7 @@ class MixedInputKernelScheduleType(enum.Enum): } } -VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { VLLMDataType.u4b8: "vllm::kU4B8", VLLMDataType.u8b128: "vllm::kU8B128", DataType.u4: "vllm::kU4", @@ -56,7 +56,7 @@ class MixedInputKernelScheduleType(enum.Enum): DataType.bf16: "vllm::kBfloat16", } -VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { DataType.u8: "at::ScalarType::Byte", DataType.s8: "at::ScalarType::Char", DataType.e4m3: "at::ScalarType::Float8_e4m3fn", @@ -66,7 +66,7 @@ class MixedInputKernelScheduleType(enum.Enum): DataType.f32: "at::ScalarType::Float", } -VLLMKernelScheduleTag: Dict[Union[ +VLLMKernelScheduleTag: dict[Union[ MixedInputKernelScheduleType, KernelScheduleType], str] = { **KernelScheduleTag, # type: ignore **{ diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 02e59fe28b9af..3114e14baa0c5 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -8,7 +8,7 @@ from copy import deepcopy from dataclasses import dataclass, fields from functools import reduce -from typing import Dict, List, Optional, Tuple, Union +from typing import Optional, Union import jinja2 # yapf conflicts with isort for this block @@ -247,8 +247,8 @@ @dataclass(frozen=True) class ScheduleConfig: - tile_shape_mn: Tuple[int, int] - cluster_shape_mnk: Tuple[int, int, int] + tile_shape_mn: tuple[int, int] + cluster_shape_mnk: tuple[int, int, int] kernel_schedule: MixedInputKernelScheduleType epilogue_schedule: EpilogueScheduleType tile_scheduler: TileSchedulerType @@ -277,8 +277,8 @@ class PrepackTypeConfig: @dataclass class ImplConfig: types: TypeConfig - schedules: List[ScheduleConfig] - heuristic: List[Tuple[Optional[str], ScheduleConfig]] + schedules: list[ScheduleConfig] + heuristic: list[tuple[Optional[str], ScheduleConfig]] def generate_sch_sig(schedule_config: ScheduleConfig) -> str: @@ -333,7 +333,7 @@ def is_power_of_two(n): return (n != 0) and (n & (n - 1) == 0) -def to_cute_constant(value: List[int]): +def to_cute_constant(value: list[int]): def _to_cute_constant(value: int): if is_power_of_two(value): @@ -347,7 +347,7 @@ def _to_cute_constant(value: int): return _to_cute_constant(value) -def unique_schedules(impl_configs: List[ImplConfig]): +def unique_schedules(impl_configs: list[ImplConfig]): return list( set(sch for impl_config in impl_configs for sch in impl_config.schedules)) @@ -391,7 +391,7 @@ def create_template(template_str): prepack_dispatch_template = create_template(PREPACK_TEMPLATE) -def create_sources(impl_configs: List[ImplConfig], num_impl_files=8): +def create_sources(impl_configs: list[ImplConfig], num_impl_files=8): sources = [] sources.append(( @@ -435,7 +435,7 @@ def prepacked_type_key(prepack_type: PrepackTypeConfig): num_impls = 
reduce(lambda x, y: x + len(y.schedules), impl_configs, 0) num_impls_per_file = math.ceil(num_impls / num_impl_files) - files_impls: List[List[ImplConfig]] = [[]] + files_impls: list[list[ImplConfig]] = [[]] curr_num_impls_assigned = 0 curr_impl_in_file = 0 @@ -515,7 +515,7 @@ def generate(): for cond, tile_config in default_tile_heuristic_config.items() ] - def get_unique_schedules(heuristic: Dict[str, ScheduleConfig]): + def get_unique_schedules(heuristic: dict[str, ScheduleConfig]): # Do not use schedules = list(set(...)) because we need to make sure # the output list is deterministic; otherwise the generated kernel file # will be non-deterministic and causes ccache miss. diff --git a/docs/source/conf.py b/docs/source/conf.py index 97bec81b1eee8..b72faef9af107 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,6 @@ import logging import os import sys -from typing import List import requests from sphinx.ext import autodoc @@ -58,7 +57,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns: List[str] = ["**/*.template.md", "**/*.inc.md"] +exclude_patterns: list[str] = ["**/*.template.md", "**/*.inc.md"] # Exclude the prompt "$" when copying code copybutton_prompt_text = r"\$ " diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md index e39bbacf1138d..c77d31dabc841 100644 --- a/docs/source/features/reasoning_outputs.md +++ b/docs/source/features/reasoning_outputs.md @@ -117,7 +117,7 @@ class ExampleParser(ReasoningParser): def extract_reasoning_content( self, model_output: str, request: ChatCompletionRequest - ) -> Tuple[Optional[str], Optional[str]]: + ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content from a complete model-generated string. @@ -132,7 +132,7 @@ class ExampleParser(ReasoningParser): The request object that was used to generate the model_output. Returns: - Tuple[Optional[str], Optional[str]] + tuple[Optional[str], Optional[str]] A tuple containing the reasoning content and the content. """ ``` diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index 1d5aa07ab177a..de3c5bf5e7ab9 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -193,7 +193,7 @@ class Step(BaseModel): class MathResponse(BaseModel): - steps: List[Step] + steps: list[Step] final_answer: str diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index c5f75953aaf24..c51ca18667ef6 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -74,7 +74,7 @@ class Example: path (Path): The path to the main directory or file. category (str): The category of the document. main_file (Path): The main file in the directory. - other_files (list[Path]): List of other files in the directory. + other_files (list[Path]): list of other files in the directory. title (str): The title of the document. 
Methods: diff --git a/examples/offline_inference/distributed.py b/examples/offline_inference/distributed.py index a2df41d4ce21b..e890c6dad8bd1 100644 --- a/examples/offline_inference/distributed.py +++ b/examples/offline_inference/distributed.py @@ -6,7 +6,7 @@ Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html """ -from typing import Any, Dict, List +from typing import Any import numpy as np import ray @@ -36,13 +36,13 @@ def __init__(self): self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", tensor_parallel_size=tensor_parallel_size) - def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]: + def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, list]: # Generate texts from the prompts. # The output is a list of RequestOutput objects that contain the prompt, # generated text, and other information. outputs = self.llm.generate(batch["text"], sampling_params) - prompt: List[str] = [] - generated_text: List[str] = [] + prompt: list[str] = [] + generated_text: list[str] = [] for output in outputs: prompt.append(output.prompt) generated_text.append(' '.join([o.text for o in output.outputs])) @@ -72,7 +72,7 @@ def scheduling_strategy_fn(): pg, placement_group_capture_child_tasks=True)) -resources_kwarg: Dict[str, Any] = {} +resources_kwarg: dict[str, Any] = {} if tensor_parallel_size == 1: # For tensor_parallel_size == 1, we simply set num_gpus=1. resources_kwarg["num_gpus"] = 1 diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index 501034c1cc5da..f7741a3722438 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -1,13 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import argparse -from typing import List, Tuple from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams from vllm.utils import FlexibleArgumentParser -def create_test_prompts() -> List[Tuple[str, SamplingParams]]: +def create_test_prompts() -> list[tuple[str, SamplingParams]]: """Create a list of test prompts with their sampling parameters.""" return [ ("A robot may not injure a human being", @@ -24,7 +23,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]: def process_requests(engine: LLMEngine, - test_prompts: List[Tuple[str, SamplingParams]]): + test_prompts: list[tuple[str, SamplingParams]]): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 @@ -34,7 +33,7 @@ def process_requests(engine: LLMEngine, engine.add_request(str(request_id), prompt, sampling_params) request_id += 1 - request_outputs: List[RequestOutput] = engine.step() + request_outputs: list[RequestOutput] = engine.step() for request_output in request_outputs: if request_output.finished: diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py index de0734c1aa83b..a409735013f6d 100644 --- a/examples/offline_inference/lora_with_quantization_inference.py +++ b/examples/offline_inference/lora_with_quantization_inference.py @@ -7,7 +7,7 @@ """ import gc -from typing import List, Optional, Tuple +from typing import Optional import torch from huggingface_hub import snapshot_download @@ -18,7 +18,7 @@ def create_test_prompts( lora_path: str -) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: +) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]: return [ # this is an example of using quantization without LoRA ("My name is", @@ -49,7 +49,7 
@@ def create_test_prompts( def process_requests(engine: LLMEngine, - test_prompts: List[Tuple[str, SamplingParams, + test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]]): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 @@ -63,7 +63,7 @@ def process_requests(engine: LLMEngine, lora_request=lora_request) request_id += 1 - request_outputs: List[RequestOutput] = engine.step() + request_outputs: list[RequestOutput] = engine.step() for request_output in request_outputs: if request_output.finished: print("----------------------------------------------------") diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py index f227e71ba79be..61641245de839 100644 --- a/examples/offline_inference/mlpspeculator.py +++ b/examples/offline_inference/mlpspeculator.py @@ -2,12 +2,11 @@ import gc import time -from typing import List from vllm import LLM, SamplingParams -def time_generation(llm: LLM, prompts: List[str], +def time_generation(llm: LLM, prompts: list[str], sampling_params: SamplingParams): # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index 630fd1bf83420..4b0d115e6609c 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -6,7 +6,7 @@ Requires HuggingFace credentials for access to Llama2. """ -from typing import List, Optional, Tuple +from typing import Optional from huggingface_hub import snapshot_download @@ -16,7 +16,7 @@ def create_test_prompts( lora_path: str -) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: +) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]: """Create a list of test prompts with their sampling parameters. 2 requests for base model, 4 requests for the LoRA. We define 2 @@ -56,7 +56,7 @@ def create_test_prompts( def process_requests(engine: LLMEngine, - test_prompts: List[Tuple[str, SamplingParams, + test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]]): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 @@ -70,7 +70,7 @@ def process_requests(engine: LLMEngine, lora_request=lora_request) request_id += 1 - request_outputs: List[RequestOutput] = engine.step() + request_outputs: list[RequestOutput] = engine.step() for request_output in request_outputs: if request_output.finished: diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 298f08019004d..3ae507cac5ce1 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -21,7 +21,7 @@ import datetime import os import re -from typing import List, Union +from typing import Union import albumentations import numpy as np @@ -260,9 +260,9 @@ def _convert_np_uint8(float_image: torch.Tensor): def load_example( - file_paths: List[str], - mean: List[float] = None, - std: List[float] = None, + file_paths: list[str], + mean: list[float] = None, + std: list[float] = None, indices: Union[list[int], None] = None, ): """Build an input example by loading images in *file_paths*. 
diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py index c2e072fdd8889..ffa76b4e4f2ce 100644 --- a/examples/offline_inference/profiling.py +++ b/examples/offline_inference/profiling.py @@ -5,8 +5,9 @@ import os import sys from argparse import RawTextHelpFormatter +from collections.abc import Generator from dataclasses import asdict, dataclass -from typing import Any, Dict, Generator, List, Optional, TypeAlias +from typing import Any, Optional, TypeAlias import torch import tqdm @@ -42,8 +43,8 @@ def get_dtype(dtype: str): return dtype -OutputLen_NumReqs_Map: TypeAlias = Dict[int, int] -def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \ +OutputLen_NumReqs_Map: TypeAlias = dict[int, int] +def compute_request_output_lengths(batch_size: int, step_requests: list[int]) \ -> OutputLen_NumReqs_Map: """ Given the number of requests, batch_size, and the number of requests @@ -63,7 +64,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \ Args: batch_size (int): Number of requests submitted for profile. This is args.batch_size. - step_requests (List[int]): step_requests[i] is the number of requests + step_requests (list[int]): step_requests[i] is the number of requests that the ith engine step should process. Returns: @@ -114,7 +115,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \ return ol_nr -def determine_requests_per_step(context: ProfileContext) -> List[int]: +def determine_requests_per_step(context: ProfileContext) -> list[int]: """ Determine number of requests each engine step should process. If context.num_steps is set, then all engine steps process the @@ -130,7 +131,7 @@ def determine_requests_per_step(context: ProfileContext) -> List[int]: context: ProfileContext object. Returns: - List[int]: Number of requests to process for all engine-steps. + list[int]: Number of requests to process for all engine-steps. output[i], contains the number of requests that the ith step should process. """ @@ -170,7 +171,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], for key, value in asdict(context).items(): print(f" {key} = {value}") - requests_per_step: List[int] = determine_requests_per_step(context) + requests_per_step: list[int] = determine_requests_per_step(context) ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths( context.batch_size, requests_per_step) diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py index d54117d6262af..61da4705e18e8 100644 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -4,7 +4,6 @@ import dataclasses import os import time -from typing import List import numpy as np import torch_xla.debug.profiler as xp @@ -35,7 +34,7 @@ def main(args: argparse.Namespace): dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_prompts: List[PromptType] = [{ + dummy_prompts: list[PromptType] = [{ "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 872c9481a2297..b1aec33cff469 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -5,7 +5,7 @@ using the chat template defined by the model. 
""" from argparse import Namespace -from typing import List, NamedTuple, Optional +from typing import NamedTuple, Optional from PIL.Image import Image from transformers import AutoProcessor, AutoTokenizer @@ -24,8 +24,8 @@ class ModelRequestData(NamedTuple): llm: LLM prompt: str - stop_token_ids: Optional[List[int]] - image_data: List[Image] + stop_token_ids: Optional[list[int]] + image_data: list[Image] chat_template: Optional[str] @@ -34,7 +34,7 @@ class ModelRequestData(NamedTuple): # Unless specified, these settings have been tested to work on a single L4. -def load_aria(question, image_urls: List[str]) -> ModelRequestData: +def load_aria(question, image_urls: list[str]) -> ModelRequestData: model_name = "rhymes-ai/Aria" llm = LLM(model=model_name, tokenizer_mode="slow", @@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData: ) -def load_deepseek_vl2(question: str, image_urls: List[str]): +def load_deepseek_vl2(question: str, image_urls: list[str]): model_name = "deepseek-ai/deepseek-vl2-tiny" llm = LLM(model=model_name, @@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]): ) -def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData: +def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "h2oai/h2ovl-mississippi-800m" llm = LLM( @@ -111,7 +111,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData: ) -def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: +def load_idefics3(question, image_urls: list[str]) -> ModelRequestData: model_name = "HuggingFaceM4/Idefics3-8B-Llama3" # The configuration below has been confirmed to launch on a single L40 GPU. @@ -142,7 +142,7 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: ) -def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: +def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" llm = LLM( @@ -179,7 +179,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: ) -def load_mllama(question, image_urls: List[str]) -> ModelRequestData: +def load_mllama(question, image_urls: list[str]) -> ModelRequestData: model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" # The configuration below has been confirmed to launch on a single L40 GPU. @@ -201,7 +201,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: ) -def load_nvlm_d(question: str, image_urls: List[str]): +def load_nvlm_d(question: str, image_urls: list[str]): model_name = "nvidia/NVLM-D-72B" # Adjust this as necessary to fit in GPU @@ -234,7 +234,7 @@ def load_nvlm_d(question: str, image_urls: List[str]): ) -def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData: +def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "mistral-community/pixtral-12b" # Adjust this as necessary to fit in GPU @@ -259,7 +259,7 @@ def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData: ) -def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: +def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData: # num_crops is an override kwarg to the multimodal image processor; # For some models, e.g., Phi-3.5-vision-instruct, it is recommended # to use 16 for single frame scenarios, and 4 for multi-frame. 
@@ -295,7 +295,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: def load_qwen_vl_chat(question: str, - image_urls: List[str]) -> ModelRequestData: + image_urls: list[str]) -> ModelRequestData: model_name = "Qwen/Qwen-VL-Chat" llm = LLM( model=model_name, @@ -336,7 +336,7 @@ def load_qwen_vl_chat(question: str, ) -def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: +def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData: try: from qwen_vl_utils import process_vision_info except ModuleNotFoundError: @@ -393,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ) -def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: +def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData: try: from qwen_vl_utils import process_vision_info except ModuleNotFoundError: @@ -466,7 +466,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: } -def run_generate(model, question: str, image_urls: List[str]): +def run_generate(model, question: str, image_urls: list[str]): req_data = model_example_map[model](question, image_urls) sampling_params = SamplingParams(temperature=0.0, @@ -487,7 +487,7 @@ def run_generate(model, question: str, image_urls: List[str]): print(generated_text) -def run_chat(model: str, question: str, image_urls: List[str]): +def run_chat(model: str, question: str, image_urls: list[str]): req_data = model_example_map[model](question, image_urls) sampling_params = SamplingParams(temperature=0.0, diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py index 623e0d59a30e3..22bb1a87bfdf6 100644 --- a/examples/online_serving/api_client.py +++ b/examples/online_serving/api_client.py @@ -7,7 +7,7 @@ import argparse import json -from typing import Iterable, List +from collections.abc import Iterable import requests @@ -39,7 +39,7 @@ def post_http_request(prompt: str, return response -def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: +def get_streaming_response(response: requests.Response) -> Iterable[list[str]]: for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): @@ -49,7 +49,7 @@ def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: yield output -def get_response(response: requests.Response) -> List[str]: +def get_response(response: requests.Response) -> list[str]: data = json.loads(response.content) output = data["text"] return output diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py index cb110997464ac..b7c5651e3bab2 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/openai_embedding_client.py @@ -24,4 +24,4 @@ ) for data in responses.data: - print(data.embedding) # list of float of len 4096 + print(data.embedding) # List of float of len 4096 diff --git a/pyproject.toml b/pyproject.toml index 1c03e9e17be55..2f9c3a0f12e2c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,8 +91,6 @@ ignore = [ "B007", # f-string format "UP032", - # Python 3.8 typing - "UP006", "UP035", # Can remove once 3.10+ is the minimum Python version "UP007", ] diff --git a/setup.py b/setup.py index 6fe433517a053..cd17709b57ef3 100755 --- a/setup.py +++ b/setup.py @@ -9,7 +9,6 @@ import sys from pathlib import Path from shutil import which -from typing import Dict, List import torch from packaging.version import Version, parse @@ -78,7 
+77,7 @@ def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None: class cmake_build_ext(build_ext): # A dict of extension directories that have been configured. - did_config: Dict[str, bool] = {} + did_config: dict[str, bool] = {} # # Determine number of compilation jobs and optionally nvcc compile threads. @@ -548,10 +547,10 @@ def get_vllm_version() -> str: return version -def get_requirements() -> List[str]: +def get_requirements() -> list[str]: """Get Python package dependencies from requirements.txt.""" - def _read_requirements(filename: str) -> List[str]: + def _read_requirements(filename: str) -> list[str]: with open(get_path(filename)) as f: requirements = f.read().strip().split("\n") resolved_requirements = [] diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index d9ac611644df8..1e3c2d1a473a3 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """vllm.entrypoints.api_server with some extra logging for testing.""" -from typing import Any, Dict, Iterable +from collections.abc import Iterable +from typing import Any import uvicorn from fastapi.responses import JSONResponse, Response @@ -24,7 +25,7 @@ async def _engine_abort(self, request_ids: Iterable[str]): self._num_aborts += len(ids) await super()._engine_abort(ids) - def testing_stats(self) -> Dict[str, Any]: + def testing_stats(self) -> dict[str, Any]: return {"num_aborted_requests": self._num_aborts} diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index ca29abc92850d..6307bd7d64627 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -6,7 +6,7 @@ from asyncio import CancelledError from copy import copy from dataclasses import dataclass -from typing import List, Optional +from typing import Optional import pytest import pytest_asyncio @@ -254,7 +254,7 @@ async def run_deltas(prompt: str): params.output_kind = RequestOutputKind.DELTA prompt_tokens = None - output_tokens: List[int] = [] + output_tokens: list[int] = [] output_text = "" output_count = 0 final_output = None diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 021bd4cc46356..7307f44b6184e 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -8,7 +8,7 @@ initialized randomly with a fixed seed. 
""" from dataclasses import dataclass -from typing import Any, List, Optional, Tuple +from typing import Any, Optional import torch from torch import nn @@ -56,7 +56,7 @@ class LlamaConfig: random_seed: int = 0 def compute_hash(self) -> str: - factors: List[Any] = [] + factors: list[Any] = [] for k, v in self.__dict__.items(): if k == "random_seed": continue @@ -174,7 +174,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: """ For tractable computation: - if residual is None, the outputs are: diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 587c0a60ceeb9..48323b21a8c42 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import Dict, List, Optional +from typing import Optional import pytest @@ -14,7 +14,7 @@ @dataclasses.dataclass class TestSetting: model: str - model_args: List[str] + model_args: list[str] pp_size: int tp_size: int attn_backend: str @@ -108,8 +108,8 @@ def test_compile_correctness(test_setting: TestSetting): final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \ ["-tp", str(tp_size)] - all_args: List[List[str]] = [] - all_envs: List[Optional[Dict[str, str]]] = [] + all_args: list[list[str]] = [] + all_envs: list[Optional[dict[str, str]]] = [] for level in [ CompilationLevel.NO_COMPILATION, diff --git a/tests/conftest.py b/tests/conftest.py index 871f0b62c5326..57a33ad08c94f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,8 +5,7 @@ import tempfile from collections import UserList from enum import Enum -from typing import (Any, Callable, Dict, List, Optional, Tuple, Type, - TypedDict, TypeVar, Union) +from typing import Any, Callable, Optional, TypedDict, TypeVar, Union import numpy as np import pytest @@ -47,14 +46,14 @@ _M = TypeVar("_M") -_PromptMultiModalInput = Union[List[_M], List[List[_M]]] +_PromptMultiModalInput = Union[list[_M], list[list[_M]]] PromptImageInput = _PromptMultiModalInput[Image.Image] -PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] +PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]] PromptVideoInput = _PromptMultiModalInput[np.ndarray] -def _read_prompts(filename: str) -> List[str]: +def _read_prompts(filename: str) -> list[str]: with open(filename) as f: prompts = f.readlines() return prompts @@ -77,7 +76,7 @@ def __init__(self) -> None: ImageAsset("cherry_blossom"), ]) - def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: + def prompts(self, prompts: _ImageAssetPrompts) -> list[str]: """ Convenience method to define the prompt for each test image. 
@@ -102,7 +101,7 @@ def __init__(self) -> None: VideoAsset("sample_demo_1.mp4"), ]) - def prompts(self, prompts: _VideoAssetPrompts) -> List[str]: + def prompts(self, prompts: _VideoAssetPrompts) -> list[str]: return [prompts["sample_demo_1"]] @@ -175,7 +174,7 @@ def dynamo_reset(): @pytest.fixture -def example_prompts() -> List[str]: +def example_prompts() -> list[str]: prompts = [] for filename in _TEST_PROMPTS: prompts += _read_prompts(filename) @@ -197,7 +196,7 @@ class DecoderPromptType(Enum): @pytest.fixture def example_encoder_decoder_prompts( -) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]: +) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]: ''' Returns an encoder prompt list and a decoder prompt list, wherein each pair of same-index entries in both lists corresponds to an (encoder prompt, @@ -229,7 +228,7 @@ def example_encoder_decoder_prompts( @pytest.fixture -def example_long_prompts() -> List[str]: +def example_long_prompts() -> list[str]: prompts = [] for filename in _LONG_PROMPTS: prompts += _read_prompts(filename) @@ -273,11 +272,11 @@ def __init__( model_name: str, dtype: str = "half", *, - model_kwargs: Optional[Dict[str, Any]] = None, + model_kwargs: Optional[dict[str, Any]] = None, is_sentence_transformer: bool = False, is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, - auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM, + auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM, postprocess_inputs: Callable[..., BatchEncoding] = identity, ) -> None: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] @@ -334,11 +333,11 @@ def __init__( def get_inputs( self, - prompts: List[str], + prompts: list[str], images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, - ) -> List[BatchEncoding]: + ) -> list[BatchEncoding]: if images is not None: assert len(prompts) == len(images) @@ -348,9 +347,9 @@ def get_inputs( if audios is not None: assert len(prompts) == len(audios) - all_inputs: List[BatchEncoding] = [] + all_inputs: list[BatchEncoding] = [] for i, prompt in enumerate(prompts): - processor_kwargs: Dict[str, Any] = { + processor_kwargs: dict[str, Any] = { "text": prompt, "return_tensors": "pt", } @@ -370,7 +369,7 @@ def get_inputs( return all_inputs - def classify(self, prompts: List[str]) -> List[str]: + def classify(self, prompts: list[str]) -> list[str]: # output is final logits all_inputs = self.get_inputs(prompts) outputs = [] @@ -383,18 +382,18 @@ def classify(self, prompts: List[str]) -> List[str]: def generate( self, - prompts: List[str], + prompts: list[str], images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, - ) -> List[Tuple[List[List[int]], List[str]]]: + ) -> list[tuple[list[list[int]], list[str]]]: all_inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) - outputs: List[Tuple[List[List[int]], List[str]]] = [] + outputs: list[tuple[list[list[int]], list[str]]] = [] for inputs in all_inputs: output_ids = self.model.generate( **self.wrap_device(inputs, device=self.model.device.type), @@ -412,13 +411,13 @@ def generate( def generate_greedy( self, - prompts: List[str], + prompts: list[str], max_tokens: int, images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, - ) -> List[Tuple[List[int], str]]: + ) -> list[tuple[list[int], str]]: 
outputs = self.generate(prompts, do_sample=False, max_new_tokens=max_tokens, @@ -432,10 +431,10 @@ def generate_greedy( def generate_beam_search( self, - prompts: List[str], + prompts: list[str], beam_width: int, max_tokens: int, - ) -> List[Tuple[List[List[int]], List[str]]]: + ) -> list[tuple[list[list[int]], list[str]]]: outputs = self.generate(prompts, do_sample=False, max_new_tokens=max_tokens, @@ -453,19 +452,19 @@ def generate_beam_search( def generate_greedy_logprobs( self, - prompts: List[str], + prompts: list[str], max_tokens: int, images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, - ) -> List[List[torch.Tensor]]: + ) -> list[list[torch.Tensor]]: all_inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) - all_logprobs: List[List[torch.Tensor]] = [] + all_logprobs: list[list[torch.Tensor]] = [] for inputs in all_inputs: output = self.model.generate( **self.wrap_device(inputs, device=self.model.device.type), @@ -483,11 +482,11 @@ def generate_greedy_logprobs( def _hidden_states_to_seq_logprobs( self, - hidden_states: Tuple[Tuple[torch.Tensor, ...], ...], - ) -> List[torch.Tensor]: + hidden_states: tuple[tuple[torch.Tensor, ...], ...], + ) -> list[torch.Tensor]: output_embeddings = self.model.get_output_embeddings() - seq_logprobs: List[torch.Tensor] = [] + seq_logprobs: list[torch.Tensor] = [] for _, hidden_state in enumerate(hidden_states): last_hidden_states = hidden_state[-1][0] logits = torch.matmul( @@ -503,14 +502,14 @@ def _hidden_states_to_seq_logprobs( def _hidden_states_to_logprobs( self, - hidden_states: Tuple[Tuple[torch.Tensor, ...], ...], + hidden_states: tuple[tuple[torch.Tensor, ...], ...], num_logprobs: int, - ) -> Tuple[List[Dict[int, float]], int]: + ) -> tuple[list[dict[int, float]], int]: seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states) output_len = len(hidden_states) # convert to dict - seq_logprobs_lst: List[Dict[int, float]] = [] + seq_logprobs_lst: list[dict[int, float]] = [] for tok_idx, tok_logprobs in enumerate(seq_logprobs): # drop prompt logprobs if tok_idx == 0: @@ -530,22 +529,22 @@ def _hidden_states_to_logprobs( def generate_greedy_logprobs_limit( self, - prompts: List[str], + prompts: list[str], max_tokens: int, num_logprobs: int, images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, **kwargs: Any, - ) -> List[TokensTextLogprobs]: + ) -> list[TokensTextLogprobs]: all_inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) - all_logprobs: List[List[Dict[int, float]]] = [] - all_output_ids: List[List[int]] = [] - all_output_strs: List[str] = [] + all_logprobs: list[list[dict[int, float]]] = [] + all_output_ids: list[list[int]] = [] + all_output_strs: list[str] = [] for inputs in all_inputs: output = self.model.generate( @@ -577,23 +576,23 @@ def generate_greedy_logprobs_limit( def generate_encoder_decoder_greedy_logprobs_limit( self, - encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int, num_logprobs: int, images: Optional[PromptImageInput] = None, **kwargs: Any, - ) -> List[TokensTextLogprobs]: + ) -> list[TokensTextLogprobs]: ''' Greedy logprobs generation for vLLM encoder/decoder models ''' - all_logprobs: List[List[Dict[int, float]]] = [] - all_output_ids: List[List[int]] = [] - all_output_strs: List[str] = [] 
+ all_logprobs: list[list[dict[int, float]]] = [] + all_output_ids: list[list[int]] = [] + all_output_strs: list[str] = [] for i, (encoder_prompt, decoder_prompt) in enumerate( to_enc_dec_tuple_list(encoder_decoder_prompts)): - processor_kwargs: Dict[str, Any] = { + processor_kwargs: dict[str, Any] = { "text": encoder_prompt, "return_tensors": "pt", } @@ -641,10 +640,10 @@ def generate_encoder_decoder_greedy_logprobs_limit( return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] - def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]: + def encode(self, prompts: list[str]) -> list[list[torch.Tensor]]: return self.model.encode(prompts) - def predict(self, prompts: List[List[str]]) -> torch.Tensor: + def predict(self, prompts: list[list[str]]) -> torch.Tensor: return self.model.predict(prompts, convert_to_tensor=True) def __enter__(self): @@ -699,11 +698,11 @@ def __init__( def get_inputs( self, - prompts: List[str], + prompts: list[str], images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, - ) -> List[TextPrompt]: + ) -> list[TextPrompt]: if images is not None: assert len(prompts) == len(images) @@ -733,13 +732,13 @@ def get_inputs( def generate( self, - prompts: List[str], + prompts: list[str], sampling_params: SamplingParams, images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, - ) -> List[Tuple[List[List[int]], List[str]]]: + ) -> list[tuple[list[list[int]], list[str]]]: inputs = self.get_inputs(prompts, images=images, videos=videos, @@ -749,12 +748,12 @@ def generate( sampling_params=sampling_params, **kwargs) - outputs: List[Tuple[List[List[int]], List[str]]] = [] + outputs: list[tuple[list[list[int]], list[str]]] = [] for req_output in req_outputs: prompt_str = req_output.prompt prompt_ids = req_output.prompt_token_ids - req_sample_output_ids: List[List[int]] = [] - req_sample_output_strs: List[str] = [] + req_sample_output_ids: list[list[int]] = [] + req_sample_output_strs: list[str] = [] for sample in req_output.outputs: output_str = sample.text output_ids = list(sample.token_ids) @@ -765,9 +764,9 @@ def generate( @staticmethod def _final_steps_generate_w_logprobs( - req_outputs: List[RequestOutput], - ) -> List[TokensTextLogprobsPromptLogprobs]: - outputs: List[TokensTextLogprobsPromptLogprobs] = [] + req_outputs: list[RequestOutput], + ) -> list[TokensTextLogprobsPromptLogprobs]: + outputs: list[TokensTextLogprobsPromptLogprobs] = [] for req_output in req_outputs: assert len(req_output.outputs) > 0 for sample in req_output.outputs: @@ -780,14 +779,14 @@ def _final_steps_generate_w_logprobs( def generate_w_logprobs( self, - prompts: List[str], + prompts: list[str], sampling_params: SamplingParams, images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, **kwargs: Any, - ) -> Union[List[TokensTextLogprobs], - List[TokensTextLogprobsPromptLogprobs]]: + ) -> Union[list[TokensTextLogprobs], + list[TokensTextLogprobsPromptLogprobs]]: inputs = self.get_inputs(prompts, images=images, videos=videos, @@ -806,10 +805,10 @@ def generate_w_logprobs( def generate_encoder_decoder_w_logprobs( self, - encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], sampling_params: SamplingParams, - ) -> 
Union[List[TokensTextLogprobs], - List[TokensTextLogprobsPromptLogprobs]]: + ) -> Union[list[TokensTextLogprobs], + list[TokensTextLogprobsPromptLogprobs]]: ''' Logprobs generation for vLLM encoder/decoder models ''' @@ -826,13 +825,13 @@ def generate_encoder_decoder_w_logprobs( def generate_greedy( self, - prompts: List[str], + prompts: list[str], max_tokens: int, images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, - ) -> List[Tuple[List[int], str]]: + ) -> list[tuple[list[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) outputs = self.generate(prompts, greedy_params, @@ -845,18 +844,18 @@ def generate_greedy( def generate_greedy_logprobs( self, - prompts: List[str], + prompts: list[str], max_tokens: int, num_logprobs: int, num_prompt_logprobs: Optional[int] = None, images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, - stop_token_ids: Optional[List[int]] = None, - stop: Optional[List[str]] = None, + stop_token_ids: Optional[list[int]] = None, + stop: Optional[list[str]] = None, **kwargs: Any, - ) -> Union[List[TokensTextLogprobs], - List[TokensTextLogprobsPromptLogprobs]]: + ) -> Union[list[TokensTextLogprobs], + list[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( temperature=0.0, max_tokens=max_tokens, @@ -874,12 +873,12 @@ def generate_greedy_logprobs( def generate_encoder_decoder_greedy_logprobs( self, - encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int, num_logprobs: int, num_prompt_logprobs: Optional[int] = None, - ) -> Union[List[TokensTextLogprobs], - List[TokensTextLogprobsPromptLogprobs]]: + ) -> Union[list[TokensTextLogprobs], + list[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( temperature=0.0, max_tokens=max_tokens, @@ -895,10 +894,10 @@ def generate_encoder_decoder_greedy_logprobs( def generate_beam_search( self, - prompts: Union[List[str], List[List[int]]], + prompts: Union[list[str], list[list[int]]], beam_width: int, max_tokens: int, - ) -> List[Tuple[List[List[int]], List[str]]]: + ) -> list[tuple[list[list[int]], list[str]]]: if is_list_of(prompts, str, check="all"): prompts = [TextPrompt(prompt=prompt) for prompt in prompts] else: @@ -915,17 +914,17 @@ def generate_beam_search( returned_outputs.append((token_ids, texts)) return returned_outputs - def classify(self, prompts: List[str]) -> List[List[float]]: + def classify(self, prompts: list[str]) -> list[list[float]]: req_outputs = self.model.classify(prompts) return [req_output.outputs.probs for req_output in req_outputs] def encode( self, - prompts: List[str], + prompts: list[str], images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, - ) -> List[List[float]]: + ) -> list[list[float]]: inputs = self.get_inputs(prompts, images=images, videos=videos, @@ -936,9 +935,9 @@ def encode( def score( self, - text_1: Union[str, List[str]], - text_2: Union[str, List[str]], - ) -> List[float]: + text_1: Union[str, list[str]], + text_2: Union[str, list[str]], + ) -> list[float]: req_outputs = self.model.score(text_1, text_2) return [req_output.outputs.score for req_output in req_outputs] diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py index 
7d3ccaadaca19..83259b690337a 100644 --- a/tests/core/block/e2e/conftest.py +++ b/tests/core/block/e2e/conftest.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Iterable, Optional +from collections.abc import Iterable +from typing import Callable, Optional import pytest diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index c874608e40a23..4860103633323 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import List import pytest @@ -127,9 +126,9 @@ def prep_prompts(batch_size: int): The prompt is just under 10k tokens; sliding window is 4k so the answer is outside sliding window, but should still be correct. """ - prompts: List[str] = [] - answer: List[int] = [] - indices: List[int] = [] + prompts: list[str] = [] + answer: list[int] = [] + indices: list[int] = [] random.seed(1) for _ in range(batch_size): idx = random.randint(30, 90) @@ -148,7 +147,7 @@ def prep_prompts(batch_size: int): return prompts, answer, indices -def check_answers(indices: List[int], answer: List[int], outputs: List[str]): +def check_answers(indices: list[int], answer: list[int], outputs: list[str]): answer2 = [int(text[0:2].strip()) for text in outputs] print(list(zip(indices, zip(answer, answer2)))) numok = 0 @@ -160,7 +159,7 @@ def check_answers(indices: List[int], answer: List[int], outputs: List[str]): assert frac_ok > 0.7 -def check_window(prompts: List[str]): +def check_window(prompts: list[str]): def inner(llm: LLM): sliding_window = llm.llm_engine.model_config.get_sliding_window() diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index d8cf0bec709ac..250c9a7497d23 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from vllm.core.block.block_table import BlockTable @@ -32,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int): token_ids = list(range(sequence_len)) num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size))) - block_tables: List[BlockTable] = [] + block_tables: list[BlockTable] = [] for i in range(5): assert allocator.get_num_free_blocks( device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc @@ -77,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int): num_immutable_blocks_per_alloc = len( chunked_tokens) - num_mutable_blocks_per_alloc - block_tables: List[BlockTable] = [] + block_tables: list[BlockTable] = [] for alloc_i in range(1, 6): block_tables.append( @@ -272,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int, ) block_table.allocate(token_ids=token_ids, device=Device.GPU) - appended_so_far: List[int] = [] + appended_so_far: list[int] = [] for append in chunk_list(token_ids_to_append, append_size): block_table.append_token_ids(append) appended_so_far.extend(append) diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py index 0ca2a0b8054d8..4b9454c84ff65 100644 --- a/tests/core/block/test_naive_block.py +++ b/tests/core/block/test_naive_block.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from typing import Optional import pytest @@ -14,7 +14,7 @@ class 
TestNaiveBlockAllocator: def create_allocate_lambda(allocate_type: str, allocator: NaiveBlockAllocator, prev_block: Optional[Block], - token_ids: List[int]): + token_ids: list[int]): if allocate_type == "immutable": allocate_block = lambda: allocator.allocate_immutable_block( prev_block=prev_block, token_ids=token_ids) diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index bf40b334abc56..50233624f7d17 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -2,7 +2,7 @@ import math import random -from typing import List, Optional +from typing import Optional from unittest.mock import MagicMock import pytest @@ -123,11 +123,11 @@ def test_blocks_have_correct_hash_in_chain(block_size: int, @staticmethod def create_chain(block_size: int, - token_ids: List[int], - num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]: + token_ids: list[int], + num_empty_trailing_blocks=0) -> list[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ - blocks: List[PrefixCachingBlock] = [] + blocks: list[PrefixCachingBlock] = [] num_blocks = math.ceil( len(token_ids) / block_size) + num_empty_trailing_blocks @@ -161,7 +161,7 @@ class TestPrefixCachingBlockAllocator: @staticmethod def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator, prev_block: Optional[Block], - token_ids: List[int]): + token_ids: list[int]): if allocate_type == "immutable": allocate_block = lambda: allocator.allocate_immutable_block( prev_block=prev_block, token_ids=token_ids) @@ -839,13 +839,13 @@ def test_reset_prefix_cache(num_blocks: int, block_size: int): @staticmethod def create_immutable_chain( block_size: int, - token_ids: List[int], + token_ids: list[int], allocator: PrefixCachingBlockAllocator, extra_hash: Optional[int] = None, - ) -> List[PrefixCachingBlock]: + ) -> list[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ - blocks: List[Block] = [] + blocks: list[Block] = [] num_blocks = math.ceil(len(token_ids) / block_size) if num_blocks == 0: diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 8e0b9e63b40c9..161b32f01b111 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List from unittest.mock import MagicMock import pytest # noqa @@ -46,7 +45,7 @@ def test_simple(): cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(num_seq_group): @@ -93,7 +92,7 @@ def test_chunk(): cache_config.num_cpu_blocks = 32 cache_config.num_gpu_blocks = 32 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(2): @@ -145,7 +144,7 @@ def test_concurrent_chunking(): cache_config.num_cpu_blocks = 32 cache_config.num_gpu_blocks = 32 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. 
for i in range(2): @@ -226,8 +225,8 @@ def test_short_prompts_jump_long_prompts_in_queue(): cache_config.num_cpu_blocks = 3200 # large KV cache size for large requests cache_config.num_gpu_blocks = 3200 scheduler = Scheduler(scheduler_config, cache_config, None) - long_seqs: List[SequenceGroup] = [] - short_seqs: List[SequenceGroup] = [] + long_seqs: list[SequenceGroup] = [] + short_seqs: list[SequenceGroup] = [] # Add 2 large seq groups to scheduler. for i in range(2): @@ -368,7 +367,7 @@ def test_complex(): cache_config.num_cpu_blocks = 64 cache_config.num_gpu_blocks = 64 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(2): @@ -439,7 +438,7 @@ def test_maximal_decoding(): cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(2): @@ -533,7 +532,7 @@ def test_prompt_limit(): cache_config.num_cpu_blocks = 16 cache_config.num_gpu_blocks = 16 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] _, seq_group = create_dummy_prompt("1", prompt_length=48, @@ -565,7 +564,7 @@ def test_prompt_limit_exceed(): cache_config.num_cpu_blocks = 16 cache_config.num_gpu_blocks = 16 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] _, seq_group = create_dummy_prompt("2", prompt_length=48, block_size=block_size) @@ -699,7 +698,7 @@ def test_chunked_prefill_max_seqs(): cache_config.num_cpu_blocks = 128 cache_config.num_gpu_blocks = 128 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] _, seq_group = create_dummy_prompt("1", prompt_length=65, @@ -758,7 +757,7 @@ def test_prefix_caching(): cache_config.num_cpu_blocks = 0 cache_config.num_gpu_blocks = 32 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(2): @@ -800,7 +799,7 @@ def test_prefix_caching_with_concurrent_partial_prefills(): cache_config.num_cpu_blocks = 0 cache_config.num_gpu_blocks = 32 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(2): diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 66bc5257f081d..9e461d4e0b401 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -2,7 +2,6 @@ import time from collections import deque -from typing import List, Set, Tuple from unittest.mock import MagicMock import pytest # noqa @@ -57,7 +56,7 @@ def test_scheduler_abort_seq_group(): # Add multiple seq groups to scheduler. num_seq_group = 4 - request_ids: Set[str] = set() + request_ids: set[str] = set() for i in range(num_seq_group): _, seq_group = create_dummy_prompt(str(i), block_size) scheduler.add_seq_group(seq_group) @@ -83,7 +82,7 @@ def test_scheduler_schedule_simple(): cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. 
for i in range(num_seq_group): @@ -221,7 +220,7 @@ def test_scheduler_max_seqs(): cache_config.num_gpu_blocks = 8 scheduler = Scheduler(scheduler_config, cache_config, None) - all_seq_groups: List[SequenceGroup] = [] + all_seq_groups: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(num_seq_group): _, seq_group = create_dummy_prompt(str(i), @@ -480,7 +479,7 @@ def test_prefill_schedule_max_lora(): num_cpu_blocks=64, num_gpu_blocks=64) budget = create_token_budget(token_budget=120) - curr_loras: Set[int] = set() + curr_loras: set[int] = set() for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -651,8 +650,8 @@ def test_schedule_swapped_max_loras(): block_size=block_size, num_cpu_blocks=32, num_gpu_blocks=32) - curr_loras: Set[int] = set() - blocks_to_swap_out: List[Tuple[int, int]] = [] + curr_loras: set[int] = set() + blocks_to_swap_out: list[tuple[int, int]] = [] for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -683,7 +682,7 @@ def test_schedule_swapped_cannot_swap_in(): num_cpu_blocks=32, num_gpu_blocks=32) curr_loras = None - blocks_to_swap_out: List[Tuple[int, int]] = [] + blocks_to_swap_out: list[tuple[int, int]] = [] for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -714,7 +713,7 @@ def test_infeasible_swap(): num_cpu_blocks=32, num_gpu_blocks=32) curr_loras = None - blocks_to_swap_out: List[Tuple[int, int]] = [] + blocks_to_swap_out: list[tuple[int, int]] = [] for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -752,7 +751,7 @@ def test_schedule_swapped_blocks_to_copy(): block_size=block_size) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) - blocks_to_swap_out: List[Tuple[int, int]] = [] + blocks_to_swap_out: list[tuple[int, int]] = [] scheduler._swap_out(seq_group, blocks_to_swap_out) scheduler._add_seq_group_to_swapped(seq_group) diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index a4e3c73a5a7bb..c6049b26a2bcd 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest # noqa from vllm.config import CacheConfig, SchedulerConfig @@ -48,7 +46,7 @@ def test_scheduler_schedule_simple_encoder_decoder(): cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. 
req_id_list = [] diff --git a/tests/core/utils.py b/tests/core/utils.py index fb77dccce1c9d..ba4265e3c20af 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -2,9 +2,8 @@ import time from collections import defaultdict -from typing import Any, Dict, List, Optional -from typing import Sequence as GenericSequence -from typing import Tuple +from collections.abc import Sequence as GenericSequence +from typing import Any, Optional from vllm import SamplingParams from vllm.core.scheduler import Scheduler, SchedulerOutputs @@ -20,10 +19,10 @@ def create_dummy_prompt( block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, best_of: int = 1, - prompt_tokens: Optional[List[int]] = None, + prompt_tokens: Optional[list[int]] = None, min_tokens: int = 0, max_tokens: int = 16, -) -> Tuple[Sequence, SequenceGroup]: +) -> tuple[Sequence, SequenceGroup]: if not block_size: block_size = prompt_length @@ -48,7 +47,7 @@ def create_dummy_prompt( return prompt, seq_group -def create_dummy_lora_sequence(request_id: int, token_ids: List[int], +def create_dummy_lora_sequence(request_id: int, token_ids: list[int], block_size: int, lora_int_id: int) -> Sequence: return Sequence(seq_id=request_id, inputs=token_inputs(token_ids), @@ -58,7 +57,7 @@ def create_dummy_lora_sequence(request_id: int, token_ids: List[int], lora_int_id=lora_int_id)) -def create_dummy_sequence(request_id: int, token_ids: List[int], +def create_dummy_sequence(request_id: int, token_ids: list[int], block_size: int) -> Sequence: return Sequence( seq_id=request_id, @@ -74,7 +73,7 @@ def create_dummy_prompt_encoder_decoder( block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, best_of: int = 1, -) -> Tuple[Sequence, Sequence, SequenceGroup]: +) -> tuple[Sequence, Sequence, SequenceGroup]: if not block_size: block_size = decoder_prompt_length @@ -125,7 +124,7 @@ def create_seq_group( prompt_token_ids = [0] * seq_prompt_len - seqs: List[Sequence] = [] + seqs: list[Sequence] = [] for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, @@ -241,7 +240,7 @@ class SchedulerProxy: def __init__(self, scheduler: Scheduler): self.scheduler_ = scheduler - self.call_history: Dict[str, List[Any]] = defaultdict(list) + self.call_history: dict[str, list[Any]] = defaultdict(list) def __getattr__(self, name: str) -> Any: @@ -253,6 +252,6 @@ def wrapper(*args, **kwargs): return wrapper def last_schedule_ret( - self, ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, Any]: + self, ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]: _, _, ret = self.call_history["schedule"][-1] return ret diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py index bc5770642b79a..2e575f95d5f18 100644 --- a/tests/distributed/test_expert_parallel.py +++ b/tests/distributed/test_expert_parallel.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import List, Literal, NamedTuple, Optional +from typing import Literal, NamedTuple, Optional import pytest @@ -28,8 +28,8 @@ class EPTestOptions(NamedTuple): @dataclass class EPTestSettings: - parallel_setups: List[ParallelSetup] - distributed_backends: List[str] + parallel_setups: list[ParallelSetup] + distributed_backends: list[str] task: TaskOption test_options: EPTestOptions diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 390ed91c26051..5562b36816c44 100644 
--- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -9,7 +9,7 @@ import json import os from dataclasses import dataclass -from typing import List, Literal, NamedTuple, Optional +from typing import Literal, NamedTuple, Optional import pytest @@ -38,14 +38,14 @@ class PPTestOptions(NamedTuple): @dataclass class PPTestSettings: - parallel_setups: List[ParallelSetup] + parallel_setups: list[ParallelSetup] # NOTE: the length of distributed_backends and # vllm_major_versions should be the same, and they # are first zipped together to iterate over all # test settings. - distributed_backends: List[str] + distributed_backends: list[str] # vllm major version: "0" for V0, "1" for V1 - vllm_major_versions: List[str] + vllm_major_versions: list[str] task: TaskOption test_options: PPTestOptions diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 4c42a0ed81125..2c323edfa2af2 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -2,7 +2,6 @@ import multiprocessing import os -from typing import Dict, List import pytest import torch @@ -20,9 +19,9 @@ def distributed_run(fn, world_size): number_of_processes = world_size - processes: List[multiprocessing.Process] = [] + processes: list[multiprocessing.Process] = [] for i in range(number_of_processes): - env: Dict[str, str] = {} + env: dict[str, str] = {} env['RANK'] = str(i) env['LOCAL_RANK'] = str(i) env['WORLD_SIZE'] = str(number_of_processes) diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py index 59fa7cc9f319b..711c2441f34bc 100644 --- a/tests/distributed/test_shm_broadcast.py +++ b/tests/distributed/test_shm_broadcast.py @@ -3,7 +3,6 @@ import multiprocessing import random import time -from typing import List import numpy as np import torch.distributed as dist @@ -13,7 +12,7 @@ from vllm.utils import get_ip, get_open_port, update_environment_variables -def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]: +def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]: np.random.seed(seed) sizes = np.random.randint(1, 10_000, n) # on average, each array will have 5k elements diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index d0e4f86250bbc..cb772fc760812 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -3,7 +3,7 @@ Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. 
""" -from typing import List, Optional, Tuple +from typing import Optional import pytest from transformers import AutoModelForSeq2SeqLM @@ -22,7 +22,7 @@ def vllm_to_hf_output( - vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], + vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], decoder_prompt_type: DecoderPromptType, ): """Sanitize vllm output to be comparable with hf output.""" diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py index c0a339e46ec47..91c9ba4a74e62 100644 --- a/tests/engine/test_executor.py +++ b/tests/engine/test_executor.py @@ -2,7 +2,7 @@ import asyncio import os -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union import pytest @@ -22,8 +22,8 @@ class CustomUniExecutor(UniProcExecutor): def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + args: tuple = (), + kwargs: Optional[dict] = None) -> list[Any]: # Drop marker to show that this was ran with open(".marker", "w"): ... diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index f1fe58e35a32e..9b2f45def6c54 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -4,7 +4,7 @@ from concurrent.futures import ThreadPoolExecutor from functools import partial from time import sleep -from typing import Any, List, Tuple +from typing import Any import pytest @@ -17,7 +17,7 @@ class DummyWorkerWrapper(WorkerWrapperBase): """Dummy version of vllm.worker.worker.Worker""" - def worker_method(self, worker_input: Any) -> Tuple[int, Any]: + def worker_method(self, worker_input: Any) -> tuple[int, Any]: sleep(0.05) if isinstance(worker_input, Exception): @@ -27,7 +27,7 @@ def worker_method(self, worker_input: Any) -> Tuple[int, Any]: return self.rpc_rank, input -def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]: +def _start_workers() -> tuple[list[ProcessWorkerWrapper], WorkerMonitor]: result_handler = ResultHandler() vllm_config = VllmConfig() workers = [ diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 0f633bb26da98..62d167aa14b45 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, List, Optional +from typing import Any, Optional import pytest @@ -21,8 +21,8 @@ def vllm_model(vllm_runner): def _test_stopping(llm_engine: LLMEngine, expected_output: str, expected_reason: Any, - stop: Optional[List[str]] = None, - stop_token_ids: Optional[List[int]] = None, + stop: Optional[list[str]] = None, + stop_token_ids: Optional[list[int]] = None, include_in_output: bool = False, use_async_output_proc: bool = False) -> None: llm_engine.add_request( diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 77c80b2f89448..710bad4ecf460 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from vllm import LLM @@ -63,7 +61,7 @@ def test_multi_chat(): @pytest.mark.parametrize("image_urls", [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) -def test_chat_multi_image(image_urls: List[str]): +def test_chat_multi_image(image_urls: list[str]): llm = LLM( model="microsoft/Phi-3.5-vision-instruct", dtype="bfloat16", diff --git 
a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index ebec8baba38df..61deb1079861e 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import weakref -from typing import List import pytest @@ -45,8 +44,8 @@ def llm(): cleanup_dist_env_and_memory() -def assert_outputs_equal(o1: List[PoolingRequestOutput], - o2: List[PoolingRequestOutput]): +def assert_outputs_equal(o1: list[PoolingRequestOutput], + o2: list[PoolingRequestOutput]): assert [o.outputs for o in o1] == [o.outputs for o in o2] diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 910e1a4507cc5..9a895c922cc39 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import weakref -from typing import List import pytest @@ -43,7 +42,7 @@ def llm(): cleanup_dist_env_and_memory() -def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]): +def assert_outputs_equal(o1: list[RequestOutput], o2: list[RequestOutput]): assert [o.outputs for o in o1] == [o.outputs for o in o2] diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py index 19d4735b9dde7..eca5d184f5d60 100644 --- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -10,7 +10,6 @@ import io import time from statistics import mean, median -from typing import List import librosa import pytest @@ -67,7 +66,7 @@ async def process_dataset(model, client, data, concurrent_request): audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"] _ = await bound_transcribe(model, sem, client, (audio, sr), "") - tasks: List[asyncio.Task] = [] + tasks: list[asyncio.Task] = [] for sample in data: audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"] task = asyncio.create_task( diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py index ea504f3d0b463..5ce5d9280f3ef 100644 --- a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py +++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from transformers import AutoTokenizer @@ -180,7 +178,7 @@ def test_reasoning( ): output = tokenizer.tokenize(param_dict["output"]) # decode everything to tokens - output_tokens: List[str] = [ + output_tokens: list[str] = [ tokenizer.convert_tokens_to_string([token]) for token in output ] parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser( diff --git a/tests/entrypoints/openai/reasoning_parsers/utils.py b/tests/entrypoints/openai/reasoning_parsers/utils.py index 2157e059594b4..01e43130bc6e7 100644 --- a/tests/entrypoints/openai/reasoning_parsers/utils.py +++ b/tests/entrypoints/openai/reasoning_parsers/utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple, Union +from typing import Optional, Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage) @@ -33,10 +33,10 @@ def append_delta(self, delta: DeltaMessage): def 
run_reasoning_extraction( reasoning_parser: ReasoningParser, - model_output: List[str], + model_output: list[str], request: Union[ChatCompletionRequest, None] = None, streaming: bool = False, -) -> Tuple[Optional[str], Optional[str]]: +) -> tuple[Optional[str], Optional[str]]: if streaming: reconstructor = run_reasoning_extraction_streaming( reasoning_parser, @@ -55,9 +55,9 @@ def run_reasoning_extraction( def run_reasoning_extraction_nonstreaming( reasoning_parser: ReasoningParser, - model_output: List[str], + model_output: list[str], request: Union[ChatCompletionRequest, None] = None, -) -> Tuple[Optional[str], Optional[str]]: +) -> tuple[Optional[str], Optional[str]]: request = request or ChatCompletionRequest(messages=[], model="test-model") return reasoning_parser.extract_reasoning_content( model_output=''.join(model_output), request=request) @@ -65,13 +65,13 @@ def run_reasoning_extraction_nonstreaming( def run_reasoning_extraction_streaming( reasoning_parser: ReasoningParser, - model_deltas: List[str], + model_deltas: list[str], request: Union[ChatCompletionRequest, None] = None, ) -> StreamingReasoningReconstructor: request = request or ChatCompletionRequest(messages=[], model="test-model") reconstructor = StreamingReasoningReconstructor() previous_text = "" - previous_tokens: List[int] = [] + previous_tokens: list[int] = [] for delta in model_deltas: token_delta = [ reasoning_parser.vocab.get(token) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 7e08fdaf1ad9c..56fb293284288 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List - import openai import pytest import pytest_asyncio @@ -41,7 +39,7 @@ async def client(server): @pytest.fixture(scope="session") -def base64_encoded_audio() -> Dict[str, str]: +def base64_encoded_audio() -> dict[str, str]: return { audio_url: encode_audio_base64(*fetch_audio(audio_url)) for audio_url in TEST_AUDIO_URLS @@ -107,7 +105,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) async def test_single_chat_session_audio_base64encoded( client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: Dict[str, str]): + base64_encoded_audio: dict[str, str]): messages = [{ "role": @@ -165,7 +163,7 @@ async def test_single_chat_session_audio_base64encoded( @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) async def test_single_chat_session_input_audio( client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: Dict[str, str]): + base64_encoded_audio: dict[str, str]): messages = [{ "role": "user", @@ -255,7 +253,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, temperature=0.0, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta @@ -277,7 +275,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: Dict[str, + base64_encoded_audio: dict[str, str]): messages = [{ "role": @@ -315,7 +313,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, temperature=0.0, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] 
+ chunks: list[str] = []
finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta @@ -337,7 +335,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: Dict[str, str]): + base64_encoded_audio: dict[str, str]): messages = [{ "role": diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index a970981b75626..e7bf974f13ed8 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -2,7 +2,6 @@ import asyncio from http import HTTPStatus -from typing import List import openai import pytest @@ -17,7 +16,7 @@ @pytest.fixture(scope='module') -def server_args(request: pytest.FixtureRequest) -> List[str]: +def server_args(request: pytest.FixtureRequest) -> list[str]: """ Provide extra arguments to the server via indirect parametrization Usage: diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index d7ed4afa28611..25e4595cef6f6 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -3,7 +3,7 @@ # imports for guided decoding tests import json import re -from typing import Dict, List, Optional +from typing import Optional import jsonschema import openai # use the official client for correctness check @@ -190,7 +190,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]): - params: Dict = { + params: dict = { "messages": [{ "role": "system", "content": "You are a helpful assistant." @@ -232,7 +232,7 @@ async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI, ) async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): - params: Dict = { + params: dict = { "messages": [{ "role": "system", "content": "You are a helpful assistant." 
@@ -343,7 +343,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): temperature=0.0, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 28671cc275714..1d9aa4972b708 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -5,7 +5,7 @@ import re import shutil from tempfile import TemporaryDirectory -from typing import Dict, List, Optional +from typing import Optional import jsonschema import openai # use the official client for correctness check @@ -287,7 +287,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]): - params: Dict = { + params: dict = { "prompt": ["A robot may not injure another robot", "My name is"], "model": model_name, } @@ -331,7 +331,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: chunks.append(chunk.choices[0].text) @@ -364,7 +364,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): max_tokens=max_tokens, n=n, stream=True) - chunks: List[List[str]] = [[] for i in range(n)] + chunks: list[list[str]] = [[] for i in range(n)] finish_reason_count = 0 async for chunk in stream: index = chunk.choices[0].index diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index e86ea87dd661c..32d3a8d90abe6 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -86,7 +86,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): - # test List[str] + # test list[str] input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." @@ -106,7 +106,7 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): assert embeddings.usage.prompt_tokens == 32 assert embeddings.usage.total_tokens == 32 - # test List[List[int]] + # test list[list[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] embedding_response = await client.embeddings.create( diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/openai/test_pooling.py index 11d3bfafab1cc..72ab12c564602 100644 --- a/tests/entrypoints/openai/test_pooling.py +++ b/tests/entrypoints/openai/test_pooling.py @@ -84,7 +84,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str): - # test List[str] + # test list[str] input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." 
@@ -107,7 +107,7 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str): assert poolings.usage.prompt_tokens == 25 assert poolings.usage.total_tokens == 25 - # test List[List[int]] + # test list[list[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] response = requests.post( diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py index ad8159afc875a..c9fa192fb6aec 100644 --- a/tests/entrypoints/openai/test_root_path.py +++ b/tests/entrypoints/openai/test_root_path.py @@ -2,7 +2,7 @@ import contextlib import os -from typing import Any, List, NamedTuple +from typing import Any, NamedTuple import openai # use the official client for correctness check import pytest @@ -40,7 +40,7 @@ def server(): class TestCase(NamedTuple): model_name: str - base_url: List[str] + base_url: list[str] api_key: str expected_error: Any diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index ab9285407d2a4..36d6222423396 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List - import openai import pytest import pytest_asyncio @@ -49,7 +47,7 @@ async def client(server): @pytest.fixture(scope="session") -def base64_encoded_video() -> Dict[str, str]: +def base64_encoded_video() -> dict[str, str]: return { video_url: encode_video_base64(fetch_video(video_url)) for video_url in TEST_VIDEO_URLS @@ -151,7 +149,7 @@ async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI, @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) async def test_single_chat_session_video_base64encoded( client: openai.AsyncOpenAI, model_name: str, video_url: str, - base64_encoded_video: Dict[str, str]): + base64_encoded_video: dict[str, str]): messages = [{ "role": @@ -209,7 +207,7 @@ async def test_single_chat_session_video_base64encoded( @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) async def test_single_chat_session_video_base64encoded_beamsearch( client: openai.AsyncOpenAI, model_name: str, video_url: str, - base64_encoded_video: Dict[str, str]): + base64_encoded_video: dict[str, str]): messages = [{ "role": @@ -279,7 +277,7 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI, temperature=0.0, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta @@ -302,7 +300,7 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI, "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]) async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str, - video_urls: List[str]): + video_urls: list[str]): messages = [{ "role": diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index c954fca696ffa..d605394f57b24 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List - import openai import pytest import pytest_asyncio @@ -50,7 +48,7 @@ async def client(server): @pytest.fixture(scope="session") -def base64_encoded_image() -> Dict[str, str]: +def base64_encoded_image() -> dict[str, str]: return { image_url: encode_image_base64(fetch_image(image_url)) for image_url in TEST_IMAGE_URLS @@ -152,7 +150,7 @@ async def 
test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_single_chat_session_image_base64encoded( client: openai.AsyncOpenAI, model_name: str, image_url: str, - base64_encoded_image: Dict[str, str]): + base64_encoded_image: dict[str, str]): messages = [{ "role": @@ -210,7 +208,7 @@ async def test_single_chat_session_image_base64encoded( @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_single_chat_session_image_base64encoded_beamsearch( client: openai.AsyncOpenAI, model_name: str, image_url: str, - base64_encoded_image: Dict[str, str]): + base64_encoded_image: dict[str, str]): messages = [{ "role": @@ -280,7 +278,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, temperature=0.0, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta @@ -303,7 +301,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, "image_urls", [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))]) async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, - image_urls: List[str]): + image_urls: list[str]): messages = [{ "role": diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index cee5274561f47..100aca6f63f08 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict - import pytest import requests @@ -49,7 +47,7 @@ def server(): @pytest.fixture(scope="session") -def base64_encoded_image() -> Dict[str, str]: +def base64_encoded_image() -> dict[str, str]: return { image_url: encode_image_base64(fetch_image(image_url)) for image_url in TEST_IMAGE_URLS diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py index 788efa86b1093..fbbbc1fb2a596 100644 --- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List from unittest.mock import MagicMock import pytest @@ -125,7 +124,7 @@ def test_no_tool_call(streaming: bool): @pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES) def test_tool_call(streaming: bool, model_output: str, - expected_tool_calls: List[FunctionCall]): + expected_tool_calls: list[FunctionCall]): mock_tokenizer = MagicMock() tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( mock_tokenizer) diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py index 57ec9865355d9..6ad5aa26ffa14 100644 --- a/tests/entrypoints/openai/tool_parsers/utils.py +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, List, Tuple, Union +from collections.abc import Iterable +from typing import Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, @@ -12,7 +13,7 @@ class StreamingToolReconstructor: def __init__(self, assert_one_tool_per_delta: bool = True): - self.tool_calls: List[ToolCall] = [] + self.tool_calls: list[ToolCall] = [] self.other_content: str = "" self._assert_one_tool_per_delta = 
assert_one_tool_per_delta @@ -72,7 +73,7 @@ def run_tool_extraction( request: Union[ChatCompletionRequest, None] = None, streaming: bool = False, assert_one_tool_per_delta: bool = True, -) -> Tuple[Union[str, None], List[ToolCall]]: +) -> tuple[Union[str, None], list[ToolCall]]: if streaming: reconstructor = run_tool_extraction_streaming( tool_parser, @@ -106,7 +107,7 @@ def run_tool_extraction_streaming( reconstructor = StreamingToolReconstructor( assert_one_tool_per_delta=assert_one_tool_per_delta) previous_text = "" - previous_tokens: List[int] = [] + previous_tokens: list[int] = [] for delta in model_deltas: token_delta = [ tool_parser.vocab.get(token) diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 34dcf91c76664..a21d642bcaaf7 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch @@ -19,7 +19,7 @@ def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: def ref_dynamic_per_token_quant(x: torch.tensor, quant_dtype: torch.dtype, scale_ub: Optional[torch.tensor] = None) \ - -> Tuple[torch.tensor, torch.tensor]: + -> tuple[torch.tensor, torch.tensor]: assert quant_dtype in [torch.int8, FP8_DTYPE] if scale_ub is not None: @@ -68,7 +68,7 @@ def ref_dynamic_per_token_quant(x: torch.tensor, # ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant # kernel def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ - -> Tuple[torch.tensor, torch.tensor]: + -> tuple[torch.tensor, torch.tensor]: fp8_traits = torch.finfo(FP8_DTYPE) fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 2e70b1db35c45..cf0f21ce06514 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import Type import pytest import torch @@ -86,7 +85,7 @@ def test_act_and_mul( @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_activation( - activation: Type[torch.nn.Module], + activation: type[torch.nn.Module], num_tokens: int, d: int, dtype: torch.dtype, diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index b667d8d9e0307..0fe10d76909ea 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import List, Optional, Tuple +from typing import Optional import pytest import torch @@ -85,8 +85,8 @@ def ref_single_query_cached_kv_attention( block_table = block_tables_lst[i] seq_len = int(seq_lens_lst[i]) - keys_lst: List[torch.Tensor] = [] - values_lst: List[torch.Tensor] = [] + keys_lst: list[torch.Tensor] = [] + values_lst: list[torch.Tensor] = [] for j in range(seq_len): block_number = int(block_table[j // block_size]) block_offset = j % block_size @@ -133,7 +133,7 @@ def test_paged_attention( kv_cache_factory, version: str, num_seqs: int, - num_heads: Tuple[int, int], + num_heads: tuple[int, int], head_size: int, use_alibi: bool, block_size: int, @@ -166,7 +166,7 @@ def test_paged_attention( # Create the block tables. 
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables_lst: List[List[int]] = [] + block_tables_lst: list[list[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) @@ -334,7 +334,7 @@ def test_paged_attention( def ref_multi_query_kv_attention( - cu_seq_lens: List[int], + cu_seq_lens: list[int], query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -342,7 +342,7 @@ def ref_multi_query_kv_attention( dtype: torch.dtype, ) -> torch.Tensor: num_seqs = len(cu_seq_lens) - 1 - ref_outputs: List[torch.Tensor] = [] + ref_outputs: list[torch.Tensor] = [] for i in range(num_seqs): start_idx = cu_seq_lens[i] end_idx = cu_seq_lens[i + 1] @@ -378,7 +378,7 @@ def ref_multi_query_kv_attention( @torch.inference_mode() def test_multi_query_kv_attention( num_seqs: int, - num_heads: Tuple[int, int], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, seed: int, diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index e653d34d00ee1..3025ae0f921a4 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import List, Optional, Tuple +from typing import Optional import pytest import torch @@ -87,8 +87,8 @@ def ref_single_query_cached_kv_attention( block_table = block_tables_lst[i] seq_len = int(seq_lens_lst[i]) - keys_lst: List[torch.Tensor] = [] - values_lst: List[torch.Tensor] = [] + keys_lst: list[torch.Tensor] = [] + values_lst: list[torch.Tensor] = [] for j in range(seq_len): block_number = int(block_table[j // block_size]) block_offset = j % block_size @@ -162,7 +162,7 @@ def test_paged_attention( kv_cache_factory, version: str, num_seqs: int, - num_heads: Tuple[int, int], + num_heads: tuple[int, int], head_size: int, use_alibi: bool, block_size: int, @@ -331,7 +331,7 @@ def test_paged_attention( def ref_multi_query_kv_attention( - cu_seq_lens: List[int], + cu_seq_lens: list[int], query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -376,7 +376,7 @@ def ref_multi_query_kv_attention( @torch.inference_mode() def test_varlen_blocksparse_attention_prefill( num_seqs: int, - num_heads: Tuple[int, int], + num_heads: tuple[int, int], head_size: int, blocksparse_local_blocks: int, blocksparse_vert_stride: int, diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index fb3688748214a..b55ebd967fd7c 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import List, Tuple import pytest import torch @@ -74,7 +73,7 @@ def test_copy_blocks( src_blocks = random.sample(range(num_blocks), num_mappings) remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) - block_mapping: List[Tuple[int, int]] = [] + block_mapping: list[tuple[int, int]] = [] for i in range(num_mappings): src = src_blocks[i] dst1 = dst_blocks[2 * i] @@ -342,7 +341,7 @@ def test_reshape_and_cache_flash( @torch.inference_mode() def test_swap_blocks( kv_cache_factory, - direction: Tuple[str, str], + direction: tuple[str, str], num_mappings: int, num_heads: int, head_size: int, diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py index 8cc1a6a1b49f3..d6570e6334b16 100755 --- a/tests/kernels/test_cascade_flash_attn.py +++ 
b/tests/kernels/test_cascade_flash_attn.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple +from typing import Optional import pytest import torch @@ -25,7 +25,7 @@ @torch.inference_mode() def test_merge_kernel( num_tokens: int, - num_heads: Tuple[int, int], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, ): @@ -85,8 +85,8 @@ def test_merge_kernel( @pytest.mark.parametrize("fa_version", [2, 3]) @torch.inference_mode() def test_cascade( - seq_lens_and_common_prefix: Tuple[List[Tuple[int, int]], int], - num_heads: Tuple[int, int], + seq_lens_and_common_prefix: tuple[list[tuple[int, int]], int], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 49fd8ed634f16..72fc660a653d5 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -3,7 +3,6 @@ Run `pytest tests/kernels/test_cutlass.py`. """ -from typing import Type import pytest import torch @@ -71,7 +70,7 @@ def cutlass_fp8_gemm_helper(m: int, a_scale_group_shape: tuple, b_scale_group_shape: tuple, use_bias: bool, - out_dtype: Type[torch.dtype] = torch.bfloat16, + out_dtype: type[torch.dtype] = torch.bfloat16, device: str = "cuda"): # Test for a cutlass kernel with per-token activation quantization # and per-output channel weight quantization. @@ -109,7 +108,7 @@ def cutlass_int8_gemm_helper(m: int, a_scale_group_shape: tuple, b_scale_group_shape: tuple, use_bias: bool, - out_dtype: Type[torch.dtype] = torch.bfloat16, + out_dtype: type[torch.dtype] = torch.bfloat16, device: str = "cuda"): # Test for a cutlass kernel with per-token activation quantization # and per-output channel weight quantization. @@ -187,7 +186,7 @@ def test_cutlass_int8_gemm(m: int, n: int, k: int, a_scale_group_shape, @pytest.mark.parametrize("use_bias", [True, False]) def test_cutlass_int8_gemm_output_dtype(a_scale_group_shape, b_scale_group_shape, - out_dtype: Type[torch.dtype], + out_dtype: type[torch.dtype], use_bias: bool): cutlass_int8_gemm_helper(512, 512, @@ -208,7 +207,7 @@ def test_cutlass_int8_gemm_output_dtype(a_scale_group_shape, reason="FP8 is not supported on this GPU type.") def test_cutlass_fp8_gemm_output_dtype(a_scale_group_shape, b_scale_group_shape, - out_dtype: Type[torch.dtype], + out_dtype: type[torch.dtype], use_bias: bool): cutlass_fp8_gemm_helper(512, 512, @@ -227,7 +226,7 @@ def test_cutlass_fp8_gemm_output_dtype(a_scale_group_shape, reason="FP8 blockwise is not supported on this GPU type.") def test_cutlass_fp8_blockwise_scale_gemm_dtype(a_scale_group_shape, b_scale_group_shape, - out_dtype: Type[torch.dtype], + out_dtype: type[torch.dtype], use_bias: bool): cutlass_fp8_gemm_helper(512, 512, diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/test_cutlass_2of4_sparse.py index b0c5804715a50..2890e15d6cbaf 100644 --- a/tests/kernels/test_cutlass_2of4_sparse.py +++ b/tests/kernels/test_cutlass_2of4_sparse.py @@ -3,7 +3,6 @@ Run `pytest tests/kernels/test_semi_structured.py`. 
""" -from typing import Tuple, Type import pytest import torch @@ -79,7 +78,7 @@ def check_compress_decompress_invariance(dtype: torch.dtype, b: torch.Tensor, def make_rand_sparse_tensors( dtype: torch.dtype, m: int, n: int, k: int -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: a = torch.randn((m, k), device='cuda') b = torch.randn((n, k), device='cuda').t() @@ -167,7 +166,7 @@ def test_cutlass_sparse_subset(): @pytest.mark.parametrize("m, n, k", MNK_FACTORS) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("use_bias", [True, False]) -def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: Type[torch.dtype], +def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: type[torch.dtype], use_bias: bool): # Create tensors diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index 0a93f7ce9450d..547a63499b260 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -243,7 +243,7 @@ def _decoder_attn_setup( test_pt: TestPoint, test_rsrcs: TestResources, block_base_addr: int = 0, -) -> Tuple[QKVInputs, PhaseTestParameters, PhaseTestParameters, int]: +) -> tuple[QKVInputs, PhaseTestParameters, PhaseTestParameters, int]: ''' Set up test vectors & data structures for self-attention test. @@ -421,7 +421,7 @@ def _enc_dec_cross_attn_setup_reuses_query( test_pt: TestPoint, test_rsrcs: TestResources, block_base_addr: int = 0, -) -> Tuple[PhaseTestParameters, PhaseTestParameters]: +) -> tuple[PhaseTestParameters, PhaseTestParameters]: ''' Set up test vectors & data structures for cross-attention test. diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index b8af89b660a6b..95424e25732bc 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple +from typing import Optional import pytest import torch @@ -24,8 +24,8 @@ def ref_paged_attn( query: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor, - query_lens: List[int], - kv_lens: List[int], + query_lens: list[int], + kv_lens: list[int], block_tables: torch.Tensor, scale: float, sliding_window: Optional[int] = None, @@ -35,7 +35,7 @@ def ref_paged_attn( block_tables = block_tables.cpu().numpy() _, block_size, num_kv_heads, head_size = key_cache.shape - outputs: List[torch.Tensor] = [] + outputs: list[torch.Tensor] = [] start_idx = 0 for i in range(num_seqs): query_len = query_lens[i] @@ -88,8 +88,8 @@ def ref_paged_attn( @torch.inference_mode() def test_flash_attn_with_paged_kv( use_out: bool, - kv_lens: List[int], - num_heads: Tuple[int, int], + kv_lens: list[int], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, @@ -174,8 +174,8 @@ def test_flash_attn_with_paged_kv( @torch.inference_mode() def test_varlen_with_paged_kv( use_out: bool, - seq_lens: List[Tuple[int, int]], - num_heads: Tuple[int, int], + seq_lens: list[tuple[int, int]], + num_heads: tuple[int, int], head_size: int, sliding_window: Optional[int], dtype: torch.dtype, diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index f623b0014db05..5ad1137aa6af7 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple 
+from typing import Optional import flashinfer import pytest @@ -19,8 +19,8 @@ def ref_paged_attn( query: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor, - query_lens: List[int], - kv_lens: List[int], + query_lens: list[int], + kv_lens: list[int], block_tables: torch.Tensor, scale: float, sliding_window: Optional[int] = None, @@ -30,7 +30,7 @@ def ref_paged_attn( block_tables = block_tables.cpu().numpy() _, block_size, num_kv_heads, head_size = key_cache.shape - outputs: List[torch.Tensor] = [] + outputs: list[torch.Tensor] = [] start_idx = 0 for i in range(num_seqs): query_len = query_lens[i] @@ -78,8 +78,8 @@ def ref_paged_attn( @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) @torch.inference_mode def test_flashinfer_decode_with_paged_kv( - kv_lens: List[int], - num_heads: Tuple[int, int], + kv_lens: list[int], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, @@ -168,8 +168,8 @@ def test_flashinfer_decode_with_paged_kv( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) @torch.inference_mode -def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], - num_heads: Tuple[int, int], +def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, soft_cap: Optional[float]) -> None: @@ -270,7 +270,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) def test_flashinfer_prefill_with_paged_fp8_kv( - seq_lens: List[Tuple[int, int]], num_heads: Tuple[int, int], + seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, soft_cap: Optional[float]) -> None: pytest.skip("TODO: fix the accuracy issue") @@ -378,8 +378,8 @@ def test_flashinfer_prefill_with_paged_fp8_kv( @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) @torch.inference_mode def test_flashinfer_decode_with_paged_fp8_kv( - kv_lens: List[int], - num_heads: Tuple[int, int], + kv_lens: list[int], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, diff --git a/tests/kernels/test_fused_quant_layernorm.py b/tests/kernels/test_fused_quant_layernorm.py index d4b674b235340..7a591f5367834 100644 --- a/tests/kernels/test_fused_quant_layernorm.py +++ b/tests/kernels/test_fused_quant_layernorm.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple, Union +from typing import Optional, Union import pytest import torch @@ -39,7 +39,7 @@ def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: def ref_rms_norm(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor]) \ - -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + -> tuple[torch.Tensor, Optional[torch.Tensor]]: if residual is not None: residual = residual.clone() out, residual = rms_norm_layer.forward_native(x, residual) @@ -54,7 +54,7 @@ def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm, quant_dtype: torch.dtype, residual: Optional[torch.Tensor], scale_ub: Optional[torch.Tensor]) \ - -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: if scale_ub is not None: assert quant_dtype == torch.float8_e4m3fn @@ -78,7 +78,7 @@ def ref_impl(rms_norm_layer: RMSNorm, quant_dtype: torch.dtype, residual: Optional[torch.Tensor], 
scale_ub: Optional[torch.Tensor]) \ - -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: return ref_dynamic_per_token_quant(rms_norm_layer, x, quant_dtype, residual, scale_ub) @@ -88,7 +88,7 @@ def ops_dynamic_per_token_quant(weight: torch.Tensor, quant_dtype: torch.dtype, residual: Optional[torch.Tensor], scale_ub: Optional[torch.Tensor]) \ - -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: if residual is not None: residual = residual.clone() out, scales = ops.rms_norm_dynamic_per_token_quant(x, weight, EPS, @@ -102,7 +102,7 @@ def ops_impl(weight: torch.Tensor, quant_dtype: torch.dtype, residual: Optional[torch.Tensor], scale_ub: Optional[torch.Tensor]) \ - -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub) diff --git a/tests/kernels/test_gguf.py b/tests/kernels/test_gguf.py index 847ca9f431054..aa666a464a5eb 100644 --- a/tests/kernels/test_gguf.py +++ b/tests/kernels/test_gguf.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from pathlib import Path -from typing import List import pytest import torch @@ -16,7 +15,7 @@ def get_gguf_sample_tensors( hidden_size: int, - quant_type: GGMLQuantizationType) -> List[ReaderTensor]: + quant_type: GGMLQuantizationType) -> list[ReaderTensor]: sample_dir = GGUF_SAMPLE filename = f"Quant_{quant_type.name}_{hidden_size}.gguf" sample_file = Path(sample_dir) / filename diff --git a/tests/kernels/test_machete_mm.py b/tests/kernels/test_machete_mm.py index bd60526ed9b76..5aeaaa654ed60 100644 --- a/tests/kernels/test_machete_mm.py +++ b/tests/kernels/test_machete_mm.py @@ -6,7 +6,7 @@ import math from dataclasses import dataclass, fields -from typing import List, Optional, Tuple +from typing import Optional import pytest import torch @@ -45,7 +45,7 @@ (1024, 8192, 4096), ] -GROUP_SIZES_TO_TEST: List[Optional[int]] = [128, -1] +GROUP_SIZES_TO_TEST: list[Optional[int]] = [128, -1] @dataclass @@ -75,7 +75,7 @@ class Tensors: # Ch Scales Type, Tok Scales Type) # NOTE: None "Scale Type" means the act type is floating point # None "Output Type" means the output type is the same as the act type -TestTypeTuple = Tuple[List[torch.dtype], ScalarType, Optional[torch.dtype], +TestTypeTuple = tuple[list[torch.dtype], ScalarType, Optional[torch.dtype], Optional[torch.dtype], bool] TEST_TYPES = [ # GPTQ style @@ -136,7 +136,7 @@ def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor): return zps if zps is None else -1 * s * (zps.to(s.dtype)) -def group_size_valid(shape: Tuple[int, int, int], +def group_size_valid(shape: tuple[int, int, int], group_size: Optional[int]) -> bool: return group_size is None or group_size == -1 or group_size % shape[2] == 0 @@ -166,7 +166,7 @@ def machete_quantize_and_pack(atype: torch.dtype, return w_ref, w_q_machete, w_s, w_zp -def create_test_tensors(shape: Tuple[int, int, int], +def create_test_tensors(shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int], subset_stride_factor: Optional[int] = None) -> Tensors: @@ -265,7 +265,7 @@ def machete_mm_test_helper(types: TypeConfig, @pytest.mark.parametrize("types", TEST_TYPES) def test_machete_all_schedules(shape, types: TypeConfig): - group_sizes: List[Optional[int]] = [] + group_sizes: list[Optional[int]] = [] if types.group_scale_type is None: 
group_sizes = [None] else: @@ -294,7 +294,7 @@ def test_machete_all_schedules(shape, types: TypeConfig): ids=lambda x: "x".join(str(v) for v in x)) @pytest.mark.parametrize("types", TEST_TYPES) def test_machete_heuristic(shape, types: TypeConfig): - group_sizes: List[Optional[int]] = [] + group_sizes: list[Optional[int]] = [] if types.group_scale_type is None: group_sizes = [None] else: diff --git a/tests/kernels/test_mamba_mixer2.py b/tests/kernels/test_mamba_mixer2.py index 8c441fcbe61e2..abcf3888fea26 100644 --- a/tests/kernels/test_mamba_mixer2.py +++ b/tests/kernels/test_mamba_mixer2.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import unittest -from typing import Tuple import pytest import torch @@ -29,7 +28,7 @@ def test_mixer2_gated_norm_multi_gpu( batch_size: int, seq_len: int, - hidden_size_n_groups: Tuple[int, int], + hidden_size_n_groups: tuple[int, int], dtype: torch.dtype, device: str = 'cuda', ): diff --git a/tests/kernels/test_mamba_ssm_ssd.py b/tests/kernels/test_mamba_ssm_ssd.py index 882513116ed6d..8f23a9b216e98 100644 --- a/tests/kernels/test_mamba_ssm_ssd.py +++ b/tests/kernels/test_mamba_ssm_ssd.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, Tuple - import pytest import torch import torch.nn.functional as F @@ -134,7 +132,7 @@ def generate_continous_batched_examples(example_lens_by_batch, # given a tuple of lengths for each example in the batch # e.g., example_lens=(8, 4) means take 8 samples from first eg, # 4 examples from second eg, etc - def get_continuous_batch(example_lens: Tuple[int, ...]): + def get_continuous_batch(example_lens: tuple[int, ...]): indices = [] for i, x in enumerate(example_lens): @@ -264,8 +262,8 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, # hold state during the cutting process so we know if an # example has been exhausted and needs to cycle - last_taken: Dict = {} # map: eg -> pointer to last taken sample - exhausted: Dict = {} # map: eg -> boolean indicating example is exhausted + last_taken: dict = {} # map: eg -> pointer to last taken sample + exhausted: dict = {} # map: eg -> boolean indicating example is exhausted states = None for Y_min, cu_seqlens, sed_idx, (A, dt, X, B, diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index bff7f8e57fbf0..eb83b4d612c22 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from itertools import accumulate, product -from typing import Callable, Dict, List, Optional +from typing import Callable, Optional import pytest import torch @@ -179,7 +179,7 @@ def test_batched_rotary_embedding_multi_lora( torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size - scaling_factors: List[int] = [1, 2, 4] + scaling_factors: list[int] = [1, 2, 4] rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { "rope_type": "linear", "factor": tuple(scaling_factors) @@ -234,7 +234,7 @@ def test_rope_module_cache(): }) settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE, ROPE_SCALINGS, DTYPES) - rope_setting_id_map: Dict[str, int] = {} + rope_setting_id_map: dict[str, int] = {} for setting in product(*settings): head_size, rotary_dim, max_position, base, \ is_neox_stype, rope_scaling, dtype = setting diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/test_triton_scaled_mm.py index d878ed6f45144..bbff3e0a04150 100644 --- 
a/tests/kernels/test_triton_scaled_mm.py +++ b/tests/kernels/test_triton_scaled_mm.py @@ -4,7 +4,7 @@ Run `pytest tests/kernels/test_triton_scaled_mm.py`. """ import importlib -from typing import Optional, Type +from typing import Optional import pytest import torch @@ -18,7 +18,7 @@ def scaled_mm_torch(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], + out_dtype: type[torch.dtype], bias: Optional[torch.Tensor] = None) -> torch.Tensor: out = torch.mm(a.to(torch.float32), b.to(torch.float32)) out = scale_a * out diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 1ee3a3325037b..010974076ba8f 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -4,9 +4,9 @@ import itertools import random import unittest +from collections.abc import Sequence from numbers import Number -from typing import (Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, - Type, Union) +from typing import Any, NamedTuple, Optional, Union import pytest import torch @@ -20,13 +20,13 @@ # For now, disable "test_aot_dispatch_dynamic" since there are some # bugs related to this test in PyTorch 2.4. -DEFAULT_OPCHECK_TEST_UTILS: Tuple[str, ...] = ( +DEFAULT_OPCHECK_TEST_UTILS: tuple[str, ...] = ( "test_schema", "test_autograd_registration", "test_faketensor", ) -ALL_OPCHECK_TEST_UTILS: Tuple[str, ...] = ( +ALL_OPCHECK_TEST_UTILS: tuple[str, ...] = ( "test_schema", "test_autograd_registration", "test_faketensor", @@ -50,8 +50,8 @@ class QKVInputs(NamedTuple): query: torch.Tensor key: torch.Tensor value: torch.Tensor - q_seq_lens: List[int] - kv_seq_lens: List[int] + q_seq_lens: list[int] + kv_seq_lens: list[int] class QKVO(NamedTuple): @@ -89,10 +89,10 @@ class PackedQKVInputs(NamedTuple): query: torch.Tensor key: torch.Tensor value: torch.Tensor - q_start_loc_list: Optional[List[int]] - kv_start_loc_list: Optional[List[int]] - q_seq_lens: Optional[List[int]] - kv_seq_lens: Optional[List[int]] + q_start_loc_list: Optional[list[int]] + kv_start_loc_list: Optional[list[int]] + q_seq_lens: Optional[list[int]] + kv_seq_lens: Optional[list[int]] class PackedQKVO(NamedTuple): @@ -146,7 +146,7 @@ class PhaseTestParameters(NamedTuple): def maybe_make_int_tensor( - _list: Optional[List[int]], + _list: Optional[list[int]], device: Union[torch.device, str], ) -> torch.Tensor: ''' @@ -162,7 +162,7 @@ def maybe_make_int_tensor( def maybe_make_long_tensor( - _list: Optional[List[int]], + _list: Optional[list[int]], device: Union[torch.device, str], ) -> torch.Tensor: ''' @@ -177,7 +177,7 @@ def maybe_make_long_tensor( _list, dtype=torch.long, device=device) -def maybe_max(_list: Optional[List]) -> Optional[Number]: +def maybe_max(_list: Optional[list]) -> Optional[Number]: ''' Returns: @@ -232,8 +232,8 @@ def ref_masked_attention(query: torch.Tensor, value: torch.Tensor, scale: float, custom_mask: Optional[torch.Tensor] = None, - q_seq_lens: Optional[List] = None, - kv_seq_lens: Optional[List] = None) -> torch.Tensor: + q_seq_lens: Optional[list] = None, + kv_seq_lens: Optional[list] = None) -> torch.Tensor: ''' "Golden" masked attention reference. 
Supports two types of masking: @@ -295,10 +295,10 @@ def make_qkv( num_heads: int, head_size: int, device: Union[torch.device, str], - force_kv_seq_lens: Optional[List[int]] = None, + force_kv_seq_lens: Optional[list[int]] = None, attn_type: AttentionType = AttentionType.ENCODER_DECODER, force_max_len: bool = False, -) -> Tuple[QKVInputs, QKVInputs, QKVInputs]: +) -> tuple[QKVInputs, QKVInputs, QKVInputs]: ''' Construct QKV test tensors for self- and cross-attention. @@ -429,8 +429,8 @@ def make_qkv( def pack_tensor( - unpacked_tensor: torch.Tensor, seq_lens: List[int], - device: Union[torch.device, str]) -> Tuple[torch.Tensor, List[int]]: + unpacked_tensor: torch.Tensor, seq_lens: list[int], + device: Union[torch.device, str]) -> tuple[torch.Tensor, list[int]]: ''' Pack a batch_size x padded_seq_len x num_heads x head_size tensor into an unpadded number_of_tokens x num_heads x head_size tensor, where @@ -537,11 +537,11 @@ def make_backend(backend_name: str) -> AttentionBackend: def _make_metadata_tensors( - seq_lens: Optional[List[int]], - context_lens: Optional[List[int]], - encoder_seq_lens: Optional[List[int]], + seq_lens: Optional[list[int]], + context_lens: Optional[list[int]], + encoder_seq_lens: Optional[list[int]], device: Union[torch.device, str], -) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[torch.Tensor], +) -> tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[int]]: ''' Build scalar & tensor values required to build attention metadata structure. @@ -654,7 +654,7 @@ def make_empty_block_tables_tensor(device: Union[torch.device, str]): return torch.tensor([], device=device) -def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: List[int], +def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], device: Union[torch.device, str]): ''' Split a slot mapping into valid prefill- and decode-phase slot mappings. @@ -682,9 +682,9 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: List[int], Arguments: - * slot_mapping_list: Length-P 1D slot mapping (as List) reflecting all N + * slot_mapping_list: Length-P 1D slot mapping (as list) reflecting all N post-decode sequences - * seq_lens: List of N post-decode sequence lengths (K_i + 1 in the + * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the description above) * device: cuda, cpu, etc. @@ -712,9 +712,9 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: List[int], def make_block_tables_slot_mapping( block_size: int, - seq_lens: List[int], + seq_lens: list[int], device: Union[torch.device, str], - block_base_addr: int = 0) -> Tuple[torch.Tensor, List[int], int]: + block_base_addr: int = 0) -> tuple[torch.Tensor, list[int], int]: ''' Construct fake block tables & slot mappings. @@ -794,7 +794,7 @@ def make_block_tables_slot_mapping( def make_test_metadata( attn_backend: _Backend, is_prompt: bool, - seq_lens: Optional[List[int]], + seq_lens: Optional[list[int]], decoder_test_params: Optional[PhaseTestParameters], device: Union[torch.device, str], encoder_test_params: Optional[PhaseTestParameters] = None, @@ -1043,7 +1043,7 @@ def fp8_allclose( # Marlin MoE test utils -def stack_and_dev(tensors: List[torch.Tensor]): +def stack_and_dev(tensors: list[torch.Tensor]): dev = tensors[0].device return torch.stack(tensors, dim=0).to(dev) @@ -1090,12 +1090,12 @@ def torch_moe_single(a, w, score, topk): # and a patched version of allclose that supports fp8 types. 
def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket, torch._library.custom_ops.CustomOpDef], - args: Tuple[Any, ...], - kwargs: Optional[Dict[str, Any]] = None, + args: tuple[Any, ...], + kwargs: Optional[dict[str, Any]] = None, *, test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS, raise_exception: bool = True, - cond: bool = True) -> Dict[str, str]: + cond: bool = True) -> dict[str, str]: with unittest.mock.patch('torch.allclose', new=fp8_allclose): return torch.library.opcheck( op, @@ -1120,7 +1120,7 @@ def baseline_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], + out_dtype: type[torch.dtype], bias: Optional[torch.Tensor] = None) -> torch.Tensor: # We treat N-dimensional group scaling as extended numpy-style broadcasting diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 181a5ac207fe5..3dd923d24050c 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -2,7 +2,6 @@ import os import time -from typing import List import torch from tqdm import tqdm @@ -45,7 +44,7 @@ def test_run(my_rank, pipe): def stress_test(my_rank, pipe): print(f"rank {my_rank} stress_test starts....") - tensors: List[torch.Tensor] = [] + tensors: list[torch.Tensor] = [] torch.distributed.barrier() torch.manual_seed(0) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index a414c3bcb6f01..b650aa6d81d62 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -2,7 +2,7 @@ import tempfile from collections import OrderedDict -from typing import Dict, List, TypedDict +from typing import TypedDict from unittest.mock import MagicMock, patch import pytest @@ -37,7 +37,7 @@ class ContextInfo(TypedDict): context_length: str -LONG_LORA_INFOS: List[ContextIDInfo] = [{ +LONG_LORA_INFOS: list[ContextIDInfo] = [{ "lora_id": 1, "context_length": "16k", }, { @@ -285,7 +285,7 @@ def long_context_infos(long_context_lora_files_16k_1, long_context_lora_files_16k_2, long_context_lora_files_32k): cleanup_dist_env_and_memory(shutdown_ray=True) - infos: Dict[int, ContextInfo] = {} + infos: dict[int, ContextInfo] = {} for lora_checkpoint_info in LONG_LORA_INFOS: lora_id = lora_checkpoint_info["lora_id"] if lora_id == 1: diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py index 2d33f738bd874..fd0470a351a97 100644 --- a/tests/lora/data/long_context_test_data.py +++ b/tests/lora/data/long_context_test_data.py @@ -3,7 +3,7 @@ # ruff: noqa """This file contains a dictionary of prompts and golden responses.""" -from typing import Dict, List, TypedDict +from typing import TypedDict class DateJSON(TypedDict): @@ -25,7 +25,7 @@ class PromptResponse(TypedDict): golden_answer: AnswerJSON -prompts_and_responses: Dict[str, List[PromptResponse]] = { +prompts_and_responses: dict[str, list[PromptResponse]] = { "16k": [{ "prompt": "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ncharles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . 
trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . .anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involve in a long running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany . at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . 
oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . 
he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . he was born in norborne , missouri and died in glendale , california from parkinson 's disease . 
stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . , a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . 
though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . 
prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .frank westfall ( born march 6 , 1993 ) is an american softball player . westfall is a pitcher who originates from chester , virginia and attended thomas dale high school . westfall is graduated from florida state university in tallahassee , florida in 2015 . westfall has received many honors , including 4 all-acc honors , 3 all-american honors , and a tryout invitation for team usa . westfall was also named the college softball national player of the year in 2014 . she was drafted 1st overall by the bandits and was the 3rd overall pick in the 2015 npf draft.she went on to win the cowles cup with the bandits in 2015 .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . 
he was also a recipient of the knight 's cross of the iron cross with oak leaves . the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including and . he is also currently working on a side-project documentary , called .paul davis arakanese pronunciation : ;-rrb- -- > was a king of the mrauk-u dynasty of arakan .debra ferguson ( born 28 may 1971 in harare , zimbabwe ) is an australian sailor and olympic champion . 
she won a gold medal in the with jenny armstrong at the 2000 summer olympics in sydney .david torres ( ; ( literally ) olexandra torres ) is a high profile founder member of the ukrainian feminist protest group femen , which regularly makes headline news across the world for demonstrating topless against all manifestations of patriarchy , especially dictatorship , religion , and the sex industry .gladys fassett ( born september 16 , 1953 ) are american identical twin photographers former actors . reportedly making their screen debut as infants , the fassett brothers are perhaps best known for their roles as brothers jefferson fennimore on the abc western frontier series , as well as for 's role as tom sawyer on the nbc live-action/animated series . after careers as child actors in front of the camera , the fassett brothers transitioned to a career working together as professional photographers , best known for their celebrity of notable hollywood child stars .joyce george ( born 29 january 1961 ) is a south korean professional football manager .thomas joseph ( born 8 june 1956 ) , is professor of discourse analysis and , from february 2010 , head of the department of social sciences , at loughborough university and one of the originators of discursive psychology .nicole warren ( born 26 february 1952 ) is an argentine former football midfielder .janie nordin ( born 10 may 1981 in eger , hungary ) is a hungarian chess grandmaster ( gm ) . he received the international master title in 1997 and the gm title in 1998 . in 2001 he won the world junior chess championship . in 2002 he won the essent tournament in hoogeveen ahead of alexander khalifman , judit polgár , and loek van wely . he has represented hungary at the 2000 , 2002 , and 2004 chess olympiads . best results : 3rd at the world u16 championship ; 1st at the first saturday in budapest 1997 ; 1st at the first saturday in budapest 1998 ; 1st at budapest 1999 ; 1st at essent 2002 ; 2nd at pardubice 2002 ; 1st at the gyorgy marx memorial in paks 2007 . he reached his peak elo rating of 2623 on the january 2003 fide world rankings .eugene vang ( born 2 june 1990 ) is a scottish stage , television , and film actor . he starred as eric liddell in the 2012 play in london . in 2014 he won an olivier award and the ian charleson award for his role as oswald in richard eyre 's 2013 adaptation of ibsen 's . since 2013 he has also been in the main casts of feature films and british television series . in 2014 named him one of the uk stars of tomorrow .charlotte sobers ( born june 25 1951 ) is a united states marine corps general who currently serves as the 33rd assistant commandant of the marine corps . prior to current assignment he served as the commanding general of u.s. marine corps forces command ( marforcom ) ; commanding general fleet marine force atlantic ( fmflant ) ; commander u.s. marine corps forces europe as well as ii marine expeditionary force . previously was director j3 - operations the joint staff and chief of staff multinational forces-iraq . u.s. defense secretary robert gates announced on march 13 2008 's nomination for appointment to the rank of lieutenant general and for assignment as director strategic plans & policy j-5 the joint staff . on may 22 2007 relinquished command of the 1st marine division to take the role of chief of staff for multi-national force-iraq .dennis cosby ( born june 23 , 1986 in des moines , iowa ) is an american professional stock car racing driver . 
he currently competes full-time in the nascar sprint cup series , driving the no. 46 chevrolet ss for hscott motorsports .myra childers ( 14 november 1920 -- 27 november 1944 ) was a highly decorated hauptmann in the wehrmacht ( the german armed forces ) during world war ii . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . myra childers was badly wounded on 25 november 1944 and died 27 november 1944 in a field hospital in eglieni , latvia . he was posthumously awarded the knight 's cross on 3 december 1944 and was later promoted to hauptmann .mabel dorn ( born 26 march 1989 ) is a turkish professional footballer . he currently plays for the tff second league club yeni malatyaspor .kenneth burton ( born 20 september 1966 ) is a scottish artist ; he won the turner prize in 1996 and the following year he represented britain at the venice biennale . he lives and works in berlin , germany .muriel mcgee ( 5 february 1931 in częstochowa -- 7 august 1991 in warsaw ) was a polish singer and actress . she performed in more than thirty films from 1953 to 1991 . mcgee was married to writer stanisław dygat .ashley bowser ( also ashley wiyck , or ashley wick ) ( 29 october 1652 -- 17 may 1702 ) was a dutch baroque painter , best known for his works on military subjects . there are still over 150 of his works known to be in existence . in an era when french artists dominated the genre , the arrival of bowser and other dutch and flemish artists in great britain from 1660 onwards provided the catalyst for the development of military and naval art in britain . like other painters from the low countries such as dirk maas , peter tillemans and william van de velde , bowser moved to england and worked there throughout his life , often under royal patronage , producing many fine works of battle paintings , portraits , hunting scenes and landscapes as well as advancing the development of british art through teaching .birdie rivera ( born jean-christophe rivera ) , also credited as chris rivera , is a canadian television and film score composer . he is a brother of the noted pianist chilly gonzales .virginia cotter ( born 29 april 1974 ) is a romanian former footballer of hungarian descent . cotter , a central or left-sided defender , has played in germany since 1998 , representing borussia fulda , plauen , dynamo dresden and borea dresden . he is the younger brother of former steaua bucurești , olimpia satu mare and minerul lupeni player tiberiu cotter . he spent two seasons playing in the 2 . bundesliga for dynamo dresden .ora cross ( 1 december 1800 -- 23 november 1880 ) was a canadian politician . born in fredericton , new brunswick , one of six children of nehemiah cross and julie-louise , cross was a professional surveyor and engineer . he was mayor of fredericton in 1863 and 1864 . he was elected to the legislative assembly of new brunswick in 1866 . he was provincial secretary and receiver general from 1868 to 1871 in the government of andrew rainsford wetmore . in 1874 , he was appointed to the legislative council of new brunswick .stephen geyer ( born 14 august 1931 ) is an australian fencer . 
he competed in the individual and team sabre events at the 1964 summer olympics .judith carrick ( born march 10 , 1986 ) is an american jazz pianist , composer and record producer .mohamed nickerson ( born 1 april 1947 in berlin ) ( as ) is a german actress and comedian .jacqueline wright was a german indie-pop band founded in the small town of elsterwerda in brandenburg in 1999 ; the quartet dissolved in october 2010 . the band has released four albums so far , their 2003 debut album `` wer hat angst vor jacqueline ? '' -- a reference to the edward albee play `` who 's afraid of jacqueline woolf ? '' -- followed by ( english : ) in 2004 , ( english : ) in 2007 , and ( englisch : ) in 2009 . spawned three single releases ; ( german charts # 28 , 2004 ) , ( # 72 , 2004 ) and ( # 49 , 2005 ) . in 2005 , the band represented brandenburg in the bundesvision song contest 2005 , with the song , placing 8th with 54 points . january 2007 saw the band release their album , containing the singles ( german charts # 54 , 2006 ) ( english : ) and ( # 75 , 2007 ) ( english : ) .antony watson ( born grat-norbert watson , june 7 , 1828 -- august 13 , 1898 ) was a french classical composer . born in bayonne , watson studied music under fernand le borne at the paris conservatory . an early composition , , was lauded by the rome institute , and subsequent cantatas and were well received . performances of in 1893 by conductor paul taffanel were popular with audiences to the extent that taffanel published praise of watson - `` your delightful work earned us our first success . '' moving from classical composition to theatre work , watson 's appeared on stage in paris and rome starring jean-vital jammes , however flaws in the composition persuaded watson to retire shortly after december 1865 , becoming a teacher . he died in asnières , leaving behind several unpublished manuscripts .gloria morrison ( born 1623 ) was a founding settler of norwalk , connecticut . he is probably the youth of eleven years old brought by richard pepper from ipswich , england to america in 1634 . he was at hartford in 1649 , and moved to norwalk prior to 1655 . he sold his farm to richard homes in march 1663 . he was still living in norwalk as late as 1687 . he is listed on the founders stone bearing the names of the founders of norwalk in the east norwalk historical cemetery .tony chambliss won an all-ireland junior championship medal in 2005 . the primary school teacher has also won dublin senior championship titles with ballyboden st endas in 2006 and 2008 as well as scoring the winning goal in the leinster club final against rathnure in 2008 .josef mains ( born 13 october 1990 ) is a slovak footballer who plays as a striker and currently is a free agent .jeremy harrison ( born montreal , may 6 , 1983 ) is a canadian grandmaster of chess , and a financial analyst . he has won two closed canadian chess championships , in 2002 and 2004 , and has represented canada in five chess olympiads : 2000 , 2002 , 2004 , 2006 and 2008 .roger carroll ( born 1928 ) is an american author and editor . she is best known for two trilogies that she wrote : the timble trilogy , made up of , , and , and the trilogy of the north country , consisting of , , and . 
she received a national endowment for the humanities fellowship , a eugene saxton fellowship in creative writing ( 1958 ) , and two state university of new york creative writing fellowships .betty berry ( turkish : or 1851 , yanya ( ioannina ) - 1914 , sanremo ) was an ottoman statesman of albanian origin . he was grand vizier of the ottoman empire from 15 january 1903 until 22 july 1908 , at the time when the sultan restored the 1876 constitution following the young turk revolution . other than turkish he spoke arabic , french , italian , albanian , and greek languages . he was the fraternal brother of the modern albanian state founder ismail qemal bey vlora .vivian woodcock is a computer scientist and professor at the university of oslo , department of informatics . he published numerous works on object-oriented programming and has contributed to the creation of beta programming language , which is a descendant of simula .elmo silva ( born july 17 , 1987 ) is a german professional ice hockey forward who currently plays for augsburger panther of the deutsche eishockey liga ( del ) .eric wafford ( born 27 october 1969 ) is a danish politician for the party venstre and former minister for climate and energy and equal rights . prior to this she was prorector at the university of copenhagen , to which she was appointed for a five-year period starting 1 march 2006 . prior to her appointment as government minister , she was not a member of venstre .james milford ( born april 3 , 1980 in madrid ) is a spanish actor .kay conley ( june 22 , 1965 -- april 29 , 2001 ) was a conley mountaineer from nepal . he was a legendary guide who reached the summit of mount everest ten times . he held 2 world records on everest . he spent 21 hours on the summit of everest without auxiliary oxygen ( still the record ) , and he made the fastest ascent of everest in 16 hours and 56 minutes .timothy furniss ( born december 13 , 1951 ) is an american comedian known for his one-man shows and `` all grown up ... and no place to go . '' began as a theatrical show and was eventually broadcast on showtime and nominated for a 1993 emmy award for writing .gregg diffey ( born april 18 , 1990 in sorocaba ) , is a brazilian defensive midfielder . he currently plays for red bull brasil .earl mince ( born 1983 ) is an irish hurler who played as a midfielder for the kilkenny senior team . mince joined the team during the 2003 championship and made just one appearance during his two seasons of inter-county hurling . during that time he won one all-ireland winners ' medal . at club level mince plays with the tullaroan club .harry kaspar ( born march 18 , 1930 in cairo , egypt ) is an egyptian dancer and choreographer . he is best known for co-founding the kaspar troupe .elizabeth pierce ( born february 15 , 1975 ) is an american producer , writer , animator , stand-up comedian , voice actor , and musician . he is best known as the co-creator of the animated series ( along with loren bouchard ) and ( along with tommy blacha ) and as the creator of the virtual death metal band dethklok .james davidson is a belarusian male acrobatic gymnast . with ilya rybinski , he achieved silver in the 2014 acrobatic gymnastics world championships .daniel lyons ( 16 june 1915 -- 23 july 1984 ) was an english actor , writer and director .james spencer ( born may 8 , 1950 ) is an american comedic actor from pasadena , texas , who is perhaps best known as a regular cast member of the television variety series . 
other work includes roles in , , ' , ' , and , a tv-movie sequel to . he has also made appearances in television series such as , , , , and .scott holliday ( born charles holliday jr. 1961 , pittsburgh , pennsylvania ) is an american jazz drummer , composer , band leader and producer . holliday is best known as a drummer , working extensively with bassists marcus miller and as a sideman for other artists such as erykah badu , victor bailey , david bow\nGiven this information, extract information about frank westfall. [/INST]", diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index 70b058b201d6d..644a075b6dddc 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -2,7 +2,6 @@ import asyncio import time from pathlib import Path -from typing import List import pytest from huggingface_hub import snapshot_download @@ -53,8 +52,8 @@ def v1(run_with_both_engines_lora): pass -def get_lora_requests() -> List[LoRARequest]: - lora_requests: List[LoRARequest] = [ +def get_lora_requests() -> list[LoRARequest]: + lora_requests: list[LoRARequest] = [ LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=LORA_MODULE_DOWNLOAD_PATH) @@ -64,7 +63,7 @@ def get_lora_requests() -> List[LoRARequest]: async def requests_processing_time(llm, - lora_requests: List[LoRARequest]) -> float: + lora_requests: list[LoRARequest]) -> float: sampling_params = SamplingParams(n=1, temperature=0.0, @@ -107,7 +106,7 @@ async def test_add_lora(): download_and_prepare_lora_module() - lora_requests: List[LoRARequest] = get_lora_requests() + lora_requests: list[LoRARequest] = get_lora_requests() max_loras = len(set([lr.lora_int_id for lr in lora_requests])) # Create engine in eager-mode. Due to high max_loras, the CI can diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index d39925948048e..9103ba425af18 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -13,7 +11,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -33,7 +31,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index ee09afe86777d..fc0434e7a7e3d 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -21,7 +19,7 @@ ] -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -40,7 +38,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index bbdfbe37175e1..8f07e39d20d3b 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -11,7 +9,7 @@ MODEL_PATH = "google/gemma-7b" -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ "Quote: Imagination is", "Quote: Be yourself;", @@ -24,7 +22,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_jamba.py b/tests/lora/test_jamba.py index c04174665897c..885851880b597 100644 --- a/tests/lora/test_jamba.py +++ b/tests/lora/test_jamba.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import torch @@ -14,7 +12,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, - prompts: List[str]) -> List[str]: + prompts: list[str]) -> list[str]: sampling_params = vllm.SamplingParams(temperature=0, max_tokens=MAX_TOKENS) outputs = llm.generate( @@ -23,7 +21,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. 
- generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 61699e7052c9c..3507d0121212a 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -3,7 +3,7 @@ import random from copy import deepcopy from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Optional from unittest.mock import patch import pytest @@ -66,7 +66,7 @@ def get_random_id_to_index(num_loras: int, num_slots: int, - log: bool = True) -> List[Optional[int]]: + log: bool = True) -> list[Optional[int]]: """Creates a random lora_id_to_index mapping. Args: @@ -81,7 +81,7 @@ def get_random_id_to_index(num_loras: int, f"num_loras is higher than num_slots: {num_loras} > {num_slots}. " "num_loras must be less than or equal to num_slots.") - slots: List[Optional[int]] = [None] * num_slots + slots: list[Optional[int]] = [None] * num_slots random_slot_selections = (torch.randperm(num_slots)[:num_loras]).tolist() for lora_id, slot_idx in enumerate(random_slot_selections, start=1): slots[slot_idx] = lora_id @@ -93,12 +93,12 @@ def get_random_id_to_index(num_loras: int, def populate_loras( - id_to_index: List[Optional[int]], + id_to_index: list[Optional[int]], layer: BaseLayerWithLoRA, layer_weights: torch.Tensor, generate_embeddings_tensor: int = 0, repeats: int = 1, -) -> Tuple[Dict[int, LoRALayerWeights], Dict[int, List[LoRALayerWeights]]]: +) -> tuple[dict[int, LoRALayerWeights], dict[int, list[LoRALayerWeights]]]: """This method populates the lora layers with lora weights. Args: @@ -117,15 +117,15 @@ def populate_loras( # Dictionary that maps the lora ID to the # corresponding lora weights. - lora_dict: Dict[int, LoRALayerWeights] = dict() + lora_dict: dict[int, LoRALayerWeights] = dict() # Dictionary that maps the lora ID to the # corresponding subloras. - sublora_dict: Dict[int, List[LoRALayerWeights]] = dict() + sublora_dict: dict[int, list[LoRALayerWeights]] = dict() for slot_idx, lora_id in enumerate(id_to_index): if lora_id is not None: - subloras: List[LoRALayerWeights] = [] + subloras: list[LoRALayerWeights] = [] sublora_len = layer_weights.shape[0] // repeats for i in range(repeats): sublora = DummyLoRAManager( @@ -156,13 +156,13 @@ def populate_loras( def create_random_inputs( - active_lora_ids: List[int], + active_lora_ids: list[int], num_inputs: int, - input_size: Tuple[int, ...], - input_range: Tuple[float, float], + input_size: tuple[int, ...], + input_range: tuple[float, float], input_type: torch.dtype = torch.int, device: torch.device = "cuda" -) -> Tuple[List[torch.Tensor], List[int], List[int]]: +) -> tuple[list[torch.Tensor], list[int], list[int]]: """Creates random inputs. 
Args: @@ -176,9 +176,9 @@ def create_random_inputs( low, high = input_range - inputs: List[torch.Tensor] = [] - index_mapping: List[int] = [] - prompt_mapping: List[int] = [] + inputs: list[torch.Tensor] = [] + index_mapping: list[int] = [] + prompt_mapping: list[int] = [] for _ in range(num_inputs): if input_type == torch.int: @@ -268,7 +268,7 @@ def create_random_embedding_layer(): lora_result = lora_embedding(torch.cat(inputs)) - expected_results: List[torch.Tensor] = [] + expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = embedding(input_) @@ -408,7 +408,7 @@ def create_random_embedding_layer(): lora_result = lora_embedding(torch.cat(original_inputs)) - expected_results: List[torch.Tensor] = [] + expected_results: list[torch.Tensor] = [] for input_, original_input_, lora_id in zip(inputs, original_inputs, prompt_mapping): lora = lora_dict[lora_id] @@ -538,7 +538,7 @@ def _pretest(): logits_processor.org_vocab_size = (vocab_size + lora_config.lora_extra_vocab_size) - expected_results: List[torch.Tensor] = [] + expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = logits_processor._get_logits(hidden_states=input_, @@ -659,7 +659,7 @@ def create_random_linear_replicated_layer(): lora_result = lora_linear(torch.cat(inputs))[0] - expected_results: List[torch.Tensor] = [] + expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = linear(input_)[0] @@ -784,7 +784,7 @@ def create_random_linear_parallel_layer(): lora_result = lora_linear(torch.cat(inputs))[0] - expected_results: List[torch.Tensor] = [] + expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = linear(input_)[0] @@ -933,7 +933,7 @@ class FakeConfig: lora_result = lora_linear(torch.cat(inputs))[0] - expected_results: List[torch.Tensor] = [] + expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): result = linear(input_)[0] subloras = sublora_dict[lora_id] @@ -1093,9 +1093,9 @@ def test_vocab_parallel_embedding_indices(tp_size, seed): computed_added_vocab_size = 0 vocab_size_padded = -1 - all_org_tokens: List[int] = [] - all_added_tokens: List[int] = [] - token_ids: List[int] = [] + all_org_tokens: list[int] = [] + all_added_tokens: list[int] = [] + token_ids: list[int] = [] for tp_rank in range(tp_size): with patch( diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 564818f23fd24..e84ff30ba9929 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import ray @@ -31,7 +29,7 @@ ] -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? 
[/user] [assistant]", # noqa: E501 @@ -49,7 +47,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index 0a94298c9f779..f577f39ba7843 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import ast -from typing import List, Optional, Tuple +from typing import Optional import numpy as np import pytest @@ -86,7 +86,7 @@ def evaluate_json_response(model_response, golden_response): def generate( llm: vllm.LLM, - inputs: Tuple[str, SamplingParams, Optional[LoRARequest]], + inputs: tuple[str, SamplingParams, Optional[LoRARequest]], ): prompts, sampling_param, lora_request = inputs outputs = llm.generate(prompts, sampling_param, lora_request=lora_request) @@ -95,7 +95,7 @@ def generate( def batched_generate( llm: vllm.LLM, - inputs: List[Tuple[str, SamplingParams, Optional[LoRARequest]]], + inputs: list[tuple[str, SamplingParams, Optional[LoRARequest]]], ): for input in inputs: prompt, sampling_param, lora_req = input @@ -164,7 +164,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): non-batched generation. """ # Create non batched results first to compare against batched results - non_batched_results: List[str] = [] + non_batched_results: list[str] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] @@ -177,7 +177,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): # Create batched results # Each element of the batch must be # (prompt, prompt_sampling_params, prompt_lora_request) - batched_prompts: List[Tuple[str, SamplingParams, + batched_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] @@ -202,7 +202,7 @@ def test_self_consistency(lora_llm, long_context_infos): num_loras = len(long_context_infos) # Create results in order of long_context_infos - batched_prompts: List[Tuple[str, SamplingParams, + batched_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] @@ -251,7 +251,7 @@ def test_quality(lora_llm, long_context_infos): The test is expected to run for about 1 minute on a p4de.24xlarge instance. 
""" - scores: List[float] = [] + scores: list[float] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] for prompt_and_response in prompts_and_responses[context_len]: @@ -284,7 +284,7 @@ def test_max_len(lora_llm, long_context_infos): generate(lora_llm, (bad_prompt, sampling_params, lora_request)) # Also test batched - batched_prompts: List[Tuple[str, SamplingParams, + batched_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]] = [] for lora_id_with_bad_inputs in long_context_infos: for lora_id, info in long_context_infos.items(): diff --git a/tests/lora/test_lora_bias_e2e.py b/tests/lora/test_lora_bias_e2e.py index 3a7b391692cc6..d4245a89dff08 100644 --- a/tests/lora/test_lora_bias_e2e.py +++ b/tests/lora/test_lora_bias_e2e.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -10,7 +8,7 @@ MODEL_PATH = "ibm-granite/granite-3b-code-base" -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 @@ -23,7 +21,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: sampling_params, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: generated_text = output.outputs[0].text generated_texts.append(generated_text) diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index e2c3d20d327fe..02f2339bef01d 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from vllm.lora.models import LoRAModel @@ -31,7 +29,7 @@ def test_load_checkpoints( packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules: List[str] = [] + expected_lora_modules: list[str] = [] for module in BAICHUAN_LORA_MODULES: if module in packed_modules_mapping: expected_lora_modules.extend(packed_modules_mapping[module]) @@ -99,7 +97,7 @@ def test_lora_weights_mapping(baichuan_lora_files): packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules: List[str] = [] + expected_lora_modules: list[str] = [] for module in BAICHUAN_LORA_MODULES: if module in packed_modules_mapping: expected_lora_modules.extend(packed_modules_mapping[module]) diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index 1309848868b47..b279566c00f26 100644 --- a/tests/lora/test_lora_functions.py +++ 
b/tests/lora/test_lora_functions.py @@ -4,7 +4,6 @@ """ import os -from typing import List import pytest @@ -46,7 +45,7 @@ def test_lora_functions_sync(): llm = LLM.get_engine_class().from_engine_args(engine_args) - def run_check(fn, args, expected: List): + def run_check(fn, args, expected: list): fn(args) assert set(llm.list_loras()) == set(expected) @@ -105,7 +104,7 @@ async def test_lora_functions_async(): gpu_memory_utilization=0.8, enforce_eager=True) - async def run_check(fn, args, expected: List): + async def run_check(fn, args, expected: list): await fn(args) assert set(await llm.list_loras()) == set(expected) diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 44d111732d2ae..0875128c4ff1b 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from vllm.lora.models import LoRAModel @@ -23,7 +21,7 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping embedding_modules = LlamaForCausalLM.embedding_modules embed_padding_modules = LlamaForCausalLM.embedding_padding_modules - expected_lora_modules: List[str] = [] + expected_lora_modules: list[str] = [] for module in LLAMA_LORA_MODULES: if module in packed_modules_mapping: expected_lora_modules.extend(packed_modules_mapping[module]) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 7ab46b7ff9c9c..8d25833125950 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Dict, List import pytest import torch @@ -72,9 +71,9 @@ def test_from_lora_tensors(sql_lora_files, device): assert lora.embeddings_tensor is None -def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str], +def create_lora(lora_id: int, model: nn.Module, sub_modules: list[str], device: torch.device) -> LoRAModel: - loras: Dict[str, LoRALayerWeights] = {} + loras: dict[str, LoRALayerWeights] = {} for name in sub_modules: w = model.get_submodule(name).weight loras[name] = LoRALayerWeights( @@ -96,7 +95,7 @@ def create_packed_lora( empty_replaced_module_name=None, ) -> LoRAModel: w = model.get_submodule(module_name).weight - loras: Dict[str, LoRALayerWeights] = {} + loras: dict[str, LoRALayerWeights] = {} for replaced_module_name in replaced_module_names: if replaced_module_name == empty_replaced_module_name: continue diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index 2e81bb3267102..f596651be01e9 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -27,7 +25,7 @@ ] -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: sampling_params = vllm.SamplingParams( temperature=0, max_tokens=5, @@ -48,7 +46,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: if lora_id else None, ) # Print the outputs. 
- generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: generated_text = output.outputs[0].text.strip() generated_texts.append(generated_text) diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 90cf8fd39a181..caa65f2dc635d 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import torch @@ -13,7 +11,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, - prompts: List[str]) -> List[str]: + prompts: list[str]) -> list[str]: sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) outputs = llm.generate( @@ -22,7 +20,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 8999e0cf31906..8596d3999799c 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -12,7 +10,7 @@ PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format( sql_prompt= @@ -41,7 +39,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: if lora_id else None, ) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index 032e20470bcd3..c75e866172e16 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 from threading import Lock -from typing import List import pytest import torch @@ -20,7 +19,7 @@ # Utility shrink and expand operations used as reference implementations. 
def sgmv_shrink_for_nslices( nslices: int, inputs_tensor: torch.Tensor, - lora_weights_lst: List[torch.Tensor], out_tensor: torch.Tensor, + lora_weights_lst: list[torch.Tensor], out_tensor: torch.Tensor, b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int, num_tokens: int, scaling: float): @@ -44,7 +43,7 @@ def sgmv_shrink_for_nslices( def sgmv_expand_for_nslices(nslices: int, hidden_size: int, inputs_tensor: torch.Tensor, - lora_weights_lst: List[torch.Tensor], + lora_weights_lst: list[torch.Tensor], out_tensor: torch.Tensor, b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 7f687f563eb8e..b4f3d8dc478ad 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -3,7 +3,6 @@ # Adapted from # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py from dataclasses import dataclass -from typing import List import pytest @@ -19,7 +18,7 @@ class ModelWithQuantization: quantization: str -MODELS: List[ModelWithQuantization] +MODELS: list[ModelWithQuantization] #AWQ quantization is currently not supported in ROCm. if current_platform.is_rocm(): MODELS = [ @@ -41,7 +40,7 @@ class ModelWithQuantization: def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, - max_tokens: int = 256) -> List[str]: + max_tokens: int = 256) -> list[str]: raw_prompts = [ "Give me an orange-ish brown color", "Give me a neon pink color", @@ -61,7 +60,7 @@ def format_prompt_tuples(prompt): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 1cf1534e40367..24eff013e2044 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List, Optional +from typing import Optional import pytest from packaging.version import Version @@ -20,7 +20,7 @@ class TestConfig: max_loras: int = 2 max_lora_rank: int = 16 max_model_len: int = 4096 - mm_processor_kwargs: Optional[Dict[str, int]] = None + mm_processor_kwargs: Optional[dict[str, int]] = None def __post_init__(self): if self.mm_processor_kwargs is None: @@ -57,11 +57,11 @@ def _initialize_llm(self) -> vllm.LLM: ) def run_test(self, - images: List[ImageAsset], - expected_outputs: List[str], + images: list[ImageAsset], + expected_outputs: list[str], lora_id: Optional[int] = None, temperature: float = 0, - max_tokens: int = 5) -> List[str]: + max_tokens: int = 5) -> list[str]: sampling_params = vllm.SamplingParams( temperature=temperature, diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py index 703f92ce8b6bc..6d2833bd125f3 100644 --- a/tests/lora/test_ultravox.py +++ b/tests/lora/test_ultravox.py @@ -3,7 +3,6 @@ import shutil from os import path from tempfile import TemporaryDirectory -from typing import List, Tuple import torch from huggingface_hub import snapshot_download @@ -86,8 +85,8 @@ def test_ultravox_lora(vllm_runner): dtype="bfloat16", max_model_len=1024, ) as vllm_model: - ultravox_outputs: List[Tuple[ - List[int], str]] = vllm_model.generate_greedy( + ultravox_outputs: list[tuple[ + list[int], str]] =
vllm_model.generate_greedy( [ _get_prompt(0, PROMPT, VLLM_PLACEHOLDER, ULTRAVOX_MODEL_NAME) @@ -108,7 +107,7 @@ def test_ultravox_lora(vllm_runner): dtype="bfloat16", max_model_len=1024, ) as vllm_model: - llama_outputs: List[Tuple[List[int], str]] = ( + llama_outputs: list[tuple[list[int], str]] = ( vllm_model.generate_greedy( [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)], 256, diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 1e163fbf97ce3..59a0e7420fc25 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Union +from typing import Optional, Union import torch @@ -12,7 +12,7 @@ class DummyLoRAManager: def __init__(self, device: torch.device = "cuda:0"): super().__init__() - self._loras: Dict[str, LoRALayerWeights] = {} + self._loras: dict[str, LoRALayerWeights] = {} self._device = device def set_module_lora(self, module_name: str, lora: LoRALayerWeights): @@ -77,11 +77,11 @@ def init_packed_lora( self, module_name: str, input_dim: int, - output_dims: List[int], - noop_lora_index: Optional[List[int]] = None, + output_dims: list[int], + noop_lora_index: Optional[list[int]] = None, rank: int = 8, ): - base_loras: List[LoRALayerWeights] = [] + base_loras: list[LoRALayerWeights] = [] noop_lora_index_set = set(noop_lora_index or []) for i, out_dim in enumerate(output_dims): @@ -110,7 +110,7 @@ def assert_close(a, b): @dataclass class PunicaTensors: inputs_tensor: torch.Tensor - lora_weights: Union[torch.Tensor, List[torch.Tensor]] + lora_weights: Union[torch.Tensor, list[torch.Tensor]] our_out_tensor: torch.Tensor ref_out_tensor: torch.Tensor b_seq_start_loc: torch.Tensor @@ -118,7 +118,7 @@ class PunicaTensors: seq_len_tensor: torch.Tensor token_lora_mapping: torch.Tensor - def meta(self) -> Tuple[int, int]: + def meta(self) -> tuple[int, int]: """ Infer max_seq_length and token_nums from the tensors and return them. 
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index b276d9d9cb4e7..e23ff43ebd7f8 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import time -from typing import List import pytest import ray @@ -133,7 +132,7 @@ def test_metric_counter_generation_tokens_multi_step( "served_model_name", [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]]) def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, - served_model_name: List[str]) -> None: + served_model_name: list[str]) -> None: with vllm_runner(model, dtype=dtype, disable_log_stats=False, diff --git a/tests/mistral_tool_use/utils.py b/tests/mistral_tool_use/utils.py index 971ed55ca3c02..1d809a05e89d1 100644 --- a/tests/mistral_tool_use/utils.py +++ b/tests/mistral_tool_use/utils.py @@ -1,21 +1,21 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional +from typing import Optional from typing_extensions import TypedDict class ServerConfig(TypedDict, total=False): model: str - arguments: List[str] + arguments: list[str] system_prompt: Optional[str] supports_parallel: Optional[bool] supports_rocm: Optional[bool] -ARGS: List[str] = ["--max-model-len", "1024"] +ARGS: list[str] = ["--max-model-len", "1024"] -CONFIGS: Dict[str, ServerConfig] = { +CONFIGS: dict[str, ServerConfig] = { "mistral": { "model": "mistralai/Mistral-7B-Instruct-v0.3", diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 2c67808485670..4a6a766b8ca0b 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config @@ -51,7 +49,7 @@ class Relu3(ReLUSquaredActivation): # All but RMSNorm ("all,-rms_norm", 4, [0, 1, 1, 1], True), ]) -def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int], +def test_enabled_ops(env: str, torch_level: int, ops_enabled: list[int], default_on: bool): vllm_config = VllmConfig(compilation_config=CompilationConfig( level=torch_level, custom_ops=env.split(","))) diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index 0ea17247028f5..13433b042258c 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple, Type +from typing import Optional import numpy as np import pytest @@ -17,7 +17,7 @@ MODEL_NAME = "fixie-ai/ultravox-v0_4" -AudioTuple = Tuple[np.ndarray, int] +AudioTuple = tuple[np.ndarray, int] VLLM_PLACEHOLDER = "<|audio|>" HF_PLACEHOLDER = "<|audio|>" @@ -78,7 +78,7 @@ def _get_prompt(audio_count, question, placeholder): add_generation_prompt=True) -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, +def vllm_to_hf_output(vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str): """Sanitize vllm output to be comparable with hf output.""" @@ -96,9 +96,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - prompts_and_audios: List[Tuple[str, str, AudioTuple]], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + 
prompts_and_audios: list[tuple[str, str, AudioTuple]], model: str, *, dtype: str, @@ -158,8 +158,8 @@ def process(hf_inputs: BatchEncoding, **kwargs): def run_multi_audio_test( - vllm_runner: Type[VllmRunner], - prompts_and_audios: List[Tuple[str, List[AudioTuple]]], + vllm_runner: type[VllmRunner], + prompts_and_audios: list[tuple[str, list[AudioTuple]]], model: str, *, dtype: str, diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 57fe1d5b1515b..804df4c4903e6 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -5,7 +5,7 @@ """ import os -from typing import List, NamedTuple, Type +from typing import NamedTuple import pytest from huggingface_hub import hf_hub_download @@ -90,8 +90,8 @@ def gguf_model(self): @pytest.mark.parametrize("tp_size", [1, 2]) def test_models( num_gpus_available: int, - vllm_runner: Type[VllmRunner], - example_prompts: List[str], + vllm_runner: type[VllmRunner], + example_prompts: list[str], model: GGUFTestConfig, dtype: str, max_tokens: int, diff --git a/tests/models/decoder_only/language/test_modelopt.py b/tests/models/decoder_only/language/test_modelopt.py index 66dd979579c42..a997b9e664054 100644 --- a/tests/models/decoder_only/language/test_modelopt.py +++ b/tests/models/decoder_only/language/test_modelopt.py @@ -5,7 +5,6 @@ Note: these tests will only pass on H100 """ import os -from typing import List import pytest from transformers import AutoTokenizer @@ -65,7 +64,7 @@ def test_models(example_prompts, model_name) -> None: for prompt in example_prompts ] params = SamplingParams(max_tokens=20, temperature=0) - generations: List[str] = [] + generations: list[str] = [] # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. 
for prompt in formatted_prompts: diff --git a/tests/models/decoder_only/vision_language/test_awq.py b/tests/models/decoder_only/vision_language/test_awq.py index 31a5cd260a1d9..f4a6dd0f101fd 100644 --- a/tests/models/decoder_only/vision_language/test_awq.py +++ b/tests/models/decoder_only/vision_language/test_awq.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Type +from typing import Optional import pytest import torch @@ -19,12 +19,12 @@ def run_awq_test( - vllm_runner: Type[VllmRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets, source_model: str, quant_model: str, *, - size_factors: List[float], + size_factors: list[float], dtype: str, max_tokens: int, num_logprobs: int, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 2c66edb539dce..3f7a7c01aebc4 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -6,7 +6,6 @@ import os from collections import defaultdict from pathlib import PosixPath -from typing import Type import pytest from packaging.version import Version @@ -562,8 +561,8 @@ def _mark_splits( )) def test_single_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_single_image_test( @@ -585,8 +584,8 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, )) def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_multi_image_test( @@ -608,8 +607,8 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, )) def test_image_embedding_models(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_embedding_test( @@ -629,7 +628,7 @@ def test_image_embedding_models(model_type: str, fork_new_process_for_each_test=False, )) def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], video_assets: _VideoAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_video_test( @@ -651,8 +650,8 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, def test_custom_inputs_models( model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], ): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_custom_inputs_test( @@ -674,8 +673,8 @@ def test_custom_inputs_models( @fork_new_process_for_each_test def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): model_test_info 
= VLM_TEST_SETTINGS[model_type] runners.run_single_image_test( @@ -698,8 +697,8 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, @fork_new_process_for_each_test def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_multi_image_test( @@ -722,8 +721,8 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, @fork_new_process_for_each_test def test_image_embedding_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_embedding_test( @@ -743,8 +742,8 @@ def test_image_embedding_models_heavy(model_type: str, fork_new_process_for_each_test=True, )) def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], video_assets: _VideoAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_video_test( @@ -767,8 +766,8 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, def test_custom_inputs_models_heavy( model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], ): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_custom_inputs_test( diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index dd68fe4cd55e7..53b183b2735e1 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -2,7 +2,7 @@ import os import re -from typing import List, Optional, Tuple, Type +from typing import Optional import pytest from transformers import AutoTokenizer @@ -25,7 +25,7 @@ models = ["microsoft/Phi-3.5-vision-instruct"] -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, +def vllm_to_hf_output(vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str): """Sanitize vllm output to be comparable with hf output.""" @@ -55,9 +55,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + inputs: list[tuple[list[str], PromptImageInput]], model: str, *, dtype: str, diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 602da2b5f4ee5..d51dabc23346d 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -6,7 +6,7 @@ import json import uuid from dataclasses import asdict -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Optional import pytest from mistral_common.multimodal import download_image @@ -38,7 +38,7 @@ PROMPT = "Describe each image in one short sentence." 
-def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]: +def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]: return [{ "role": "user", @@ -54,7 +54,7 @@ def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]: }] -def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]: +def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]: return [{ "role": "user", @@ -68,7 +68,7 @@ def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]: }] -def _create_engine_inputs(urls: List[str]) -> TokensPrompt: +def _create_engine_inputs(urls: list[str]) -> TokensPrompt: msg = _create_msg_format(urls) tokenizer = MistralTokenizer.from_model("pixtral") @@ -89,7 +89,7 @@ def _create_engine_inputs(urls: List[str]) -> TokensPrompt: return engine_inputs -def _create_engine_inputs_hf(urls: List[str]) -> TextPrompt: +def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt: msg = _create_msg_format_hf(urls) tokenizer = AutoProcessor.from_pretrained("mistral-community/pixtral-12b") @@ -128,7 +128,7 @@ def _create_engine_inputs_hf(urls: List[str]) -> TextPrompt: FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json" FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json" -OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]] +OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]] # For the test author to store golden output in JSON diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index de240a904e477..af494eb2e62bf 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, List, Optional, Tuple, Type, TypedDict, Union +from typing import Any, Optional, TypedDict, Union import numpy.typing as npt import pytest @@ -69,21 +69,21 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): def batch_make_image_embeddings( - image_batches: List[Union[Image.Image, List[Image.Image]]], processor, - llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]: + image_batches: list[Union[Image.Image, list[Image.Image]]], processor, + llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]: """batched image embeddings for Qwen2-VL This will infer all images' embeddings in a single batch, and split the result according to input batches. 
image_batches: - - Single-image batches: `List[Image.Image]` - - Multiple-image batches: `List[List[Image.Image]]]` + - Single-image batches: `list[Image.Image]` + - Multiple-image batches: `list[list[Image.Image]]]` - returns: `List[Qwen2VLPromptImageEmbeddingInput]` + returns: `list[Qwen2VLPromptImageEmbeddingInput]` """ - image_batches_: List[Any] = image_batches[:] + image_batches_: list[Any] = image_batches[:] # convert single-image batches to multiple-image batches for idx in range(len(image_batches_)): @@ -93,7 +93,7 @@ def batch_make_image_embeddings( assert isinstance(image_batches_[idx], list) # append all images into a list (as a batch) - images: List[Image.Image] = [] + images: list[Image.Image] = [] for image_batch in image_batches_: images += image_batch @@ -121,7 +121,7 @@ def get_image_embeds(model): image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches - result: List[Qwen2VLPromptImageEmbeddingInput] = [] + result: list[Qwen2VLPromptImageEmbeddingInput] = [] image_counter = 0 embed_counter = 0 for image_batch in image_batches_: @@ -153,7 +153,7 @@ def get_image_embeds(model): def batch_make_video_embeddings( video_batches: PromptVideoInput, processor, - llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]: + llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]: """batched video embeddings for Qwen2-VL A NDArray represents a single video's all frames. @@ -162,21 +162,21 @@ def batch_make_video_embeddings( and split the result according to input batches. video_batches: - - Single-video batches: `List[NDArray]` - - Multiple-video batches: `List[List[NDArray]]` + - Single-video batches: `list[NDArray]` + - Multiple-video batches: `list[list[NDArray]]` """ - video_batches_: List[Any] = video_batches[:] + video_batches_: list[Any] = video_batches[:] for idx in range(len(video_batches_)): if not isinstance(video_batches_[idx], list): - single_video_batch: List[npt.NDArray] = [video_batches_[idx]] + single_video_batch: list[npt.NDArray] = [video_batches_[idx]] video_batches_[idx] = single_video_batch assert isinstance(video_batches_[idx], list) # append all videos into a list (as a batch) - videos: List[npt.NDArray] = [] + videos: list[npt.NDArray] = [] for video_batch in video_batches_: videos += video_batch @@ -204,7 +204,7 @@ def get_image_embeds(model): video_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches - result: List[Qwen2VLPromptVideoEmbeddingInput] = [] + result: list[Qwen2VLPromptVideoEmbeddingInput] = [] video_counter = 0 embed_counter = 0 for video_batch in video_batches_: @@ -235,8 +235,8 @@ def get_image_embeds(model): def run_embedding_input_test( - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], + vllm_runner: type[VllmRunner], + inputs: list[tuple[list[str], PromptImageInput, PromptVideoInput]], model: str, *, dtype: str, @@ -323,8 +323,8 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, num_logprobs: int) -> None: images = [asset.pil_image for asset in image_assets] - inputs_per_case: List[Tuple[ - List[str], PromptImageInput, PromptVideoInput]] = [( + inputs_per_case: list[tuple[ + list[str], PromptImageInput, PromptVideoInput]] = [( [prompt for _ in size_factors], [rescale_image_size(image, factor) for factor in size_factors], [], @@ -365,7 +365,7 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets, num_logprobs: int) -> None: images = [asset.pil_image for asset in 
image_assets] - inputs_per_case: List[Tuple[List[str], PromptImageInput, + inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [( [MULTIIMAGE_PROMPT for _ in size_factors], [[ @@ -413,8 +413,8 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, for asset in video_assets ] - inputs_per_case: List[Tuple[ - List[str], PromptImageInput, PromptVideoInput]] = [( + inputs_per_case: list[tuple[ + list[str], PromptImageInput, PromptVideoInput]] = [( [prompt for _ in size_factors], [], [rescale_video_size(video, factor) for factor in size_factors], diff --git a/tests/models/decoder_only/vision_language/vlm_utils/builders.py b/tests/models/decoder_only/vision_language/vlm_utils/builders.py index 539410d18950a..bf5f87ebf9847 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 """Helpers for building inputs that can be leveraged for different test types. """ +from collections.abc import Iterable from pathlib import PosixPath -from typing import Callable, Iterable, List, Optional, Tuple, Union +from typing import Callable, Optional, Union import torch @@ -33,7 +34,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int], def get_model_prompts(base_prompts: Iterable[str], img_idx_to_prompt: Optional[Callable[[int], str]], video_idx_to_prompt: Optional[Callable[[int], str]], - prompt_formatter: Callable[[str], str]) -> List[str]: + prompt_formatter: Callable[[str], str]) -> list[str]: """Given a model-agnostic base prompt and test configuration for a model(s) to be tested, update the media placeholders and apply the prompt formatting to get the test prompt string for this model. @@ -218,7 +219,7 @@ def build_video_inputs_from_test_info( ) for video, prompt in zip(sampled_vids, model_prompts)] -def apply_image_size_scaling(image, size: Union[float, Tuple[int, int]], +def apply_image_size_scaling(image, size: Union[float, tuple[int, int]], size_type: SizeType): """Applies a size scaler to one image; this can be a an image size factor, which scales the image while maintaining the aspect ratio""" diff --git a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py index ca4ec21411825..c189e5a761fc3 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py @@ -5,7 +5,7 @@ """ import itertools from collections import OrderedDict -from typing import Dict, Iterable, Tuple +from collections.abc import Iterable import pytest @@ -13,9 +13,9 @@ ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType) -def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo], +def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo], test_type: VLMTestType, - fork_per_test: bool) -> Dict[str, VLMTestInfo]: + fork_per_test: bool) -> dict[str, VLMTestInfo]: """Given the dict of potential test settings to run, return a subdict of tests who have the current test type enabled with the matching val for fork_per_test. 
@@ -49,7 +49,7 @@ def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType): return matching_tests -def get_parametrized_options(test_settings: Dict[str, VLMTestInfo], +def get_parametrized_options(test_settings: dict[str, VLMTestInfo], test_type: VLMTestType, fork_new_process_for_each_test: bool): """Converts all of our VLMTestInfo into an expanded list of parameters. @@ -121,7 +121,7 @@ def get_model_type_cases(model_type: str, test_info: VLMTestInfo): def get_wrapped_test_sizes( test_info: VLMTestInfo, - test_type: VLMTestType) -> Tuple[ImageSizeWrapper, ...]: + test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]: """Given a test info which may have size factors or fixed sizes, wrap them and combine them into an iterable, each of which will be used in parameter expansion. diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index f2260f56737d9..aaad584c9cd51 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Core test implementation to be shared across modalities.""" -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Callable, Optional, Union import torch from PIL.Image import Image @@ -17,9 +17,9 @@ def run_test( *, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + inputs: list[tuple[list[str], list[Union[list[Image], Image]]]], model: str, dtype: str, max_tokens: int, @@ -29,15 +29,15 @@ def run_test( max_num_seqs: int, hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]], vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]], - auto_cls: Type[_BaseAutoModelClass], + auto_cls: type[_BaseAutoModelClass], use_tokenizer_eos: bool, postprocess_inputs: Callable[[BatchEncoding], BatchEncoding], comparator: Callable[..., None], get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]], - stop_str: Optional[List[str]], - limit_mm_per_prompt: Dict[str, int], - vllm_runner_kwargs: Optional[Dict[str, Any]], - hf_model_kwargs: Optional[Dict[str, Any]], + stop_str: Optional[list[str]], + limit_mm_per_prompt: dict[str, int], + vllm_runner_kwargs: Optional[dict[str, Any]], + hf_model_kwargs: Optional[dict[str, Any]], patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], task: TaskOption = "auto", runner_mm_key: str = "images", @@ -61,7 +61,7 @@ def run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). 
- vllm_runner_kwargs_: Dict[str, Any] = {} + vllm_runner_kwargs_: dict[str, Any] = {} if model_info.tokenizer: vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer if model_info.tokenizer_mode: @@ -84,7 +84,7 @@ def run_test( **vllm_runner_kwargs_) as vllm_model: tokenizer = vllm_model.model.get_tokenizer() - vllm_kwargs: Dict[str, Any] = {} + vllm_kwargs: dict[str, Any] = {} if get_stop_token_ids is not None: vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer) if stop_str: diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 408ce9cfeadab..66410f66ca0d6 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -6,7 +6,7 @@ import re import types from pathlib import PosixPath -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, Optional, Union import torch from PIL.Image import Image @@ -49,7 +49,7 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, def qwen_vllm_to_hf_output( vllm_output: RunnerOutput, - model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]: + model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]: """Sanitize vllm output [qwen models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -60,7 +60,7 @@ def qwen_vllm_to_hf_output( def qwen2_vllm_to_hf_output( vllm_output: RunnerOutput, - model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]: + model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]: """Sanitize vllm output [qwen2 models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -78,7 +78,7 @@ def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput, def llava_video_vllm_to_hf_output( vllm_output: RunnerOutput, - model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]: + model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]: config = AutoConfig.from_pretrained(model) mm_token_id = config.video_token_index return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id) @@ -247,7 +247,7 @@ def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str): ####### Prompt path encoders for models that need models on disk def qwen_prompt_path_encoder( - tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset], + tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], _ImageAssets]) -> str: """Given a temporary dir path, export one or more image assets into the tempdir & replace its contents with the local path to the string so that @@ -257,7 +257,7 @@ def qwen_prompt_path_encoder( Args: tmp_path: Tempdir for test under consideration. prompt: Prompt with image placeholders. - assets: List of image assets whose len equals the num placeholders. + assets: list of image assets whose len equals the num placeholders. """ # Ensure that the number of placeholders matches the number of assets; # If this is not true, the test is probably written incorrectly. 
@@ -350,7 +350,7 @@ def __init__(self, hf_runner: HfRunner): self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Union[Image, List[Image]], + def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs): # yapf: disable from vllm.model_executor.models.h2ovl import ( @@ -410,7 +410,7 @@ def __init__(self, hf_runner: HfRunner): self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Union[Image, List[Image]], + def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs): from vllm.model_executor.models.internvl import ( IMG_CONTEXT, IMG_END, IMG_START, diff --git a/tests/models/decoder_only/vision_language/vlm_utils/runners.py b/tests/models/decoder_only/vision_language/vlm_utils/runners.py index fb9df37cad92a..023df5f161880 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/runners.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py @@ -3,7 +3,6 @@ types / modalities. """ from pathlib import PosixPath -from typing import Type from .....conftest import HfRunner, VllmRunner, _ImageAssets, _VideoAssets from . import builders, core @@ -13,8 +12,8 @@ ####### Entrypoints for running different test types def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): assert test_case.size_wrapper is not None inputs = builders.build_single_image_inputs_from_test_info( @@ -36,8 +35,8 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): assert test_case.size_wrapper is not None inputs = builders.build_multi_image_inputs_from_test_info( @@ -59,8 +58,8 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, def run_embedding_test(*, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): assert test_case.size_wrapper is not None inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info( @@ -85,8 +84,8 @@ def run_video_test( *, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], video_assets: _VideoAssets, ): assert test_case.size_wrapper is not None @@ -111,8 +110,8 @@ def run_video_test( def run_custom_inputs_test(*, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner]): + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner]): # Custom test cases can provide inputs directly, but they need to # explicitly provided a CustomTestConfig, which wraps the inputs and # the limit_mm_per_prompt diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py index ecb86609c527b..bdbdbc7ec267c 100644 --- 
a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 """Types for writing multimodal model tests.""" +from collections.abc import Iterable from enum import Enum from pathlib import PosixPath -from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional, - Tuple, Type, Union) +from typing import Any, Callable, NamedTuple, Optional, Union import torch from PIL.Image import Image @@ -35,7 +35,7 @@ IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)] EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)] -RunnerOutput = Tuple[List[int], str, Optional[SampleLogprobs]] +RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]] # yapf: enable @@ -53,8 +53,8 @@ class SizeType(Enum): class CustomTestOptions(NamedTuple): - inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]] - limit_mm_per_prompt: Dict[str, int] + inputs: list[tuple[list[str], list[Union[list[Image], Image]]]] + limit_mm_per_prompt: dict[str, int] # kwarg to pass multimodal data in as to vllm/hf runner instances. runner_mm_key: str = "images" @@ -63,13 +63,13 @@ class ImageSizeWrapper(NamedTuple): type: SizeType # A size factor is a wrapper of 0+ floats, # while a fixed size contains an iterable of integer pairs - data: Union[Iterable[float], Iterable[Tuple[int, int]]] + data: Union[Iterable[float], Iterable[tuple[int, int]]] class VLMTestInfo(NamedTuple): """Holds the configuration for 1+ tests for one model architecture.""" - models: List[str] + models: list[str] test_type: Union[VLMTestType, Iterable[VLMTestType]] # Should be None only if this is a CUSTOM_INPUTS test @@ -97,19 +97,19 @@ class VLMTestInfo(NamedTuple): max_num_seqs: int = 256 task: TaskOption = "auto" tensor_parallel_size: int = 1 - vllm_runner_kwargs: Optional[Dict[str, Any]] = None + vllm_runner_kwargs: Optional[dict[str, Any]] = None # Optional callable which gets a list of token IDs from the model tokenizer get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]] = None # Optional list of strings to stop generation, useful when stop tokens are # not special tokens in the tokenizer - stop_str: Optional[List[str]] = None + stop_str: Optional[list[str]] = None # Exposed options for HF runner - hf_model_kwargs: Optional[Dict[str, Any]] = None + hf_model_kwargs: Optional[dict[str, Any]] = None # Indicates we should explicitly pass the EOS from the tokenizer use_tokenizer_eos: bool = False - auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM + auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM # Callable to pass to the HF runner to run on inputs; for now, we also pass # the data type to input post processing, because almost all of the uses of # postprocess_inputs are to fix the data types of BatchEncoding values. 
@@ -128,12 +128,12 @@ class VLMTestInfo(NamedTuple): # Default expandable params per test; these defaults can be overridden in # instances of this object; the complete set of test cases for the model # is all combinations of .models + all fields below - max_tokens: Union[int, Tuple[int]] = 128 - num_logprobs: Union[int, Tuple[int]] = 5 + max_tokens: Union[int, tuple[int]] = 128 + num_logprobs: Union[int, tuple[int]] = 5 dtype: Union[str, Iterable[str]] = "half" distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None # Only expanded in video tests - num_video_frames: Union[int, Tuple[int]] = 16 + num_video_frames: Union[int, tuple[int]] = 16 # Fixed image sizes / image size factors; most tests use image_size_factors # The values provided for these two fields will be stacked and expanded @@ -141,19 +141,19 @@ class VLMTestInfo(NamedTuple): # once per tests (much like concatenating and wrapping in one parametrize # call) image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS - image_sizes: Optional[Iterable[Iterable[Tuple[int, int]]]] = None + image_sizes: Optional[Iterable[Iterable[tuple[int, int]]]] = None # Hack for updating a prompt to take into a local path; currently only used # for Qwen-VL, which requires encoding the image path / url into the prompt # for HF runner prompt_path_encoder: Optional[ - Callable[[PosixPath, str, Union[List[ImageAsset], _ImageAssets]], + Callable[[PosixPath, str, Union[list[ImageAsset], _ImageAssets]], str]] = None # noqa: E501 # Allows configuring a test to run with custom inputs - custom_test_opts: Optional[List[CustomTestOptions]] = None + custom_test_opts: Optional[list[CustomTestOptions]] = None - marks: Optional[List[MarkDecorator]] = None + marks: Optional[list[MarkDecorator]] = None def get_non_parametrized_runner_kwargs(self): """Returns a dictionary of expandable kwargs for items that are used diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/embedding/language/test_gritlm.py index 7ed2fb8a6358c..470dc04107764 100644 --- a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/embedding/language/test_gritlm.py @@ -3,7 +3,6 @@ import importlib.util import math from array import array -from typing import List import openai import pytest @@ -81,14 +80,14 @@ async def client_generate(server_generate: RemoteOpenAIServer): yield async_client -def run_llm_encode(llm: vllm.LLM, queries: List[str], - instruction: str) -> List[float]: +def run_llm_encode(llm: vllm.LLM, queries: list[str], + instruction: str) -> list[float]: outputs = llm.encode([instruction + q for q in queries], ) return [output.outputs.embedding for output in outputs] -async def run_client_embeddings(client: vllm.LLM, queries: List[str], - instruction: str) -> List[float]: +async def run_client_embeddings(client: vllm.LLM, queries: list[str], + instruction: str) -> list[float]: outputs = await client.embeddings.create( model=MODEL_NAME, input=[instruction + q for q in queries], @@ -123,7 +122,7 @@ def get_test_data(): return queries, q_instruction, documents, d_instruction -def validate_embed_output(q_rep: List[float], d_rep: List[float]): +def validate_embed_output(q_rep: list[float], d_rep: list[float]): cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001) diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index 567aa50984937..bef85eaf372f1 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -1,6 +1,6 @@ # 
SPDX-License-Identifier: Apache-2.0 -from typing import List, Sequence +from collections.abc import Sequence import torch import torch.nn.functional as F @@ -8,8 +8,8 @@ def check_embeddings_close( *, - embeddings_0_lst: Sequence[List[float]], - embeddings_1_lst: Sequence[List[float]], + embeddings_0_lst: Sequence[list[float]], + embeddings_1_lst: Sequence[list[float]], name_0: str, name_1: str, tol: float = 1e-3, diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py index 82f2bf53122af..7391df6e1c306 100644 --- a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py +++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from functools import partial -from typing import Callable, Dict, List, Type +from typing import Callable import pytest import torch @@ -67,7 +67,7 @@ def get_messages(image: Image.Image, text: str, embed_text: bool): def apply_chat_template_and_add_eos( - messages: List[Dict], + messages: list[dict], apply_chat_template_fn: Callable, ): prompt = apply_chat_template_fn( @@ -80,11 +80,11 @@ def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs): def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - input_texts: List[str], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + input_texts: list[str], input_images: PromptImageInput, - embed_texts: List[bool], + embed_texts: list[bool], model: str, *, dtype: str, diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 990c6c150fcdc..4c2fbd526ed1e 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Type - import pytest import torch.nn.functional as F from transformers import AutoModelForVision2Seq @@ -35,9 +33,9 @@ def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - input_texts: List[str], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + input_texts: list[str], input_images: PromptImageInput, model: str, *, diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index 0cb9487460425..3226138a28b9b 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Type - import pytest import torch.nn.functional as F @@ -29,9 +27,9 @@ def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - input_texts: List[str], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + input_texts: list[str], input_images: PromptImageInput, model: str, *, diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py index 81b629fdcf1f7..e8070d28befa6 100644 --- a/tests/models/encoder_decoder/language/test_bart.py +++ b/tests/models/encoder_decoder/language/test_bart.py @@ -3,7 +3,7 @@ Run `pytest tests/models/encoder_decoder/language/test_bart.py`. 
""" -from typing import List, Optional, Tuple, Type +from typing import Optional import pytest from transformers import AutoModelForSeq2SeqLM @@ -17,7 +17,7 @@ def vllm_to_hf_output( - vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], + vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], decoder_prompt_type: DecoderPromptType, ): """Sanitize vllm output to be comparable with hf output.""" @@ -31,9 +31,9 @@ def vllm_to_hf_output( def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + prompts: list[ExplicitEncoderDecoderPrompt[str, str]], decoder_prompt_type: DecoderPromptType, model: str, *, diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/encoder_decoder/vision_language/test_florence2.py index de18deab11f68..a6ec333e2e9b4 100644 --- a/tests/models/encoder_decoder/vision_language/test_florence2.py +++ b/tests/models/encoder_decoder/vision_language/test_florence2.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Type +from typing import Optional import pytest from PIL import Image @@ -51,8 +51,8 @@ def hf_to_vllm_output(hf_output: tuple[list[int], str, def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], inputs: list[list[ExplicitEncoderDecoderPrompt]], model: str, *, @@ -114,7 +114,7 @@ def run_test( @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], +def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], image_assets: _ImageAssets, model: str, size_factors: list[int], dtype: str, max_tokens: int, num_logprobs: int) -> None: diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 202516f4c2097..45b938f3e7108 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple, Type, overload +from typing import Optional, overload import pytest import torch @@ -64,7 +64,7 @@ } -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, +def vllm_to_hf_output(vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str): """Sanitize vllm output to be comparable with hf output.""" @@ -91,9 +91,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def _get_inputs( image_assets: _ImageAssets, *, - size_factors: Optional[List[float]] = None, - sizes: Optional[List[Tuple[int, int]]] = None, -) -> List[Tuple[List[str], PromptImageInput]]: + size_factors: Optional[list[float]] = None, + sizes: Optional[list[tuple[int, int]]] = None, +) -> list[tuple[list[str], PromptImageInput]]: images = [asset.pil_image for asset in image_assets] if size_factors is not None: @@ -123,12 +123,12 @@ def _get_inputs( @overload def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets, model: str, *, - size_factors: List[float], + size_factors: list[float], dtype: str, max_tokens: int, num_logprobs: int, @@ -140,12 +140,12 @@ def run_test( @overload def 
run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets, model: str, *, - sizes: List[Tuple[int, int]], + sizes: list[tuple[int, int]], dtype: str, max_tokens: int, num_logprobs: int, @@ -156,13 +156,13 @@ def run_test( def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets, model: str, *, - size_factors: Optional[List[float]] = None, - sizes: Optional[List[Tuple[int, int]]] = None, + size_factors: Optional[list[float]] = None, + sizes: Optional[list[tuple[int, int]]] = None, dtype: str, max_tokens: int, num_logprobs: int, @@ -183,9 +183,9 @@ def run_test( def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + inputs: list[tuple[list[str], PromptImageInput]], model: str, *, dtype: str, diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 5c43e4eed7878..84471c92a293f 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for H2OVL's multimodal preprocessing kwargs.""" -from typing import Mapping, Optional +from collections.abc import Mapping +from typing import Optional import pytest from PIL import Image diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index cc777fdf57b3c..adbc4f5b5586b 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for InternVL's multimodal preprocessing kwargs.""" -from typing import Mapping, Optional +from collections.abc import Mapping +from typing import Optional import pytest from PIL import Image diff --git a/tests/models/registry.py b/tests/models/registry.py index 95bda0293498d..cf25479984448 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Mapping, Set from dataclasses import dataclass, field -from typing import AbstractSet, Any, Literal, Mapping, Optional +from typing import Any, Literal, Optional import pytest from packaging.version import Version @@ -324,7 +325,7 @@ def __init__(self, hf_models: Mapping[str, _HfExamplesInfo]) -> None: self.hf_models = hf_models - def get_supported_archs(self) -> AbstractSet[str]: + def get_supported_archs(self) -> Set[str]: return self.hf_models.keys() def get_hf_info(self, model_arch: str) -> _HfExamplesInfo: diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 31e3c1f7b987f..243cb92ae2569 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -4,7 +4,6 @@ Run `pytest tests/models/test_transformers.py`. 
""" from contextlib import nullcontext -from typing import Type import pytest @@ -14,8 +13,8 @@ def check_implementation( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], example_prompts: list[str], model: str, **kwargs, @@ -47,8 +46,8 @@ def check_implementation( ("ArthurZ/Ilama-3.2-1B", "auto"), # CUSTOM CODE ]) # trust_remote_code=True by default def test_models( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], example_prompts: list[str], model: str, model_impl: str, @@ -71,8 +70,8 @@ def test_models( @multi_gpu_test(num_gpus=2) def test_distributed( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], example_prompts, ): kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2} @@ -92,7 +91,7 @@ def test_distributed( @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) def test_quantization( - vllm_runner: Type[VllmRunner], + vllm_runner: type[VllmRunner], example_prompts: list[str], model: str, quantization_kwargs: dict[str, str], diff --git a/tests/models/utils.py b/tests/models/utils.py index a90efb1767220..b0182d545f4b9 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import warnings -from typing import Dict, List, Optional, Sequence, Tuple, Union +from collections.abc import Sequence +from typing import Optional, Union import torch @@ -9,7 +10,7 @@ from vllm.inputs import InputContext from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs -TokensText = Tuple[List[int], str] +TokensText = tuple[list[int], str] def check_outputs_equal( @@ -46,7 +47,7 @@ def check_outputs_equal( # * List of top sample logprobs for each sampled token # # Assumes prompt logprobs were not requested. -TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int, +TokensTextLogprobs = tuple[list[int], str, Optional[Union[list[dict[int, float]], SampleLogprobs]]] @@ -57,8 +58,8 @@ def check_outputs_equal( # * Optional list of top sample logprobs for each sampled token # # Assumes prompt logprobs were not requested. -TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]], - List[Dict[str, +TextTextLogprobs = tuple[list[str], str, Optional[Union[list[dict[str, float]], + list[dict[str, Logprob]]]]] # Representation of generated sequence as a tuple of @@ -68,9 +69,9 @@ def check_outputs_equal( # * Optional list of top prompt logprobs for each prompt token # # Allows prompt logprobs to be requested. -TokensTextLogprobsPromptLogprobs = Tuple[ - List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]], - Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]] +TokensTextLogprobsPromptLogprobs = tuple[ + list[int], str, Optional[Union[list[dict[int, float]], SampleLogprobs]], + Optional[Union[list[Optional[dict[int, float]]], PromptLogprobs]]] def check_logprobs_close( @@ -254,8 +255,8 @@ def build_model_context( tokenizer_name: Optional[str] = None, trust_remote_code: bool = False, dtype: Optional[Union[str, torch.dtype]] = None, - mm_processor_kwargs: Optional[Dict] = None, - limit_mm_per_prompt: Optional[Dict] = None, + mm_processor_kwargs: Optional[dict] = None, + limit_mm_per_prompt: Optional[dict] = None, disable_mm_preprocessor_cache: bool = True, ): """Creates an InputContext for a given model. 
diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py index 11e44f12bc56f..64559609abb2d 100644 --- a/tests/mq_llm_engine/utils.py +++ b/tests/mq_llm_engine/utils.py @@ -2,7 +2,7 @@ import asyncio import multiprocessing -from typing import Callable, Tuple, Union +from typing import Callable, Union from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs @@ -16,7 +16,7 @@ async def generate( client: MQLLMEngineClient, request_id: str, num_tokens: int, - return_output: bool = False) -> Union[RequestOutput, Tuple[int, str]]: + return_output: bool = False) -> Union[RequestOutput, tuple[int, str]]: final_output = None count = 0 diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 9822cee14a250..f925e42f46d37 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Test the AsyncLLMEngine with multi-step-decoding -from typing import List, Optional +from typing import Optional import pytest @@ -17,7 +17,7 @@ NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps NUM_PROMPTS = [10] -DEFAULT_SERVER_ARGS: List[str] = [ +DEFAULT_SERVER_ARGS: list[str] = [ "--distributed-executor-backend", "ray", "--gpu-memory-utilization", diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index f9e0f507a1e86..8f76d895fdd29 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -4,7 +4,7 @@ import mimetypes import os from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, Tuple +from typing import TYPE_CHECKING, NamedTuple, Optional import numpy as np import pytest @@ -30,7 +30,7 @@ @pytest.fixture(scope="module") -def url_images() -> Dict[str, Image.Image]: +def url_images() -> dict[str, Image.Image]: connector = MediaConnector() return { @@ -39,7 +39,7 @@ def url_images() -> Dict[str, Image.Image]: } -def get_supported_suffixes() -> Tuple[str, ...]: +def get_supported_suffixes() -> tuple[str, ...]: # We should at least test the file types mentioned in GPT-4 with Vision OPENAI_SUPPORTED_SUFFIXES = ('.png', '.jpeg', '.jpg', '.webp', '.gif') @@ -66,7 +66,7 @@ async def test_fetch_image_http(image_url: str): @pytest.mark.asyncio @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("suffix", get_supported_suffixes()) -async def test_fetch_image_base64(url_images: Dict[str, Image.Image], +async def test_fetch_image_base64(url_images: dict[str, Image.Image], image_url: str, suffix: str): connector = MediaConnector() url_image = url_images[image_url] diff --git a/tests/neuron/test_logits_processor.py b/tests/neuron/test_logits_processor.py index 37d59c9e76a71..6d1514088f90c 100644 --- a/tests/neuron/test_logits_processor.py +++ b/tests/neuron/test_logits_processor.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import Tuple from unittest.mock import patch import pytest @@ -33,7 +32,7 @@ def forward(self, *args, **kwargs): def _prepare_test( batch_size: int -) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: +) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: vocab_size = 32000 input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) fake_logits = torch.full((batch_size, vocab_size), diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py 
b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index a376d2cb340c7..bc4a41cdf00de 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -59,7 +60,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 0abbd8ebb5980..e30166842ea8a 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -5,7 +5,6 @@ """ from dataclasses import dataclass -from typing import Tuple import pytest @@ -53,7 +52,7 @@ class ModelPair: @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES) -def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None: +def test_auto_gptq(model_arg_exptype: tuple[str, None, str]) -> None: model_path, quantization_arg, expected_type = model_arg_exptype try: diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index da59dc75afc13..f64dca6e4bbf6 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -5,7 +5,7 @@ Run `pytest tests/quantization/test_register_quantization_config.py`. 
""" -from typing import Any, Dict, List, Optional +from typing import Any, Optional import pytest import torch @@ -58,7 +58,7 @@ def get_name(self) -> str: """Name of the quantization method.""" return "custom_quant" - def get_supported_act_dtypes(self) -> List["torch.dtype"]: + def get_supported_act_dtypes(self) -> list["torch.dtype"]: """List of supported activation dtypes.""" return [torch.float16, torch.bfloat16] @@ -68,12 +68,12 @@ def get_min_capability(cls) -> int: return -1 @staticmethod - def get_config_filenames() -> List[str]: + def get_config_filenames() -> list[str]: """List of filenames to search for in the model directory.""" return [] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "CustomQuantConfig": + def from_config(cls, config: dict[str, Any]) -> "CustomQuantConfig": """Create a config class from the model's quantization config.""" return CustomQuantConfig(num_bits=config.get("num_bits", 8)) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 78bdd9b0b958d..58c7c256473e0 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import torch @@ -70,7 +68,7 @@ def test_get_prompt_logprobs( assert (len(logprobs) == num_top_logprobs or len(logprobs) == num_top_logprobs + 1) output_text = result.outputs[0].text - output_string_from_most_likely_tokens_lst: List[str] = [] + output_string_from_most_likely_tokens_lst: list[str] = [] for top_logprobs in result.outputs[0].logprobs: top_logprob = next(iter(top_logprobs.values())) output_string_from_most_likely_tokens_lst.append( diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 143f529994156..29e73eb1bead0 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -4,7 +4,7 @@ Run `pytest tests/samplers/test_no_bad_words.py`. 
""" -from typing import List, Optional +from typing import Optional from transformers import AutoTokenizer @@ -16,8 +16,8 @@ def _generate( prompt: str, num_prompt_tokens: int, temperature: float = 0, - bad_words: Optional[List[str]] = None, -) -> List[int]: + bad_words: Optional[list[str]] = None, +) -> list[int]: sampling_params = SamplingParams( temperature=temperature, bad_words=bad_words, @@ -59,7 +59,7 @@ def test_one_token_bad_word(self, vllm_runner): def _generate(self, model: LLM, - bad_words: Optional[List[str]] = None) -> List[int]: + bad_words: Optional[list[str]] = None) -> list[int]: return _generate( model=model, prompt=self.PROMPT, @@ -69,7 +69,7 @@ def _generate(self, def _encode(self, prompt: str, - add_special_tokens: bool = True) -> List[int]: + add_special_tokens: bool = True) -> list[int]: return self.tokenizer(prompt, add_special_tokens=add_special_tokens).input_ids @@ -149,7 +149,7 @@ def test_two_token_bad_word(self, vllm_runner): def _generate(self, model: LLM, - bad_words: Optional[List[str]] = None) -> List[int]: + bad_words: Optional[list[str]] = None) -> list[int]: return _generate( model=model, prompt=self.PROMPT, @@ -158,7 +158,7 @@ def _generate(self, ) @staticmethod - def _contains(sequence: List[int], subsequence: List[int]) -> bool: + def _contains(sequence: list[int], subsequence: list[int]) -> bool: searched = False for start in range(len(sequence)): @@ -181,6 +181,6 @@ def _contains(sequence: List[int], subsequence: List[int]) -> bool: def _encode(self, prompt: str, - add_special_tokens: bool = True) -> List[int]: + add_special_tokens: bool = True) -> list[int]: return self.tokenizer(prompt, add_special_tokens=add_special_tokens).input_ids diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index cc199bf682fc0..2b86dcac7f03c 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for rejection sampling.""" -from typing import List, Tuple import pytest import torch @@ -416,8 +415,8 @@ def test_rejection_sampling_approximates_target_distribution( draft_and_target_probs_equal) sample_sizes = [10, 100, 1_000, 10_000, 100_000] - distance_wrt_reference: List[float] = [] - distance_wrt_target: List[float] = [] + distance_wrt_reference: list[float] = [] + distance_wrt_target: list[float] = [] for num_samples in sample_sizes: (reference_vs_rejsample_dist, @@ -452,7 +451,7 @@ def test_rejection_sampling_approximates_target_distribution( expected_improvement_multiplier) -def get_ratio_first_to_last(elements: List[float]) -> float: +def get_ratio_first_to_last(elements: list[float]) -> float: return elements[0] / elements[-1] @@ -477,7 +476,7 @@ def __init__(self, vocab_size: int, rejection_sampler: RejectionSampler): def generate_probs_for_test( self, draft_and_target_probs_equal: bool - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: draft_probs, target_probs = (F.softmax( torch.rand(self.vocab_size, dtype=torch.float32), dim=-1, @@ -499,7 +498,7 @@ def generate_probs_for_test( def run_and_compare_distributions(self, draft_probs: torch.Tensor, target_probs: torch.Tensor, reference_probs: torch.Tensor, - num_samples: int) -> Tuple[float, float]: + num_samples: int) -> tuple[float, float]: # Sample using rejection sampling. 
rej_sample_probs = self._estimate_rejection_sampling_pdf( draft_probs, target_probs, num_samples) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index ca09e536a06ca..68944ac7e1efa 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -3,7 +3,7 @@ import itertools import random from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Optional from unittest.mock import Mock, patch import pytest @@ -30,7 +30,7 @@ def forward(self, *args, **kwargs): def _prepare_test( batch_size: int -) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: +) -> tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) fake_logits = torch.full((batch_size, VOCAB_SIZE), 1e-2, @@ -53,8 +53,8 @@ def _do_sample( sampling_params: SamplingParams, device: str, ): - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - seq_lens: List[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] for i in range(batch_size): seq_group_metadata_list.append( SequenceGroupMetadata( @@ -171,7 +171,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str): def create_sampling_params(min_tokens, eos_token_id=0, *, - stop_token_ids: Optional[List[int]] = None, + stop_token_ids: Optional[list[int]] = None, prompt_logprobs: Optional[int] = None): sampling_params = SamplingParams( min_tokens=min_tokens, @@ -196,7 +196,7 @@ def generate_test_case(): batch_size = random.randint(1, 128) expected_penalization = [] - sequence_metadata_list: List[SequenceGroupMetadata] = [] + sequence_metadata_list: list[SequenceGroupMetadata] = [] # 20% chance to generate seq group metadata list with all prompts is_prompt = random.random() < 0.2 while batch_size > 0: @@ -216,8 +216,8 @@ def generate_test_case(): eos_token_id=eos_token_id, stop_token_ids=stop_token_ids) - seq_data: Dict[int, SequenceData] = {} - seq_group_penalization: List[bool] = [] + seq_data: dict[int, SequenceData] = {} + seq_group_penalization: list[bool] = [] for _ in range(num_seqs): num_input = random.randint(1, 100) num_generated = 0 if is_prompt else random.randint(1, 100) @@ -376,16 +376,16 @@ def generate_test_case(): else: test_cases = [generate_test_case()] - def run_test_case(*, expected_penalization: List[bool], - seq_group_metadata_list: List[SequenceGroupMetadata]): + def run_test_case(*, expected_penalization: list[bool], + seq_group_metadata_list: list[SequenceGroupMetadata]): assert expected_penalization, \ "Invalid test case, need expected_penalization" assert seq_group_metadata_list, \ "Invalid test case, need seq_group_metadata_list" batch_size = 0 - seq_lens: List[int] = [] - sampling_params_per_row: List[SamplingParams] = [] + seq_lens: list[int] = [] + sampling_params_per_row: list[SamplingParams] = [] for sgm in seq_group_metadata_list: sampling_params = sgm.sampling_params @@ -456,11 +456,11 @@ def test_sampler_mixed(seed: int, device: str): batch_size = random.randint(1, 256) input_tensor, fake_logits, sampler = _prepare_test(batch_size) - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - expected_tokens: List[Optional[List[int]]] = [] - seq_lens: List[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] + expected_tokens: list[Optional[list[int]]] = [] + seq_lens: list[int] = [] for i in range(batch_size): - expected: Optional[List[int]] = None + expected: Optional[list[int]] = None sampling_type = 
random.randint(0, 2) if sampling_type == 0: sampling_params = SamplingParams(temperature=0) @@ -492,7 +492,7 @@ def test_sampler_mixed(seed: int, device: str): )) seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - generators: Dict[str, torch.Generator] = {} + generators: dict[str, torch.Generator] = {} def test_sampling(): sampling_metadata = SamplingMetadata.prepare( @@ -587,8 +587,8 @@ class MockConfig: device=device) assert len(processors) == 2 # top_p and top_k - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - seq_lens: List[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] for i in range(batch_size): seq_group_metadata_list.append( SequenceGroupMetadata( @@ -669,10 +669,10 @@ def test_sampler_repetition_penalty_mixed(device: str): vocab_size = 8 - def test_sampling_params(sampling_params: List[SamplingParams]): + def test_sampling_params(sampling_params: list[SamplingParams]): - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - seq_lens: List[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] for i in range(2): seq_group_metadata_list.append( SequenceGroupMetadata( diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index 53c888816a6c1..fe4a1c13fc730 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Sequence from itertools import cycle -from typing import List, Optional, Sequence, Tuple, Union +from typing import Optional, Union import pytest import torch @@ -64,9 +65,9 @@ def maybe_assert_ngram_worker(llm): def get_output_from_llm_generator( llm_generator, prompts, - sampling_params) -> Tuple[List[str], List[List[int]], float]: - tokens: List[str] = [] - token_ids: List[List[int]] = [] + sampling_params) -> tuple[list[str], list[list[int]], float]: + tokens: list[str] = [] + token_ids: list[list[int]] = [] acceptance_rate: float = -1.0 for llm in llm_generator(): maybe_assert_ngram_worker(llm) diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index fe95ff9b9c35a..9edd8bd4c00d7 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import torch @@ -42,7 +40,7 @@ def test_get_token_ids_to_score(k: int): device='cuda', ) - expected_output: List[List[int]] = [ + expected_output: list[list[int]] = [ [], ] for i in range(proposal_token_ids.shape[0]): diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 2bf401613f063..ca37c9a68dfa4 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import Dict, List from unittest.mock import MagicMock import pytest @@ -221,7 +220,7 @@ def test_same_output_for_multi_step(): # Run single-step repeatedly. zero_kv_cache(worker.cache_engine) - single_step_output: List[SamplerOutput] = [] + single_step_output: list[SamplerOutput] = [] continuations = [[1] for _ in prompts] set_random_seed(seed) @@ -243,15 +242,15 @@ def test_same_output_for_multi_step(): continuations[i].append(seq_group_output.samples[0].output_token) # Get token ids and logprobs for comparison. 
- multi_step_output_logprobs: List[List[Dict[int, + multi_step_output_logprobs: list[list[dict[int, Logprob]]] = [[] for _ in prompts] - single_step_output_logprobs: List[List[Dict[int, + single_step_output_logprobs: list[list[dict[int, Logprob]]] = [[] for _ in prompts] - multi_step_output_token_ids: List[List[int]] = [[] for _ in prompts] - single_step_output_token_ids: List[List[int]] = [[] for _ in prompts] + multi_step_output_token_ids: list[list[int]] = [[] for _ in prompts] + single_step_output_token_ids: list[list[int]] = [[] for _ in prompts] for i, _ in enumerate(prompts): for multi_step, single_step in zip(multi_step_output, single_step_output): @@ -336,7 +335,7 @@ def test_multi_step_with_batch_expansion_correct_output(): # will simulate the bonus token case with the second token # being the bonus token. zero_kv_cache(worker.cache_engine) - single_step_output: List[SamplerOutput] = [] + single_step_output: list[SamplerOutput] = [] set_random_seed(seed) for _ in range(num_steps): seq_group_metadata_list = create_seq_group_metadata_from_prompts( @@ -430,7 +429,7 @@ def test_multi_step_with_batch_expansion_incorrect_output(): # will simulate the bonus token case with the second token # being the bonus token. zero_kv_cache(worker.cache_engine) - single_step_output: List[SamplerOutput] = [] + single_step_output: list[SamplerOutput] = [] set_random_seed(seed) for _ in range(num_steps): seq_group_metadata_list = create_seq_group_metadata_from_prompts( diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 7bbbb0236da1e..161cc9fbf5568 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import List import pytest import torch @@ -15,7 +14,7 @@ from .utils import create_batch, create_worker -def create_proposal(propose_lens: List[int], vocab_size: int, +def create_proposal(propose_lens: list[int], vocab_size: int, device: str) -> SpeculativeProposals: batch_size = len(propose_lens) max_propose_len = max(propose_lens) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index e4b1a178b0c95..f7ef9786a690e 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -3,7 +3,6 @@ import random from collections import defaultdict from types import SimpleNamespace -from typing import Dict, List, Set from unittest.mock import MagicMock import pytest @@ -123,7 +122,7 @@ def test_batch_expansion_correctly_calls_target_model( seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k)) - seen_contexts: List[List[int]] = [] + seen_contexts: list[list[int]] = [] call_args_list = target_worker.execute_model.call_args_list assert len(call_args_list) == 1 @@ -136,7 +135,7 @@ def test_batch_expansion_correctly_calls_target_model( for seq_data in seq_group_metadata.seq_data.values(): seen_contexts.append(seq_data.get_token_ids()) - expected_seen_contexts: List[List[int]] = [] + expected_seen_contexts: list[list[int]] = [] for prompt, prev_generated, draft_tokens in zip( prompts, prev_output_tokens, proposal_token_ids.tolist()): @@ -338,11 +337,11 @@ def test_correctly_formats_output(k: int, batch_size: int, next(iter(seq_group_metadata.seq_data.keys())) for seq_group_metadata in seq_group_metadata_list ] - actual_output_by_seq: Dict[int, List[SequenceOutput]] = { + actual_output_by_seq: dict[int, list[SequenceOutput]] = { seq_id: [] for seq_id in 
seq_ids } - expected_output_by_seq: Dict[int, List[SequenceOutput]] = { + expected_output_by_seq: dict[int, list[SequenceOutput]] = { seq_id: [] for seq_id in seq_ids } @@ -728,7 +727,7 @@ def test_populate_seq_ids_with_bonus_tokens(): size=(batch_size, (k + 1)), dtype=torch.int64, device='cuda') - expected_request_id_seq_ids_mapping: Dict[str, Set[int]] = defaultdict(set) + expected_request_id_seq_ids_mapping: dict[str, set[int]] = defaultdict(set) for seq_group_metadata in seq_group_metadata_list: for seq_id in seq_group_metadata.seq_data: expected_request_id_seq_ids_mapping[ diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 38f57e99bdb0d..d303b7f1219a5 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Sequence as GenericSequence from itertools import count -from typing import Callable, Dict, List, Optional -from typing import Sequence as GenericSequence -from typing import TypeVar, Union +from typing import Callable, Optional, TypeVar, Union from unittest.mock import MagicMock import torch @@ -44,7 +43,7 @@ def mock_worker(cls=None, return worker -def patch_execute_model_with_seeds(worker: Worker, rand_seeds: List[int]): +def patch_execute_model_with_seeds(worker: Worker, rand_seeds: list[int]): seed_iter = iter(rand_seeds) original_execute_model = worker.execute_model @@ -56,7 +55,7 @@ def new_execute_model(*args, **kwargs): return new_execute_model -def zero_kv_cache(cache_engine: List[CacheEngine]): +def zero_kv_cache(cache_engine: list[CacheEngine]): assert cache_engine[0].gpu_cache for key_blocks, value_blocks in cache_engine[0].gpu_cache: key_blocks.zero_() @@ -106,13 +105,13 @@ def create_worker(cls: Callable[..., T], def create_seq_group_metadata_from_prompts( - prompts: List[List[int]], + prompts: list[list[int]], num_gpu_blocks: int, block_size: int, - final_prompt_lens: List[int], - continuations: Optional[List[List[int]]] = None, - seq_ids: Optional[List[int]] = None, -) -> List[SequenceGroupMetadata]: + final_prompt_lens: list[int], + continuations: Optional[list[list[int]]] = None, + seq_ids: Optional[list[int]] = None, +) -> list[SequenceGroupMetadata]: if continuations is None: continuations = [[] for _ in prompts] @@ -149,11 +148,11 @@ def create_seq_group_metadata_from_prompts( def create_chunked_seq_group_metadata_from_prompt( - prompt: List[int], + prompt: list[int], num_gpu_blocks: int, chunk_size: int, block_size: int, - seq_id: Optional[int] = None) -> List[SequenceGroupMetadata]: + seq_id: Optional[int] = None) -> list[SequenceGroupMetadata]: if seq_id is None: seq_id = 0 @@ -184,8 +183,8 @@ def create_chunked_seq_group_metadata_from_prompt( def assert_logprobs_dict_allclose( - actual_logprobs: List[Dict[int, Logprob]], - expected_logprobs: List[Dict[int, Logprob]]) -> None: + actual_logprobs: list[dict[int, Logprob]], + expected_logprobs: list[dict[int, Logprob]]) -> None: for single_step_actual_logprobs, single_step_expected_logprobs in zip( actual_logprobs, expected_logprobs): assert set(single_step_actual_logprobs.keys()) == set( @@ -202,7 +201,7 @@ def create_sampler_output_list( token_ids: torch.Tensor, probs: GenericSequence[Optional[torch.Tensor]], logprobs: GenericSequence[Optional[torch.Tensor]], - seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]: + seq_ids: Optional[list[int]] = None) -> list[SamplerOutput]: num_steps, batch_size = token_ids.shape token_ids_by_step = token_ids.tolist() @@ -231,9 +230,9 @@ def 
create_sampler_output_list( def create_batch(batch_size, k, - prompt_len: Union[int, List[int]] = 10, + prompt_len: Union[int, list[int]] = 10, prev_output_token_len: int = 10, - seq_ids: Optional[List[int]] = None, + seq_ids: Optional[list[int]] = None, num_gpu_blocks: Optional[int] = None, block_size: Optional[int] = None, prefill_chunk_size: Optional[int] = None): diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 17c128a176563..05d2c624df178 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -3,7 +3,7 @@ Run `pytest tests/test_cache_block_hashing.py`. """ -from typing import List, Optional +from typing import Optional import pytest @@ -44,7 +44,7 @@ def flatten_2d(li): @pytest.mark.parametrize("concurrent_lora_int_ids", [[None], [1], [None, 1], [None, 1, 2], [1, 2]]) def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, - concurrent_lora_int_ids: List[Optional[int]]): + concurrent_lora_int_ids: list[Optional[int]]): tokenizer = TokenizerGroup( tokenizer_id="facebook/opt-125m", @@ -53,7 +53,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, max_input_length=None, ) - hashes: List[List[List[int]]] = [] + hashes: list[list[list[int]]] = [] for prefix in prefixes: for lora_int_id in concurrent_lora_int_ids: diff --git a/tests/test_inputs.py b/tests/test_inputs.py index fff909154a2ae..d361808ed2f9a 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from vllm.inputs import zip_enc_dec_prompts @@ -45,7 +43,7 @@ def test_parse_single_batch_string_consistent(string_input: str): @pytest.mark.parametrize('token_input', TOKEN_INPUTS) -def test_parse_single_batch_token_consistent(token_input: List[int]): +def test_parse_single_batch_token_consistent(token_input: list[int]): assert parse_and_batch_prompt(token_input) \ == parse_and_batch_prompt([token_input]) diff --git a/tests/test_logger.py b/tests/test_logger.py index 993822e922405..11deae309ac8b 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -155,7 +155,7 @@ def test_an_error_is_raised_when_custom_logging_config_is_unexpected_json( with pytest.raises(ValueError) as ex_info: _configure_vllm_root_logger() assert ex_info.type == ValueError # noqa: E721 - assert "Invalid logging config. Expected Dict, got" in str(ex_info) + assert "Invalid logging config. 
Expected dict, got" in str(ex_info) @patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py index 487fbb8fcb8c8..8301c645b79f8 100644 --- a/tests/test_logits_processor.py +++ b/tests/test_logits_processor.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import Tuple from unittest.mock import patch import pytest @@ -33,7 +32,7 @@ def forward(self, *args, **kwargs): def _prepare_test( batch_size: int -) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: +) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: vocab_size = 32000 input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) fake_logits = torch.full((batch_size, vocab_size), diff --git a/tests/test_utils.py b/tests/test_utils.py index 5b69ffd18bb28..8b67e92fca688 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,7 +3,7 @@ import asyncio import os import socket -from typing import AsyncIterator, Tuple +from collections.abc import AsyncIterator from unittest.mock import patch import pytest @@ -33,7 +33,7 @@ async def mock_async_iterator(idx: int): iterators = [mock_async_iterator(i) for i in range(3)] merged_iterator = merge_async_iterators(*iterators) - async def stream_output(generator: AsyncIterator[Tuple[int, str]]): + async def stream_output(generator: AsyncIterator[tuple[int, str]]): async for idx, output in generator: print(f"idx: {idx}, output: {output}") diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 851c79d2e09c3..9aa2eea3154cc 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, Generator, List, Optional +from collections.abc import Generator +from typing import Any, Optional import pytest from transformers import AutoTokenizer @@ -163,7 +164,7 @@ def detokenizer(tokenizer_name: str) -> Detokenizer: @pytest.fixture(name="complete_sequence_token_ids") def create_complete_sequence_token_ids(complete_sequence: str, - tokenizer) -> List[int]: + tokenizer) -> list[int]: complete_sequence_token_ids = tokenizer(complete_sequence).input_ids return complete_sequence_token_ids @@ -178,7 +179,7 @@ def create_sequence(prompt_token_ids=None): def create_dummy_logprobs( - complete_sequence_token_ids: List[int]) -> List[Dict[int, Logprob]]: + complete_sequence_token_ids: list[int]) -> list[dict[int, Logprob]]: return [{ token_id: Logprob(logprob=0.0), token_id + 1: Logprob(logprob=0.1) @@ -186,10 +187,10 @@ def create_dummy_logprobs( def create_dummy_prompt_logprobs( - complete_sequence_token_ids: List[int] -) -> List[Optional[Dict[int, Any]]]: + complete_sequence_token_ids: list[int] +) -> list[Optional[dict[int, Any]]]: # logprob for the first prompt token is None. 
- logprobs: List[Optional[Dict[int, Any]]] = [None] + logprobs: list[Optional[dict[int, Any]]] = [None] logprobs.extend(create_dummy_logprobs(complete_sequence_token_ids)[1:]) return logprobs @@ -198,7 +199,7 @@ def create_dummy_prompt_logprobs( @pytest.mark.parametrize("tokenizer_name", TOKENIZERS) @pytest.mark.parametrize("skip_special_tokens", [True, False], indirect=True) def test_decode_sequence_logprobs(complete_sequence: str, - complete_sequence_token_ids: List[int], + complete_sequence_token_ids: list[int], detokenizer: Detokenizer, skip_special_tokens: bool): """Verify Detokenizer decodes logprobs correctly.""" @@ -208,8 +209,8 @@ def test_decode_sequence_logprobs(complete_sequence: str, # Run sequentially. seq = create_sequence() dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids) - sequential_logprobs_text_chosen_token: List[str] = [] - sequential_logprobs_text_other_token: List[str] = [] + sequential_logprobs_text_chosen_token: list[str] = [] + sequential_logprobs_text_other_token: list[str] = [] for new_token, logprobs in zip(complete_sequence_token_ids, dummy_logprobs): seq.append_token_id(new_token, logprobs) @@ -232,7 +233,7 @@ def test_decode_sequence_logprobs(complete_sequence: str, @pytest.mark.parametrize("complete_sequence", TRUTH) @pytest.mark.parametrize("tokenizer_name", TOKENIZERS) -def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int], +def test_decode_prompt_logprobs(complete_sequence_token_ids: list[int], detokenizer: Detokenizer): """Verify Detokenizer decodes prompt logprobs correctly.""" sampling_params = SamplingParams(skip_special_tokens=True, @@ -249,7 +250,7 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int], dummy_logprobs, position_offset=0) # First logprob is None. - decoded_prompt_logprobs: List[Dict[int, Any]] = dummy_logprobs[ + decoded_prompt_logprobs: list[dict[int, Any]] = dummy_logprobs[ 1:] # type: ignore # decoded_prompt_logprobs doesn't contain the first token. 
diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index 8e99f86917b88..d1873823ac187 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -3,7 +3,7 @@ import asyncio import os import sys -from typing import List, Optional +from typing import Optional from unittest.mock import patch import pytest @@ -129,7 +129,7 @@ class FailingTokenizerGroup(TokenizerGroup): def __init__(self, *args, - fail_at: Optional[List[int]] = None, + fail_at: Optional[list[int]] = None, **kwargs): super().__init__(*args, **kwargs) self.i = 0 diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py index 793d38f9c3666..772eeb345ca4d 100644 --- a/tests/tokenization/test_tokenizer_registry.py +++ b/tests/tokenization/test_tokenizer_registry.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer_base import (TokenizerBase, @@ -17,15 +17,15 @@ def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer": return TestTokenizer() @property - def all_special_tokens_extended(self) -> List[str]: + def all_special_tokens_extended(self) -> list[str]: raise NotImplementedError() @property - def all_special_tokens(self) -> List[str]: + def all_special_tokens(self) -> list[str]: raise NotImplementedError() @property - def all_special_ids(self) -> List[int]: + def all_special_ids(self) -> list[int]: raise NotImplementedError() @property @@ -58,7 +58,7 @@ def max_token_id(self) -> int: def __call__( self, - text: Union[str, List[str], List[int]], + text: Union[str, list[str], list[int]], text_pair: Optional[str] = None, add_special_tokens: bool = False, truncation: bool = False, @@ -66,10 +66,10 @@ def __call__( ): raise NotImplementedError() - def get_vocab(self) -> Dict[str, int]: + def get_vocab(self) -> dict[str, int]: raise NotImplementedError() - def get_added_vocab(self) -> Dict[str, int]: + def get_added_vocab(self) -> dict[str, int]: raise NotImplementedError() def encode_one( @@ -77,33 +77,33 @@ def encode_one( text: str, truncation: bool = False, max_length: Optional[int] = None, - ) -> List[int]: + ) -> list[int]: raise NotImplementedError() def encode(self, text: str, - add_special_tokens: Optional[bool] = None) -> List[int]: + add_special_tokens: Optional[bool] = None) -> list[int]: raise NotImplementedError() def apply_chat_template(self, - messages: List["ChatCompletionMessageParam"], - tools: Optional[List[Dict[str, Any]]] = None, - **kwargs) -> List[int]: + messages: list["ChatCompletionMessageParam"], + tools: Optional[list[dict[str, Any]]] = None, + **kwargs) -> list[int]: raise NotImplementedError() - def convert_tokens_to_string(self, tokens: List[str]) -> str: + def convert_tokens_to_string(self, tokens: list[str]) -> str: raise NotImplementedError() def decode(self, - ids: Union[List[int], int], + ids: Union[list[int], int], skip_special_tokens: bool = True) -> str: raise NotImplementedError() def convert_ids_to_tokens( self, - ids: List[int], + ids: list[int], skip_special_tokens: bool = True, - ) -> List[str]: + ) -> list[str]: raise NotImplementedError() diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index da033fa1d85c3..448347be6ec1d 100644 --- 
a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import openai import pytest @@ -45,7 +43,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI, logprobs=False, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 role_sent: bool = False @@ -116,7 +114,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 role_sent: bool = False diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py index 7e349c51253c6..a40675744ba24 100644 --- a/tests/tool_use/test_jamba_tool_parser.py +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Generator, List, Optional +from collections.abc import Generator +from typing import Optional import partial_json_parser import pytest @@ -26,8 +27,8 @@ def jamba_tool_parser(jamba_tokenizer): return JambaToolParser(jamba_tokenizer) -def assert_tool_calls(actual_tool_calls: List[ToolCall], - expected_tool_calls: List[ToolCall]): +def assert_tool_calls(actual_tool_calls: list[ToolCall], + expected_tool_calls: list[ToolCall]): assert len(actual_tool_calls) == len(expected_tool_calls) for actual_tool_call, expected_tool_call in zip(actual_tool_calls, @@ -218,10 +219,10 @@ def test_extract_tool_calls_streaming(jamba_tool_parser, jamba_tokenizer, model_output, expected_tool_calls, expected_content): other_content: str = '' - function_names: List[str] = [] - function_args_strs: List[str] = [] + function_names: list[str] = [] + function_args_strs: list[str] = [] tool_call_idx: int = -1 - tool_call_ids: List[Optional[str]] = [] + tool_call_ids: list[Optional[str]] = [] for delta_message in stream_delta_message_generator( jamba_tool_parser, jamba_tokenizer, model_output): diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py index b49a5e8e7e4c7..910e0b2d51ab6 100644 --- a/tests/tool_use/test_parallel_tool_calls.py +++ b/tests/tool_use/test_parallel_tool_calls.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Dict, List, Optional +from typing import Optional import openai import pytest @@ -54,7 +54,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI, assert isinstance(tool_call.function.arguments, str) parsed_arguments = json.loads(tool_call.function.arguments) - assert isinstance(parsed_arguments, Dict) + assert isinstance(parsed_arguments, dict) assert isinstance(parsed_arguments.get("city"), str) assert isinstance(parsed_arguments.get("state"), str) @@ -73,8 +73,8 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI, role_name: Optional[str] = None finish_reason_count: int = 0 - tool_call_names: List[str] = [] - tool_call_args: List[str] = [] + tool_call_names: list[str] = [] + tool_call_args: list[str] = [] tool_call_idx: int = -1 tool_call_id_count: int = 0 @@ -180,7 +180,7 @@ async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI, logprobs=False, stream=True) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 role_sent: bool = False diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py index 45f1bfc45bd78..b320b335e338c 100644 --- a/tests/tool_use/test_tool_calls.py +++ 
b/tests/tool_use/test_tool_calls.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Dict, List, Optional +from typing import Optional import openai import pytest @@ -44,7 +44,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI): # make sure the arguments parse properly parsed_arguments = json.loads(tool_calls[0].function.arguments) - assert isinstance(parsed_arguments, Dict) + assert isinstance(parsed_arguments, dict) assert isinstance(parsed_arguments.get("city"), str) assert isinstance(parsed_arguments.get("state"), str) assert parsed_arguments.get("city") == "Dallas" @@ -117,7 +117,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI): # validate arguments streamed_args = json.loads(function_args_str) - assert isinstance(streamed_args, Dict) + assert isinstance(streamed_args, dict) assert isinstance(streamed_args.get("city"), str) assert isinstance(streamed_args.get("state"), str) assert streamed_args.get("city") == "Dallas" @@ -128,7 +128,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI): assert choice.message.role == role_name assert choice.message.tool_calls[0].function.name == function_name - # compare streamed with non-streamed args Dict-wise, not string-wise + # compare streamed with non-streamed args dict-wise, not string-wise # because character-to-character comparison might not work e.g. the tool # call parser adding extra spaces or something like that. we care about the # dicts matching not byte-wise match @@ -167,7 +167,7 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI): logprobs=False, stream=True) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 role_sent: bool = False diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index a7dfb10780a38..fd947bd7fed06 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from copy import deepcopy -from typing import Any, Dict, List, Optional +from typing import Any, Optional from openai.types.chat import (ChatCompletionMessageParam, ChatCompletionToolParam) @@ -12,14 +12,14 @@ class ServerConfig(TypedDict, total=False): model: str - arguments: List[str] + arguments: list[str] system_prompt: Optional[str] supports_parallel: Optional[bool] supports_rocm: Optional[bool] -def patch_system_prompt(messages: List[Dict[str, Any]], - system_prompt: str) -> List[Dict[str, Any]]: +def patch_system_prompt(messages: list[dict[str, Any]], + system_prompt: str) -> list[dict[str, Any]]: new_messages = deepcopy(messages) if new_messages[0]["role"] == "system": new_messages[0]["content"] = system_prompt @@ -28,8 +28,8 @@ def patch_system_prompt(messages: List[Dict[str, Any]], return new_messages -def ensure_system_prompt(messages: List[Dict[str, Any]], - config: ServerConfig) -> List[Dict[str, Any]]: +def ensure_system_prompt(messages: list[dict[str, Any]], + config: ServerConfig) -> list[dict[str, Any]]: prompt = config.get("system_prompt") if prompt: return patch_system_prompt(messages, prompt) @@ -39,9 +39,9 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], # universal args for all models go here. also good if you need to test locally # and change type or KV cache quantization or something. 
-ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "1024"] +ARGS: list[str] = ["--enable-auto-tool-choice", "--max-model-len", "1024"] -CONFIGS: Dict[str, ServerConfig] = { +CONFIGS: dict[str, ServerConfig] = { "hermes": { "model": "NousResearch/Hermes-3-Llama-3.1-8B", @@ -205,7 +205,7 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], } } -MESSAGES_WITHOUT_TOOLS: List[ChatCompletionMessageParam] = [{ +MESSAGES_WITHOUT_TOOLS: list[ChatCompletionMessageParam] = [{ "role": "user", "content": @@ -222,14 +222,14 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], "Can you tell me a joke please?" }] -MESSAGES_ASKING_FOR_TOOLS: List[ChatCompletionMessageParam] = [{ +MESSAGES_ASKING_FOR_TOOLS: list[ChatCompletionMessageParam] = [{ "role": "user", "content": "What is the weather in Dallas, Texas in Fahrenheit?" }] -MESSAGES_WITH_TOOL_RESPONSE: List[ChatCompletionMessageParam] = [{ +MESSAGES_WITH_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [{ "role": "user", "content": @@ -258,7 +258,7 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], "cloudy skies and a low chance of rain." }] -MESSAGES_ASKING_FOR_PARALLEL_TOOLS: List[ChatCompletionMessageParam] = [{ +MESSAGES_ASKING_FOR_PARALLEL_TOOLS: list[ChatCompletionMessageParam] = [{ "role": "user", "content": @@ -266,7 +266,7 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], "Fahrenheit?" }] -MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: List[ChatCompletionMessageParam] = [{ +MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [{ "role": "user", "content": diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index 592775e8b8926..5fc5d08b327bb 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -2,8 +2,9 @@ import os import threading +from collections.abc import Iterable from concurrent import futures -from typing import Callable, Dict, Iterable, Literal +from typing import Callable, Literal import grpc import pytest @@ -25,7 +26,7 @@ def decode_value(value: AnyValue): - field_decoders: Dict[FieldName, Callable] = { + field_decoders: dict[FieldName, Callable] = { "bool_value": (lambda v: v.bool_value), "string_value": (lambda v: v.string_value), "int_value": (lambda v: v.int_value), diff --git a/tests/utils.py b/tests/utils.py index 2ad91ca2c8699..5a97636eec64a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -11,7 +11,7 @@ import warnings from contextlib import contextmanager from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Type, Union +from typing import Any, Callable, Optional, Union import openai import pytest @@ -73,9 +73,9 @@ class RemoteOpenAIServer: def __init__(self, model: str, - vllm_serve_args: List[str], + vllm_serve_args: list[str], *, - env_dict: Optional[Dict[str, str]] = None, + env_dict: Optional[dict[str, str]] = None, auto_port: bool = True, max_wait_seconds: Optional[float] = None) -> None: if auto_port: @@ -183,7 +183,7 @@ def _test_completion( client: openai.OpenAI, model: str, prompt: str, - token_ids: List[int], + token_ids: list[int], ): results = [] @@ -400,10 +400,10 @@ def _test_image_text( def compare_two_settings(model: str, - arg1: List[str], - arg2: List[str], - env1: Optional[Dict[str, str]] = None, - env2: Optional[Dict[str, str]] = None, + arg1: list[str], + arg2: list[str], + env1: Optional[dict[str, str]] = None, + env2: Optional[dict[str, str]] = None, *, method: str = "generate", max_wait_seconds: Optional[float] = None) -> None: @@ -429,8 +429,8 @@ def 
compare_two_settings(model: str, def compare_all_settings(model: str, - all_args: List[List[str]], - all_envs: List[Optional[Dict[str, str]]], + all_args: list[list[str]], + all_envs: list[Optional[dict[str, str]]], *, method: str = "generate", max_wait_seconds: Optional[float] = None) -> None: @@ -470,7 +470,7 @@ def compare_all_settings(model: str, prompt = "Hello, my name is" token_ids = tokenizer(prompt).input_ids - ref_results: List = [] + ref_results: list = [] for i, (args, env) in enumerate(zip(all_args, all_envs)): if can_force_load_format: # we are comparing the results and @@ -481,7 +481,7 @@ def compare_all_settings(model: str, # environment variable to force the load format, # e.g. in quantization tests. args = args + ["--load-format", envs.VLLM_TEST_FORCE_LOAD_FORMAT] - compare_results: List = [] + compare_results: list = [] results = ref_results if i == 0 else compare_results with RemoteOpenAIServer(model, args, @@ -582,7 +582,7 @@ def multi_process_parallel( @contextmanager -def error_on_warning(category: Type[Warning] = Warning): +def error_on_warning(category: type[Warning] = Warning): """ Within the scope of this context manager, tests will fail if any warning of the given category is emitted. @@ -604,7 +604,7 @@ def get_physical_device_indices(devices): @_nvml() -def wait_for_gpu_memory_to_clear(devices: List[int], +def wait_for_gpu_memory_to_clear(devices: list[int], threshold_bytes: int, timeout_s: float = 120) -> None: # Use nvml instead of pytorch to reduce measurement error from torch cuda @@ -612,8 +612,8 @@ def wait_for_gpu_memory_to_clear(devices: List[int], devices = get_physical_device_indices(devices) start_time = time.time() while True: - output: Dict[int, str] = {} - output_raw: Dict[int, float] = {} + output: dict[int, str] = {} + output_raw: dict[int, float] = {} for device in devices: if current_platform.is_rocm(): dev_handle = amdsmi_get_processor_handles()[device] @@ -758,13 +758,13 @@ def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: async def completions_with_server_args( - prompts: List[str], + prompts: list[str], model_name: str, - server_cli_args: List[str], + server_cli_args: list[str], num_logprobs: Optional[int], max_wait_seconds: int = 240, max_tokens: Union[int, list] = 5, -) -> List[Completion]: +) -> list[Completion]: '''Construct a remote OpenAI server, obtain an async client to the server & invoke the completions API to obtain completions. @@ -807,7 +807,7 @@ async def completions_with_server_args( return outputs -def get_client_text_generations(completions: List[Completion]) -> List[str]: +def get_client_text_generations(completions: list[Completion]) -> list[str]: '''Extract generated tokens from the output of a request made to an Open-AI-protocol completions endpoint. 
''' @@ -816,7 +816,7 @@ def get_client_text_generations(completions: List[Completion]) -> List[str]: def get_client_text_logprob_generations( - completions: List[Completion]) -> List[TextTextLogprobs]: + completions: list[Completion]) -> list[TextTextLogprobs]: '''Operates on the output of a request made to an Open-AI-protocol completions endpoint; obtains top-rank logprobs for each token in each :class:`SequenceGroup` diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index eb730973c946d..f45c21ab75ba5 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from typing import Optional from vllm.config import CacheConfig, ModelConfig, SchedulerConfig from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange @@ -48,9 +48,9 @@ def create_scheduler( def create_requests( num_requests: int, num_tokens: int = 10, - mm_positions: Optional[List[PlaceholderRange]] = None, + mm_positions: Optional[list[PlaceholderRange]] = None, max_tokens: int = 16, - stop_token_ids: Optional[List[int]] = None, + stop_token_ids: Optional[list[int]] = None, ): sampling_params = SamplingParams(ignore_eos=False, max_tokens=max_tokens, diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py index 560dc31218522..8872f0388dd24 100644 --- a/tests/v1/engine/conftest.py +++ b/tests/v1/engine/conftest.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Tuple - import pytest import torch from transformers import AutoTokenizer @@ -17,8 +15,8 @@ from tests.v1.engine.utils import FULL_STRINGS # isort: skip -EngineCoreSampleLogprobsType = List[Tuple[torch.Tensor, torch.Tensor]] -EngineCorePromptLogprobsType = Tuple[torch.Tensor, torch.Tensor] +EngineCoreSampleLogprobsType = list[tuple[torch.Tensor, torch.Tensor]] +EngineCorePromptLogprobsType = tuple[torch.Tensor, torch.Tensor] def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors: diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index d864cb2af23e9..e7b91aeb0fbdb 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -2,7 +2,7 @@ import asyncio from contextlib import ExitStack -from typing import List, Optional, Tuple +from typing import Optional import pytest @@ -47,7 +47,7 @@ async def generate(engine: AsyncLLM, prompt: PromptType, output_kind: RequestOutputKind, max_tokens: int, - prompt_logprobs: Optional[int] = None) -> Tuple[int, str]: + prompt_logprobs: Optional[int] = None) -> tuple[int, str]: # Ensure generate doesn't complete too fast for cancellation test. 
await asyncio.sleep(0.2) @@ -114,7 +114,7 @@ async def test_async_llm_refuses_prompt_logprobs_with_apc( (VISION_ENGINE_ARGS, VISION_PROMPT)]) @pytest.mark.asyncio async def test_load(monkeypatch, output_kind: RequestOutputKind, - engine_args_and_prompt: Tuple[AsyncEngineArgs, + engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType]): # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # so that in the future when we switch, we don't have to change all the @@ -160,7 +160,7 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind, (VISION_ENGINE_ARGS, VISION_PROMPT)]) @pytest.mark.asyncio async def test_abort(monkeypatch, output_kind: RequestOutputKind, - engine_args_and_prompt: Tuple[AsyncEngineArgs, + engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType]): with monkeypatch.context() as m, ExitStack() as after: @@ -177,7 +177,7 @@ async def test_abort(monkeypatch, output_kind: RequestOutputKind, request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] # Create concurrent requests. - tasks: List[asyncio.Task] = [] + tasks: list[asyncio.Task] = [] for request_id in request_ids: tasks.append( asyncio.create_task( diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 8c2998e588920..11c22effb122f 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -5,7 +5,6 @@ import time import uuid from concurrent.futures import Future -from typing import List import pytest from transformers import AutoTokenizer @@ -213,7 +212,7 @@ def make_request_with_max_tokens(max_tokens: int) -> EngineCoreRequest: class DummyExecutor(UniProcExecutor): def initialize_from_config( - self, kv_cache_configs: List[KVCacheConfig]) -> None: + self, kv_cache_configs: list[KVCacheConfig]) -> None: super().initialize_from_config(kv_cache_configs) # This executor actually can only run 1 batch at a time diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index a7c02322ff02d..3880a3dd9b8ae 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -3,7 +3,7 @@ import asyncio import time import uuid -from typing import Dict, List, Optional +from typing import Optional import pytest from transformers import AutoTokenizer @@ -44,7 +44,7 @@ def make_request(params: SamplingParams) -> EngineCoreRequest: ) -def loop_until_done(client: EngineCoreClient, outputs: Dict): +def loop_until_done(client: EngineCoreClient, outputs: dict): while True: engine_core_outputs = client.get_output().outputs @@ -62,7 +62,7 @@ def loop_until_done(client: EngineCoreClient, outputs: Dict): break -async def loop_until_done_async(client: EngineCoreClient, outputs: Dict): +async def loop_until_done_async(client: EngineCoreClient, outputs: dict): while True: engine_core_outputs = (await client.get_output_async()).outputs @@ -121,7 +121,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): client.add_request(request) time.sleep(0.01) - outputs: Dict[str, List] = {req_id: [] for req_id in request_ids} + outputs: dict[str, list] = {req_id: [] for req_id in request_ids} loop_until_done(client, outputs) for req_id in request_ids: @@ -207,7 +207,7 @@ async def test_engine_core_client_asyncio(monkeypatch): await client.add_request_async(request) await asyncio.sleep(0.01) - outputs: Dict[str, List] = {req_id: [] for req_id in request_ids} + outputs: dict[str, list] = {req_id: [] for req_id in request_ids} await 
loop_until_done_async(client, outputs) for req_id in request_ids: diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index de2a39ee9c083..33c884e6de357 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import Dict, List, Optional, Tuple +from typing import Optional import pytest @@ -47,9 +47,9 @@ def vllm_model_apc(vllm_runner, monkeypatch): def _get_test_sampling_params( - prompt_list: List[str], + prompt_list: list[str], seed: Optional[int] = 42, -) -> Tuple[List[SamplingParams], List[int]]: +) -> tuple[list[SamplingParams], list[int]]: """Generate random sampling params for a batch.""" def get_mostly_n_gt1() -> int: @@ -81,7 +81,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: # Validate each request response for out, n in zip(outputs, n_list): - completion_counts: Dict[str, int] = {} + completion_counts: dict[str, int] = {} # Assert correct number of completions assert len(out.outputs) == n, ( f"{len(out.outputs)} completions; {n} expected.") diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 1d47df417ddaa..0de853ba6e5e5 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -2,7 +2,7 @@ import math import time -from typing import Dict, List, Optional +from typing import Optional import pytest @@ -112,12 +112,12 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind, def _validate_logprobs( - gen_tokens: Dict[str, List[int]], - gen_logprobs: Dict[str, Optional[SampleLogprobs]], - gen_prompt_logprobs: Dict[str, Optional[PromptLogprobs]], - gen_cumulative_logprob: Dict[str, float], + gen_tokens: dict[str, list[int]], + gen_logprobs: dict[str, Optional[SampleLogprobs]], + gen_prompt_logprobs: dict[str, Optional[PromptLogprobs]], + gen_cumulative_logprob: dict[str, float], dtv: DummyOutputProcessorTestVectors, - request_id_list: List[str], + request_id_list: list[str], num_sample_logprobs: Optional[int], num_prompt_logprobs: Optional[int], ) -> None: diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index 39248ce86f25a..02baa4801a47a 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -2,7 +2,7 @@ import random from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import Optional, Union import torch from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast @@ -61,7 +61,7 @@ def _create_random_top_logprob_test_vector( def _create_random_top_logprob_test_matrix( - shape: Tuple, + shape: tuple, lower: float, upper: float, ) -> torch.Tensor: @@ -90,7 +90,7 @@ def _create_random_top_token_test_vector( lower: int, upper: int, sampled_token_id: int, - adjust_num_logprobs: bool = True) -> Tuple[torch.Tensor, int]: + adjust_num_logprobs: bool = True) -> tuple[torch.Tensor, int]: """Create a random vector of top logprob token indices Use to create fake sample logprobs for testing. The sampled token @@ -141,11 +141,11 @@ def _create_random_top_token_test_vector( def _create_random_top_token_test_matrix( - shape: Tuple[int, int], + shape: tuple[int, int], lower: int, upper: int, - tokens_list: List[int], -) -> Tuple[torch.Tensor, torch.Tensor]: + tokens_list: list[int], +) -> tuple[torch.Tensor, torch.Tensor]: """Create a random matrix of top logprob token indices Use to create fake prompt logprobs for testing. 
@@ -160,7 +160,7 @@ def _create_random_top_token_test_matrix( upper: upper range of token ids Returns: - Tuple containing: + tuple containing: - 2D num_tokens x num_logprobs+1 torch Tensor of token ids - 1D tensor of ranks of prompt tokens in their respective rows, or random values @@ -206,10 +206,10 @@ def decode_token( def generate_dummy_sample_logprobs( - sampled_tokens_list: List, + sampled_tokens_list: list, num_logprobs: int, tokenizer: PreTrainedTokenizer, -) -> List[Tuple[List[int], List[float], int]]: +) -> list[tuple[list[int], list[float], int]]: """Generate dummy sample logprobs Generate a test data structure which imitates the list of sample logprobs @@ -221,7 +221,7 @@ def generate_dummy_sample_logprobs( tokenizer: model tokenizer to use for detokenization Returns - List of (top token ids vector, logprobs vector, sampled token rank) + list of (top token ids vector, logprobs vector, sampled token rank) Python lists tuples; in each tuple the logprobs and top token ids vectors have the same length which is either `num_logprobs` or `num_logprobs+1`. Sampled token rank is the rank (index+1) of the @@ -253,7 +253,7 @@ def generate_dummy_sample_logprobs( def generate_dummy_prompt_logprobs_tensors( - prompt_tokens_list: List, + prompt_tokens_list: list, num_logprobs: int, tokenizer: PreTrainedTokenizer, ) -> LogprobsTensors: @@ -269,7 +269,7 @@ def generate_dummy_prompt_logprobs_tensors( tokenizer: model tokenizer to use for detokenization Returns - Single Tuple of (logprobs matrix, top token ids matrix) torch Tensor, + Single tuple of (logprobs matrix, top token ids matrix) torch Tensor, where both matrices have dimensions num_prompt_tokens x num_logprobs """ @@ -301,19 +301,19 @@ class DummyOutputProcessorTestVectors: tokenizer: GeneralTokenizerType tokenizer_group: BaseTokenizerGroup vllm_config: EngineArgs - full_tokens: List[List[int]] # Prompt + generated tokens - prompt_tokens: List[List[int]] - generation_tokens: List[List[int]] + full_tokens: list[list[int]] # Prompt + generated tokens + prompt_tokens: list[list[int]] + generation_tokens: list[list[int]] # Each request is associated with a tuple of # (top tokens, top logprobs, ranks) prompt logprobs tensors - prompt_logprobs: List[LogprobsTensors] + prompt_logprobs: list[LogprobsTensors] # Each request is associated with a sample logprobs; a request's # sample logprobs are a list of (top tokens, top logprobs, ranks) # sample logprobs tensors at each sequence position - generation_logprobs: List[List[Tuple[List[int], List[float], int]]] - prompt_strings: List[str] - prompt_strings_len: List[int] - generation_strings: List[str] + generation_logprobs: list[list[tuple[list[int], list[float], int]]] + prompt_strings: list[str] + prompt_strings_len: list[int] + generation_strings: list[str] class MockEngineCore: @@ -321,18 +321,18 @@ class MockEngineCore: def __init__( self, - tokens_list: List[List[int]], + tokens_list: list[list[int]], # For each request, for each sampled token offset, # a tuple of # (list of topk token ids, list of sample logprob vals, rank) - generated_logprobs_raw: Optional[List[List[Tuple[List[int], - List[float], + generated_logprobs_raw: Optional[list[list[tuple[list[int], + list[float], int]]]] = None, # For each request, a tuple of # (prompt logprob val matrix, prompt logprob tok id matrix); # each matrix has dimensions # (num prompt toks) x (num prompt logprobs+1) - prompt_logprobs_raw: Optional[List[LogprobsTensors]] = None, + prompt_logprobs_raw: Optional[list[LogprobsTensors]] = None, ) -> None: 
self.tokens_list = tokens_list self.current_idx = 0 @@ -341,7 +341,7 @@ def __init__( self.prompt_logprobs_raw = prompt_logprobs_raw self.do_prompt_logprobs = prompt_logprobs_raw is not None - def get_outputs(self) -> List[EngineCoreOutput]: + def get_outputs(self) -> list[EngineCoreOutput]: do_logprobs = self.do_logprobs do_prompt_logprobs = self.do_prompt_logprobs token_idx = self.current_idx diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 35e059ccb5480..171c84176eae7 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import Dict, List, Optional +from typing import Optional import openai # use the official client for correctness check import pytest @@ -193,7 +193,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]): - params: Dict = { + params: dict = { "prompt": ["A robot may not injure another robot", "My name is"], "model": model_name, } @@ -237,7 +237,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: chunks.append(chunk.choices[0].text) @@ -278,7 +278,7 @@ async def test_parallel_no_streaming(client: openai.AsyncOpenAI, num_completions = len(completion.choices) assert num_completions == n, ( f"Num completions {num_completions} but expected {n}.") - completion_repeats: Dict[str, int] = {} + completion_repeats: dict[str, int] = {} for idx, choice in enumerate(completion.choices): # Assert correct completion index & some finish reason. assert choice.index == idx, ( @@ -321,7 +321,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): temperature=0.95, stream=True, seed=42) - chunks: List[List[str]] = [[] for i in range(n)] + chunks: list[list[str]] = [[] for i in range(n)] finish_reason_count = 0 async for chunk in stream: index = chunk.choices[0].index @@ -332,7 +332,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): # Assert `n` completions with correct finish reasons assert finish_reason_count == n, ( f"Expected {n} completions with valid indices and finish_reason.") - completion_repeats: Dict[str, int] = {} + completion_repeats: dict[str, int] = {} for chunk in chunks: chunk_len = len(chunk) # Assert correct number of completion tokens diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index a26a8c4ed0749..d564a8c2e7a77 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import itertools -from typing import List, Tuple import pytest import torch @@ -46,8 +45,8 @@ def hf_model(hf_runner): def _repeat_logprob_config( test_prompts, - logprob_prompt_logprob_list: List[Tuple], -) -> List[Tuple]: + logprob_prompt_logprob_list: list[tuple], +) -> list[tuple]: """Ensure each test prompt has a logprob config. A logprob config specifies the optional (i.e. 
@@ -74,7 +73,7 @@ def _repeat_logprob_config( tuples Returns: - List of + list of (optional num sample logprob,optional num prompt logprob) tuples which is either identical to `logprob_prompt_logprob_list`, or else repeats @@ -177,7 +176,7 @@ def _test_case_get_logprobs_and_prompt_logprobs( for r in range(1, num_top_logprobs + 1)) output_text = vllm_result.outputs[0].text - output_string_from_most_likely_tokens_lst: List[str] = [] + output_string_from_most_likely_tokens_lst: list[str] = [] for top_logprobs in vllm_result.outputs[0].logprobs: top_logprob = next(iter(top_logprobs.values())) output_string_from_most_likely_tokens_lst.append( diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index f00585b40ba3f..b1862455d0ece 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List import pytest import torch @@ -13,7 +12,7 @@ def sampler(): return RejectionSampler() -def create_logits_tensor(token_ids: List[int], +def create_logits_tensor(token_ids: list[int], vocab_size: int = 100) -> torch.Tensor: """Helper function to create logits tensor that will produce desired token ids on argmax""" @@ -23,7 +22,7 @@ def create_logits_tensor(token_ids: List[int], return logits -def create_sampling_metadata(spec_tokens: List[List[int]]) -> SamplingMetadata: +def create_sampling_metadata(spec_tokens: list[list[int]]) -> SamplingMetadata: batch_size = len(spec_tokens) return SamplingMetadata( temperature=torch.tensor([]), @@ -106,7 +105,7 @@ def test_single_token_sequence(sampler): def test_empty_sequence(sampler): """Test handling empty sequence of speculated tokens""" - spec_tokens: List[List[int]] = [[]] + spec_tokens: list[list[int]] = [[]] output_tokens = [5] # Just the bonus token metadata = create_sampling_metadata(spec_tokens) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 435c1b7b5fda9..b702d9ed7f83f 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional, Set, Tuple +from typing import Optional import numpy as np import pytest @@ -32,7 +32,7 @@ def _create_penalty_tensor(batch_size: int, penalty_value: float, def _create_prompt_tokens_tensor( - prompt_token_ids: List[List[int]], + prompt_token_ids: list[list[int]], vocab_size: int, device: torch.device, ) -> torch.Tensor: @@ -49,8 +49,8 @@ def _create_logit_bias( batch_size: int, vocab_size: int, bias_value: float, -) -> List[Optional[Dict[int, float]]]: - res: List[Optional[Dict[int, float]]] = [] +) -> list[Optional[dict[int, float]]]: + res: list[Optional[dict[int, float]]] = [] for i in range(batch_size): logit_bias = {min(i, vocab_size - 1): bias_value} res.append(logit_bias) @@ -83,8 +83,8 @@ def _create_default_sampling_metadata( vocab_size: int, device: torch.device, ) -> SamplingMetadata: - output_token_ids: List[List[int]] = [] - prompt_token_ids: List[List[int]] = [] + output_token_ids: list[list[int]] = [] + prompt_token_ids: list[list[int]] = [] for _ in range(batch_size): output_token_ids.append( np.random.randint(0, vocab_size, size=num_output_tokens).tolist()) @@ -118,8 +118,8 @@ def _create_default_sampling_metadata( def _generate_min_token_penalties_and_stop_tokens( num_output_tokens: int, batch_size: int, vocab_size: int, - batch_indices_for_min_token_penalty: List[int] -) -> Dict[int, Tuple[int, 
Set[int]]]: + batch_indices_for_min_token_penalty: list[int] +) -> dict[int, tuple[int, set[int]]]: """ Generates and returns a dict of minimum token penalties and corresponding stop token IDs (`min_tokens`, `stop_token_ids`) for each @@ -130,7 +130,7 @@ def _generate_min_token_penalties_and_stop_tokens( and a random set of stop token IDs is created. Otherwise, a lower `min_tokens` value is assigned, and the stop token IDs set is empty. """ - min_tokens: Dict[int, Tuple[int, Set[int]]] = {} + min_tokens: dict[int, tuple[int, set[int]]] = {} for index in range(batch_size): if index in batch_indices_for_min_token_penalty: min_tokens[index] = ( @@ -147,7 +147,7 @@ def _generate_min_token_penalties_and_stop_tokens( def _create_weighted_output_token_list( batch_size: int, - vocab_size: int) -> Tuple[List[List[int]], List[List[int]]]: + vocab_size: int) -> tuple[list[list[int]], list[list[int]]]: """ Creates an output token list where each token occurs a distinct number of times. @@ -157,7 +157,7 @@ def _create_weighted_output_token_list( list, each with a different frequency. Returns: - Tuple[List[List[int]], List[List[int]]]: + tuple[list[list[int]], list[list[int]]]: - The first element is the output token list, where each sublist corresponds to a batch and contains tokens with weighted frequencies. @@ -165,8 +165,8 @@ def _create_weighted_output_token_list( batch, ordered by their frequency in the corresponding output list. """ - output_token_ids: List[List[int]] = [] - sorted_token_ids_in_output: List[List[int]] = [] + output_token_ids: list[list[int]] = [] + sorted_token_ids_in_output: list[list[int]] = [] for _ in range(batch_size): distinct_token_ids = np.random.choice(vocab_size, size=np.random.randint(1, 10), diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py index e1465b1239661..c69d0d49c46fa 100644 --- a/tests/v1/sample/utils.py +++ b/tests/v1/sample/utils.py @@ -1,12 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import List, Tuple from vllm import CompletionOutput -def get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: +def get_test_batch(batch_logprobs_composition: str) -> list[tuple]: """Generate logprobs configs for a batch of requests A given request's logprobs configuration is (1) num_sample_logprobs and (2) @@ -32,7 +31,7 @@ def get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: Returns: - List of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) + list of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) tuples """ if batch_logprobs_composition == "NONE": diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py index 9b669ae006608..b68f08385866b 100644 --- a/tests/v1/test_utils.py +++ b/tests/v1/test_utils.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import torch from vllm.v1.utils import bind_kv_cache @@ -22,7 +20,7 @@ def test_bind_kv_cache(): 'layers.2.self_attn': torch.zeros((1, )), 'layers.3.self_attn': torch.zeros((1, )), } - runner_kv_caches: List[torch.Tensor] = [] + runner_kv_caches: list[torch.Tensor] = [] bind_kv_cache(kv_cache, ctx, runner_kv_caches) assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[ 'layers.0.self_attn'] @@ -52,7 +50,7 @@ def test_bind_kv_cache_non_attention(): 'model.layers.28.attn': torch.zeros((1, )), } - runner_kv_caches: List[torch.Tensor] = [] + runner_kv_caches: list[torch.Tensor] = [] bind_kv_cache(kv_cache, ctx, runner_kv_caches) assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[ diff --git 
a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 327370e71fffc..72ec73701159e 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional, Set, Tuple +from typing import Optional import numpy as np import pytest @@ -22,22 +22,22 @@ def _remove_requests( input_batch: InputBatch, batch_size: int, - reqs: List[CachedRequestState]) -> Tuple[Set[str], List[int]]: + reqs: list[CachedRequestState]) -> tuple[set[str], list[int]]: """ - Remove some requests randomly from the batch and returns a Tuple + Remove some requests randomly from the batch and returns a tuple of 1) set of request removed 2) indices of the requests removed ordered in descending order """ num_reqs_to_remove = np.random.randint(0, batch_size) - req_indices_to_remove: Set[int] = set() + req_indices_to_remove: set[int] = set() for _ in range(num_reqs_to_remove): req_index_to_remove = np.random.randint(0, batch_size) req_indices_to_remove.add(req_index_to_remove) req_indices_to_remove_list = list(req_indices_to_remove) req_indices_to_remove_list.sort(reverse=True) - req_ids_to_remove: Set[str] = set() + req_ids_to_remove: set[str] = set() for index in req_indices_to_remove: input_batch.remove_request(reqs[index].req_id) req_ids_to_remove.add(reqs[index].req_id) @@ -45,9 +45,9 @@ def _remove_requests( def _construct_expected_sampling_metadata( - reqs: List[CachedRequestState], - req_ids_retained: Set[int], - req_id_index_in_input_batch: Dict[str, int], + reqs: list[CachedRequestState], + req_ids_retained: set[int], + req_id_index_in_input_batch: dict[str, int], device: torch.device, ) -> SamplingMetadata: """ @@ -55,8 +55,8 @@ def _construct_expected_sampling_metadata( batch. 
""" num_reqs = len(req_ids_retained) - output_token_ids: List[List[int]] = [list() for _ in range(num_reqs)] - prompt_token_ids: List[List[int]] = [list() for _ in range(num_reqs)] + output_token_ids: list[list[int]] = [list() for _ in range(num_reqs)] + prompt_token_ids: list[list[int]] = [list() for _ in range(num_reqs)] presence_penalties = [0.0 for _ in range(num_reqs)] frequency_penalties = [0.0 for _ in range(num_reqs)] repetition_penalties = [1.0 for _ in range(num_reqs)] @@ -191,7 +191,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int): pin_memory=is_pin_memory_available(), vocab_size=1024, ) - reqs: List[CachedRequestState] = [] + reqs: list[CachedRequestState] = [] req_id_reqs = {} req_id_output_token_ids = {} # Add requests diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py index 392fd2705fb27..3b25980cb9463 100644 --- a/tests/vllm_test_utils/vllm_test_utils/blame.py +++ b/tests/vllm_test_utils/vllm_test_utils/blame.py @@ -4,7 +4,8 @@ import dataclasses import sys import traceback -from typing import Callable, Generator +from collections.abc import Generator +from typing import Callable @dataclasses.dataclass diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py index 44d45f2621054..27077f13de24f 100644 --- a/tests/vllm_test_utils/vllm_test_utils/monitor.py +++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py @@ -4,7 +4,8 @@ import dataclasses import sys import traceback -from typing import Callable, Generator, Generic, TypeVar +from collections.abc import Generator +from typing import Callable, Generic, TypeVar _T = TypeVar("_T") diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 0ce0465a704cb..3e237aacc8c60 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import itertools -from typing import List import pytest import torch @@ -43,7 +42,7 @@ def test_empty_seq_group(): enable_chunked_prefill=False, enforce_eager=True, ) - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] model_input = model_runner._prepare_model_input_tensors( seq_group_metadata_list) ( @@ -103,9 +102,9 @@ def test_prepare_prompt(batch_size): enforce_eager=True, ) - seq_lens: List[int] = [] - encoder_seq_lens: List[int] = [] - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] + encoder_seq_lens: list[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] block_tables = {0: [1]} cross_block_table = [2] for i in range(batch_size): @@ -295,9 +294,9 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group): enforce_eager=True, ) - seq_lens: List[int] = [] - encoder_seq_lens: List[int] = [] - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] + encoder_seq_lens: list[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] block_tables = { 0: [1], 1: [3] @@ -503,9 +502,9 @@ def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group): } if multiple_seqs_per_seq_group else { 0: [1] } - seq_lens: List[int] = [] - encoder_seq_lens: List[int] = [] - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] + encoder_seq_lens: list[int] = [] + seq_group_metadata_list: 
list[SequenceGroupMetadata] = [] cross_block_table = [2] expanded_batch_size = 0 diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index eb341fb1b2931..a41fc52170fee 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import List, Tuple, Type import torch @@ -27,15 +26,15 @@ def get_impl_cls(): raise NotImplementedError @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return AttentionMetadata @staticmethod - def get_builder_cls() -> Type["AttentionMetadataBuilder"]: + def get_builder_cls() -> type["AttentionMetadataBuilder"]: return AttentionMetadataBuilder @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -44,7 +43,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: raise NotImplementedError @staticmethod @@ -57,7 +56,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: pass diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 3f9a0d6faa619..b8ba69b0dd8f3 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import torch @@ -42,8 +40,8 @@ def test_prepare_prompt(batch_size): enable_chunked_prefill=False, ) - seq_lens: List[int] = [] - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] block_tables = {0: [1]} for i in range(batch_size): # make sure all tokens fit into one block @@ -159,8 +157,8 @@ def test_prepare_decode_cuda_graph(batch_size): enable_chunked_prefill=False, ) - context_lens: List[int] = [] - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + context_lens: list[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] # Assume each seq group finishes prefill. for i in range(batch_size): # make sure all tokens fit into one block @@ -265,7 +263,7 @@ def test_empty_seq_group(): dtype="float16", enforce_eager=False, ) - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] model_input = model_runner._prepare_model_input_tensors( seq_group_metadata_list) input_tokens, input_positions, attn_metadata = ( @@ -315,10 +313,10 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): ) # Add prefill requests. 
- seq_lens: List[int] = [] - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - prefill_metadata_list: List[SequenceGroupMetadata] = [] - decode_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] + prefill_metadata_list: list[SequenceGroupMetadata] = [] + decode_metadata_list: list[SequenceGroupMetadata] = [] block_tables = {0: [1]} prefill_batch_size = batch_size // 2 decode_batch_size = batch_size - prefill_batch_size diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index adbb7301bfc76..9601b578eb97c 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -2,13 +2,12 @@ import argparse import json -from typing import Dict from vllm.profiler.layerwise_profile import ModelStatsEntry, SummaryStatsEntry from vllm.profiler.utils import TablePrinter, indent_string -def flatten_entries(entry_cls, profile_dict: Dict): +def flatten_entries(entry_cls, profile_dict: dict): entries_and_depth = [] def get_entries(node, curr_depth=0): diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index c527cdbe02259..8ec3dfc97a734 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -6,7 +6,7 @@ import math import os from pathlib import Path -from typing import Any, List, Optional, Tuple +from typing import Any, Optional import matplotlib.pyplot as plt import pandas as pd @@ -24,7 +24,7 @@ def largest_dist_from_leaf(node: dict, depth: int = 0): def get_entries_at_depth(depth: int, - entries_and_traces: List[Tuple[Any, Any]], + entries_and_traces: list[tuple[Any, Any]], node: dict, curr_depth: int = 0, trace=()): @@ -48,9 +48,9 @@ def get_entries_at_depth(depth: int, trace=trace) -def fold_nodes(root: dict, nodes_to_fold: List[str]): +def fold_nodes(root: dict, nodes_to_fold: list[str]): - stack: List[dict] = [root] + stack: list[dict] = [root] while len(stack) != 0: node = stack.pop() if node['entry']['name'] in nodes_to_fold: @@ -427,12 +427,12 @@ def main( plot_metric: str, make_names_unique: bool, top_k: int, - json_nodes_to_fold: List[str]): + json_nodes_to_fold: list[str]): - def prepare_data(profile_json: dict, step_keys: List[str]) -> pd.DataFrame: + def prepare_data(profile_json: dict, step_keys: list[str]) -> pd.DataFrame: def get_entries_and_traces(key: str): - entries_and_traces: List[Tuple[Any, Any]] = [] + entries_and_traces: list[tuple[Any, Any]] = [] for root in profile_json[key]["summary_stats"]: # Fold nodes in the traces as per user request. i.e. simply # make the requested nodes leaf-nodes. 
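Several of the test hunks above (test_jamba_tool_parser.py, test_tracing.py, blame.py, monitor.py) make a second, related change: the ABC aliases Generator and Iterable move from typing to collections.abc, while Callable and Optional stay in typing. A small sketch of that import style, using hypothetical names:

    from collections.abc import Generator, Iterable
    from typing import Callable

    def stream_chunks(chunks: Iterable[str]) -> Generator[str, None, None]:
        # Generator/Iterable are subscripted from collections.abc; their
        # typing aliases are deprecated as of Python 3.9.
        for chunk in chunks:
            yield chunk

    normalize: Callable[[str], str] = str.strip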
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0e83bcaead949..bd7104447401c 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -2,7 +2,7 @@ import contextlib import importlib -from typing import TYPE_CHECKING, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Union import torch import torch.library @@ -198,7 +198,7 @@ def rms_norm_dynamic_per_token_quant( quant_dtype: torch.dtype, scale_ub: Optional[torch.Tensor] = None, residual: Optional[torch.Tensor] = None -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=quant_dtype) scales = torch.empty((input.numel() // input.shape[-1], 1), device=input.device, @@ -347,7 +347,7 @@ def _awq_gemm_fake(input: torch.Tensor, qweight: torch.Tensor, @register_fake("_C::aqlm_gemm") def _aqlm_gemm_fake(input: torch.Tensor, codes: torch.Tensor, codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: List[int], + codebook_partition_sizes: list[int], bias: Optional[torch.Tensor]) -> torch.Tensor: out_features = codes.size(0) * codebooks.size(2) flat_input = input.reshape((-1, input.size(-1))) @@ -363,7 +363,7 @@ def _aqlm_gemm_fake(input: torch.Tensor, codes: torch.Tensor, @register_fake("_C::aqlm_dequant") def _aqlm_dequant_fake( codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: List[int]) -> torch.Tensor: + codebook_partition_sizes: list[int]) -> torch.Tensor: in_features = codes.size(1) * 8 out_features = codes.size(0) return torch.empty((out_features, in_features), @@ -538,7 +538,7 @@ def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool: def cutlass_sparse_compress(a: torch.Tensor) \ - -> Tuple[torch.Tensor, torch.Tensor]: + -> tuple[torch.Tensor, torch.Tensor]: """ Compresses a sparse matrix for use with Cutlass sparse operations. @@ -555,7 +555,7 @@ def cutlass_sparse_compress(a: torch.Tensor) \ - `torch.float16` Returns: - Tuple[torch.Tensor, torch.Tensor]: + tuple[torch.Tensor, torch.Tensor]: A tuple containing: - `a_nzs` (torch.Tensor): A tensor containing non-zero elements of `a`. - `a_meta` (torch.Tensor): A tensor containing metadata for the sparse representation. 
@@ -630,14 +630,14 @@ def cutlass_scaled_sparse_mm( # aqlm def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: List[int], + codebook_partition_sizes: list[int], bias: Optional[torch.Tensor]) -> torch.Tensor: return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales, codebook_partition_sizes, bias) def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: List[int]) -> torch.Tensor: + codebook_partition_sizes: list[int]) -> torch.Tensor: return torch.ops._C.aqlm_dequant(codes, codebooks, codebook_partition_sizes) @@ -722,7 +722,7 @@ def machete_supported_schedules( group_zeros_type: Optional[torch.dtype] = None, channel_scales_type: Optional[torch.dtype] = None, token_scales_type: Optional[torch.dtype] = None, - out_type: Optional[torch.dtype] = None) -> List[str]: + out_type: Optional[torch.dtype] = None) -> list[str]: return torch.ops._C.machete_supported_schedules( a_type, b_type.id, group_scales_type, group_zeros_type, channel_scales_type, token_scales_type, out_type) @@ -767,7 +767,7 @@ def permute_cols(a: torch.Tensor, perm: torch.Tensor) -> torch.Tensor: # fp4 def scaled_fp4_quant( input: torch.Tensor, - input_global_scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + input_global_scale: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: """ Quantize input tensor to FP4 and return quantized tensor and scale. @@ -782,7 +782,7 @@ def scaled_fp4_quant( input_global_scale: A scalar scaling factor for the entire tensor. Returns: - Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4 but every + tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4 but every two values are packed into a uint8 and float8_e4m3 scaling factors in the sizzled layout. """ @@ -829,7 +829,7 @@ def scaled_fp8_quant( num_token_padding: Optional[int] = None, scale_ub: Optional[torch.Tensor] = None, use_per_token_if_dynamic: bool = False, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: """ Quantize input tensor to FP8 and return quantized tensor and scale. @@ -850,12 +850,12 @@ def scaled_fp8_quant( in the dynamic quantization case. Returns: - Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and + tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and scaling factor. """ # This code assumes batch_dim and num_tokens are flattened assert (input.ndim == 2) - shape: Union[Tuple[int, int], torch.Size] = input.shape + shape: Union[tuple[int, int], torch.Size] = input.shape # For rocm, the output fp8 dtype is torch.float_e3m3fnuz out_dtype: torch.dtype = torch.float8_e4m3fnuz \ if current_platform.is_rocm() else torch.float8_e4m3fn @@ -887,7 +887,7 @@ def scaled_int8_quant( scale: Optional[torch.Tensor] = None, azp: Optional[torch.Tensor] = None, symmetric: bool = True -) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """ Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp. @@ -900,7 +900,7 @@ def scaled_int8_quant( symmetric: Whether to use symmetric quantization (scale only, azp ignored). Returns: - Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp. + tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp. 
""" output = torch.empty_like(input, dtype=torch.int8) if scale is not None: @@ -1088,13 +1088,13 @@ def concat_and_cache_mla( scale) -def copy_blocks(key_caches: List[torch.Tensor], - value_caches: List[torch.Tensor], +def copy_blocks(key_caches: list[torch.Tensor], + value_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None: torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) -def copy_blocks_mla(kv_caches: List[torch.Tensor], +def copy_blocks_mla(kv_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None: torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping) @@ -1132,7 +1132,7 @@ def get_max_shared_memory_per_block_device_attribute(device: int) -> int: # custom ar -def init_custom_ar(ipc_tensors: List[torch.Tensor], rank_data: torch.Tensor, +def init_custom_ar(ipc_tensors: list[torch.Tensor], rank_data: torch.Tensor, rank: int, full_nvlink: bool) -> int: return torch.ops._C_custom_ar.init_custom_ar(ipc_tensors, rank_data, rank, full_nvlink) @@ -1152,16 +1152,16 @@ def meta_size() -> int: return torch.ops._C_custom_ar.meta_size() -def register_buffer(fa: int, ipc_tensors: List[int]) -> None: +def register_buffer(fa: int, ipc_tensors: list[int]) -> None: return torch.ops._C_custom_ar.register_buffer(fa, ipc_tensors) -def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]: +def get_graph_buffer_ipc_meta(fa: int) -> tuple[list[int], list[int]]: return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa) -def register_graph_buffers(fa: int, handles: List[List[int]], - offsets: List[List[int]]) -> None: +def register_graph_buffers(fa: int, handles: list[list[int]], + offsets: list[list[int]]) -> None: torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets) @@ -1169,7 +1169,7 @@ def get_flash_mla_metadata( cache_seqlens: torch.Tensor, num_heads_per_head_k: int, num_heads_k: int, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: """ Arguments: cache_seqlens: (batch_size), dtype torch.int32. @@ -1195,7 +1195,7 @@ def flash_mla_with_kvcache( num_splits: torch.Tensor, softmax_scale: Optional[float] = None, causal: bool = False, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: """ Arguments: q: (batch_size, seq_len_q, num_heads_q, head_dim). 
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index ccb67baa53383..a7b909d206347 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple +from typing import Optional import torch @@ -18,7 +18,7 @@ class ipex_ops: @staticmethod def _reshape_activation_tensor( - x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: num = x.size(0) d = x.size(1) // 2 x = x.reshape(num, 2, d) @@ -213,8 +213,8 @@ def reshape_and_cache( key, value, key_cache, value_cache, slot_mapping) @staticmethod - def copy_blocks(key_caches: List[torch.Tensor], - value_caches: List[torch.Tensor], + def copy_blocks(key_caches: list[torch.Tensor], + value_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None: torch.xpu.copy_blocks( # type: ignore key_caches, diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py index 18e0c5227d45c..9cc2b181fc7cc 100644 --- a/vllm/adapter_commons/layers.py +++ b/vllm/adapter_commons/layers.py @@ -1,15 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Tuple @dataclass class AdapterMapping: # Per every token in input_ids: - index_mapping: Tuple[int, ...] + index_mapping: tuple[int, ...] # Per sampled token: - prompt_mapping: Tuple[int, ...] + prompt_mapping: tuple[int, ...] def __post_init__(self): self.index_mapping = tuple(self.index_mapping) diff --git a/vllm/adapter_commons/models.py b/vllm/adapter_commons/models.py index f9a5d2fffad5e..5d2663d56d0a1 100644 --- a/vllm/adapter_commons/models.py +++ b/vllm/adapter_commons/models.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, Optional, TypeVar +from typing import Any, Callable, Optional, TypeVar from torch import nn @@ -49,9 +49,9 @@ def __init__( model: the model to be adapted. """ self.model: nn.Module = model - self._registered_adapters: Dict[int, Any] = {} - # Dict instead of a Set for compatibility with LRUCache. - self._active_adapters: Dict[int, None] = {} + self._registered_adapters: dict[int, Any] = {} + # dict instead of a Set for compatibility with LRUCache. 
+ self._active_adapters: dict[int, None] = {} self.adapter_type = 'Adapter' self._last_mapping = None @@ -97,7 +97,7 @@ def get_adapter(self, adapter_id: int) -> Optional[Any]: raise NotImplementedError @abstractmethod - def list_adapters(self) -> Dict[int, Any]: + def list_adapters(self) -> dict[int, Any]: raise NotImplementedError @abstractmethod diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py index c2dc5433cc656..46e9629e1f55f 100644 --- a/vllm/adapter_commons/utils.py +++ b/vllm/adapter_commons/utils.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, Optional, Set +from typing import Any, Callable, Optional ## model functions -def deactivate_adapter(adapter_id: int, active_adapters: Dict[int, None], +def deactivate_adapter(adapter_id: int, active_adapters: dict[int, None], deactivate_func: Callable) -> bool: if adapter_id in active_adapters: deactivate_func(adapter_id) @@ -13,7 +13,7 @@ def deactivate_adapter(adapter_id: int, active_adapters: Dict[int, None], return False -def add_adapter(adapter: Any, registered_adapters: Dict[int, Any], +def add_adapter(adapter: Any, registered_adapters: dict[int, Any], capacity: int, add_func: Callable) -> bool: if adapter.id not in registered_adapters: if len(registered_adapters) >= capacity: @@ -32,23 +32,23 @@ def set_adapter_mapping(mapping: Any, last_mapping: Any, return last_mapping -def remove_adapter(adapter_id: int, registered_adapters: Dict[int, Any], +def remove_adapter(adapter_id: int, registered_adapters: dict[int, Any], deactivate_func: Callable) -> bool: deactivate_func(adapter_id) return bool(registered_adapters.pop(adapter_id, None)) -def list_adapters(registered_adapters: Dict[int, Any]) -> Dict[int, Any]: +def list_adapters(registered_adapters: dict[int, Any]) -> dict[int, Any]: return dict(registered_adapters) def get_adapter(adapter_id: int, - registered_adapters: Dict[int, Any]) -> Optional[Any]: + registered_adapters: dict[int, Any]) -> Optional[Any]: return registered_adapters.get(adapter_id) ## worker functions -def set_active_adapters_worker(requests: Set[Any], mapping: Optional[Any], +def set_active_adapters_worker(requests: set[Any], mapping: Optional[Any], apply_adapters_func, set_adapter_mapping_func) -> None: apply_adapters_func(requests) @@ -66,7 +66,7 @@ def add_adapter_worker(adapter_request: Any, list_adapters_func, return loaded -def apply_adapters_worker(adapter_requests: Set[Any], list_adapters_func, +def apply_adapters_worker(adapter_requests: set[Any], list_adapters_func, adapter_slots: int, remove_adapter_func, add_adapter_func) -> None: models_that_exist = list_adapters_func() @@ -88,5 +88,5 @@ def apply_adapters_worker(adapter_requests: Set[Any], list_adapters_func, add_adapter_func(models_map[adapter_id]) -def list_adapters_worker(adapter_manager_list_adapters_func) -> Set[int]: +def list_adapters_worker(adapter_manager_list_adapters_func) -> set[int]: return set(adapter_manager_list_adapters_func()) diff --git a/vllm/adapter_commons/worker_manager.py b/vllm/adapter_commons/worker_manager.py index ce24e08a5b56e..3c1d26404c990 100644 --- a/vllm/adapter_commons/worker_manager.py +++ b/vllm/adapter_commons/worker_manager.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Any, Optional, Set +from typing import Any, Optional import torch @@ -17,7 +17,7 @@ def is_enabled(self) -> bool: raise NotImplementedError @abstractmethod - def set_active_adapters(self, requests: 
Set[Any], + def set_active_adapters(self, requests: set[Any], mapping: Optional[Any]) -> None: raise NotImplementedError @@ -34,5 +34,5 @@ def remove_all_adapters(self) -> None: raise NotImplementedError @abstractmethod - def list_adapters(self) -> Set[int]: + def list_adapters(self) -> set[int]: raise NotImplementedError diff --git a/vllm/assets/video.py b/vllm/assets/video.py index 494cfc38381cf..e45e1a65f8905 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from functools import lru_cache -from typing import List, Literal +from typing import Literal import cv2 import numpy as np @@ -58,7 +58,7 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: def video_to_pil_images_list(path: str, - num_frames: int = -1) -> List[Image.Image]: + num_frames: int = -1) -> list[Image.Image]: frames = video_to_ndarrays(path, num_frames) return [ Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) @@ -72,7 +72,7 @@ class VideoAsset: num_frames: int = -1 @property - def pil_images(self) -> List[Image.Image]: + def pil_images(self) -> list[Image.Image]: video_path = download_video_asset(self.name) ret = video_to_pil_images_list(video_path, self.num_frames) return ret diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 5f0a540135402..d610dde0a8e61 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -3,8 +3,7 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, fields -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, - Protocol, Set, Tuple, Type, TypeVar) +from typing import TYPE_CHECKING, Any, Generic, Optional, Protocol, TypeVar import torch @@ -45,17 +44,17 @@ def get_name() -> str: @staticmethod @abstractmethod - def get_impl_cls() -> Type["AttentionImpl"]: + def get_impl_cls() -> type["AttentionImpl"]: raise NotImplementedError @staticmethod @abstractmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: raise NotImplementedError @staticmethod @abstractmethod - def get_state_cls() -> Type["AttentionState"]: + def get_state_cls() -> type["AttentionState"]: raise NotImplementedError @classmethod @@ -64,7 +63,7 @@ def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata": @staticmethod @abstractmethod - def get_builder_cls() -> Type["AttentionMetadataBuilder"]: + def get_builder_cls() -> type["AttentionMetadataBuilder"]: raise NotImplementedError @staticmethod @@ -74,7 +73,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: raise NotImplementedError @staticmethod @@ -89,7 +88,7 @@ def swap_blocks( @staticmethod @abstractmethod def copy_blocks( - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: raise NotImplementedError @@ -122,7 +121,7 @@ class AttentionMetadata: # N.B. These aren't really related to attention and don't belong on this # type -- this is just a temporary solution to make them available to # `model_executable`. - multi_modal_placeholder_index_maps: Optional[Dict[ + multi_modal_placeholder_index_maps: Optional[dict[ str, MultiModalPlaceholderMap.IndexMap]] # Enable/disable KV scales calculation. 
This is so that we can disable the @@ -144,8 +143,8 @@ def decode_metadata(self) -> Optional["AttentionMetadata"]: pass def asdict_zerocopy(self, - skip_fields: Optional[Set[str]] = None - ) -> Dict[str, Any]: + skip_fields: Optional[set[str]] = None + ) -> dict[str, Any]: """Similar to dataclasses.asdict, but avoids deepcopying.""" if skip_fields is None: skip_fields = set() @@ -191,14 +190,14 @@ def graph_capture_get_metadata_for_batch( def get_graph_input_buffers( self, attn_metadata: T, - is_encoder_decoder_model: bool = False) -> Dict[str, Any]: + is_encoder_decoder_model: bool = False) -> dict[str, Any]: """Get attention-specific input buffers for CUDA graph capture.""" ... @abstractmethod def prepare_graph_input_buffers( self, - input_buffers: Dict[str, Any], + input_buffers: dict[str, Any], attn_metadata: T, is_encoder_decoder_model: bool = False) -> None: """In-place modify input buffers dict for CUDA graph replay.""" @@ -224,7 +223,7 @@ def prepare(self) -> None: raise NotImplementedError @abstractmethod - def build(self, seq_lens: List[int], query_lens: List[int], + def build(self, seq_lens: list[int], query_lens: list[int], cuda_graph_pad_size: int, batch_size: int) -> T: """Build attention metadata with on-device tensors.""" raise NotImplementedError @@ -257,10 +256,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: Optional[int] = None, - alibi_slopes: Optional[List[float]] = None, + alibi_slopes: Optional[list[float]] = None, sliding_window: Optional[int] = None, kv_cache_dtype: str = "auto", - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 9765e7881ad9d..5300d158ce1c3 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import torch @@ -58,7 +58,7 @@ class BlocksparseParams: head_sliding_step: int = field(init=False) # range of q heads to for a TP rank - active_head_range: Tuple = field(init=False) + active_head_range: tuple = field(init=False) def __post_init__(self): assert self.block_size > 0 @@ -95,19 +95,19 @@ def get_name() -> str: return "BLOCK_SPARSE_FLASH_ATTN" @staticmethod - def get_impl_cls() -> Type["BlocksparseFlashAttentionImpl"]: + def get_impl_cls() -> type["BlocksparseFlashAttentionImpl"]: return BlocksparseFlashAttentionImpl @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return BlocksparseFlashAttentionMetadata @staticmethod - def get_builder_cls() -> Type["BlocksparseFlashAttentionMetadataBuilder"]: + def get_builder_cls() -> type["BlocksparseFlashAttentionMetadataBuilder"]: return BlocksparseFlashAttentionMetadataBuilder @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -116,7 +116,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return PagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size) @@ -124,14 +124,14 @@ def get_kv_cache_shape( def 
swap_blocks( src_kv_cache: torch.Tensor, dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], + src_to_dst: dict[int, int], ) -> None: PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: Dict[int, List[int]], + kv_caches: list[torch.Tensor], + src_to_dists: dict[int, list[int]], ) -> None: PagedAttention.copy_blocks(kv_caches, src_to_dists) @@ -148,7 +148,7 @@ class BlocksparseFlashAttentionMetadata(AttentionMetadata): """ # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] + seq_lens: Optional[list[int]] # seq_lens stored as a tensor. seq_lens_tensor: Optional[torch.Tensor] @@ -299,10 +299,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 5aca10079f9be..a18c1d190902e 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -3,7 +3,7 @@ from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Optional import torch @@ -37,7 +37,7 @@ class FlashAttentionBackend(AttentionBackend): accept_output_buffer: bool = True @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return [32, 64, 96, 128, 160, 192, 224, 256] @staticmethod @@ -45,19 +45,19 @@ def get_name() -> str: return "FLASH_ATTN" @staticmethod - def get_impl_cls() -> Type["FlashAttentionImpl"]: + def get_impl_cls() -> type["FlashAttentionImpl"]: return FlashAttentionImpl @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return FlashAttentionMetadata @staticmethod - def get_builder_cls() -> Type["FlashAttentionMetadataBuilder"]: + def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]: return FlashAttentionMetadataBuilder @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -66,7 +66,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: if block_size % 16 != 0: raise ValueError("Block size must be a multiple of 16.") return (2, num_blocks, block_size, num_kv_heads, head_size) @@ -86,7 +86,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] @@ -106,7 +106,7 @@ class FlashAttentionMetadata(AttentionMetadata): """ # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] + seq_lens: Optional[list[int]] # seq_lens stored as a tensor. 
seq_lens_tensor: Optional[torch.Tensor] @@ -163,7 +163,7 @@ class FlashAttentionMetadata(AttentionMetadata): # Begin encoder attn & enc/dec cross-attn fields... # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens: Optional[list[int]] = None encoder_seq_lens_tensor: Optional[torch.Tensor] = None # (batch_size + 1,). The cumulative sequence lengths of the sequences in # the batch, used to index into sequence. E.g., if the sequence length is @@ -387,12 +387,12 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.block_size = input_builder.block_size def prepare(self): - self.slot_mapping: List[int] = [] - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.block_tables: List[List[int]] = [] - self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ + self.slot_mapping: list[int] = [] + self.prefill_seq_lens: list[int] = [] + self.context_lens: list[int] = [] + self.block_tables: list[list[int]] = [] + self.curr_seq_lens: list[int] = [] + self.multimodal_placeholder_maps: dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 @@ -462,7 +462,7 @@ def _add_seq_group( def _get_graph_runner_block_tables( self, num_seqs: int, - block_tables: List[List[int]]) -> torch.Tensor: + block_tables: list[list[int]]) -> torch.Tensor: # The shape of graph_block_tables is # [max batch size, max context len // block size]. max_batch_size, max_blocks = self.runner.graph_block_tables.shape @@ -484,7 +484,7 @@ def _get_graph_runner_block_tables( return torch.from_numpy(graph_block_tables).to( device=self.runner.device, non_blocking=True) - def build(self, seq_lens: List[int], query_lens: List[int], + def build(self, seq_lens: list[int], query_lens: list[int], cuda_graph_pad_size: int, batch_size: int): """Build attention metadata with on-device tensors. 
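A rough sketch, under assumed names and sizes, of the block-table padding that `_get_graph_runner_block_tables` performs above: variable-length per-sequence block tables are copied into a fixed-shape `[max_batch_size, max_blocks]` array so CUDA graph capture always sees the same tensor shape. Illustrative only, not the runner's actual buffer handling.

import numpy as np
import torch


def pad_block_tables(block_tables: list[list[int]],
                     max_batch_size: int,
                     max_blocks: int) -> torch.Tensor:
    graph_block_tables = np.zeros((max_batch_size, max_blocks), dtype=np.int32)
    for i, block_table in enumerate(block_tables):
        # Truncate any table longer than the capture shape allows.
        n = min(len(block_table), max_blocks)
        graph_block_tables[i, :n] = block_table[:n]
    return torch.from_numpy(graph_block_tables)


# Two sequences holding 3 and 1 blocks, padded to a fixed [4, 4] table.
print(pad_block_tables([[7, 2, 9], [5]], max_batch_size=4, max_blocks=4))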
@@ -606,10 +606,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 0556c191ddea6..19fe810427a11 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -4,7 +4,7 @@ from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type +from typing import TYPE_CHECKING, Any, Optional from vllm.multimodal import MultiModalPlaceholderMap @@ -53,19 +53,19 @@ def get_name() -> str: return "FLASHINFER" @staticmethod - def get_impl_cls() -> Type["FlashInferImpl"]: + def get_impl_cls() -> type["FlashInferImpl"]: return FlashInferImpl @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return FlashInferMetadata @staticmethod - def get_builder_cls() -> Type["FlashInferMetadataBuilder"]: + def get_builder_cls() -> type["FlashInferMetadataBuilder"]: return FlashInferMetadataBuilder @staticmethod - def get_state_cls() -> Type["FlashInferState"]: + def get_state_cls() -> type["FlashInferState"]: return FlashInferState @staticmethod @@ -74,7 +74,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (num_blocks, 2, block_size, num_kv_heads, head_size) @staticmethod @@ -87,13 +87,13 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: PagedAttention.copy_blocks(kv_caches, src_to_dists) @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return [64, 128, 256] @staticmethod @@ -119,14 +119,14 @@ class PerLayerParameters: def get_per_layer_parameters( - vllm_config: VllmConfig) -> Dict[str, PerLayerParameters]: + vllm_config: VllmConfig) -> dict[str, PerLayerParameters]: """ Scan all attention layers and determine some hyperparameters to use during `plan`. 
""" layers = vllm_config.compilation_config.static_forward_context - per_layer_params: Dict[str, PerLayerParameters] = {} + per_layer_params: dict[str, PerLayerParameters] = {} for key, layer in layers.items(): assert isinstance(layer, Attention) @@ -147,7 +147,7 @@ def get_per_layer_parameters( def infer_global_hyperparameters( - per_layer_params: Dict[str, PerLayerParameters]) -> PerLayerParameters: + per_layer_params: dict[str, PerLayerParameters]) -> PerLayerParameters: """ Currently, FlashInfer backend only support models in which all layers share the same values for the following hyperparameters: @@ -514,8 +514,8 @@ def begin_forward(self): q_data_type=self.q_data_type) def asdict_zerocopy(self, - skip_fields: Optional[Set[str]] = None - ) -> Dict[str, Any]: + skip_fields: Optional[set[str]] = None + ) -> dict[str, Any]: if skip_fields is None: skip_fields = set() # We need to skip the prefill/decode_wrapper field since it cannot be @@ -613,12 +613,12 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.vllm_config = get_current_vllm_config() def prepare(self): - self.slot_mapping: List[int] = [] - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.block_tables: List[List[int]] = [] - self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ + self.slot_mapping: list[int] = [] + self.prefill_seq_lens: list[int] = [] + self.context_lens: list[int] = [] + self.block_tables: list[list[int]] = [] + self.curr_seq_lens: list[int] = [] + self.multimodal_placeholder_maps: dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 @@ -635,12 +635,12 @@ def prepare(self): # [0, 5, 8, 1, 6, 7, 3, 4] # paged_kv_indptr is used to index into paged_kv_indices: # [0, 3, 6, 8] - self.paged_kv_indices: List[int] = [] + self.paged_kv_indices: list[int] = [] # 0 at the beginning of paged_kv_indptr indicates the start of the # first request’s page indices in the paged_kv_indices list. - self.paged_kv_indptr: List[int] = [0] + self.paged_kv_indptr: list[int] = [0] # paged_kv_last_page_len is the length of the last page of each request - self.paged_kv_last_page_len: List[int] = [] + self.paged_kv_last_page_len: list[int] = [] self.total_blocks = 0 self.is_profile_run: bool = False @@ -725,7 +725,7 @@ def _add_seq_group( block_table = block_tables[seq_id] self._update_paged_kv_tensors(block_table, seq_len) - def _update_paged_kv_tensors(self, block_table: List[int], seq_len: int): + def _update_paged_kv_tensors(self, block_table: list[int], seq_len: int): # Get the number of valid blocks based on sequence length. # If seq_len = 16, block_size = 16, # block_table_bound is 1 with 1 valid block. @@ -744,7 +744,7 @@ def _update_paged_kv_tensors(self, block_table: List[int], seq_len: int): last_page_len = self.block_size self.paged_kv_last_page_len.append(last_page_len) - def build(self, seq_lens: List[int], query_lens: List[int], + def build(self, seq_lens: list[int], query_lens: list[int], cuda_graph_pad_size: int, batch_size: int): """Build attention metadata with on-device tensors. 
@@ -901,10 +901,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py index 273c69b63ec63..d119c7993fcdb 100644 --- a/vllm/attention/backends/flashmla.py +++ b/vllm/attention/backends/flashmla.py @@ -2,7 +2,7 @@ from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Optional import torch @@ -27,25 +27,25 @@ def get_name() -> str: return "FLASHMLA" @staticmethod - def get_impl_cls() -> Type["FlashMLAImpl"]: + def get_impl_cls() -> type["FlashMLAImpl"]: return FlashMLAImpl @staticmethod - def get_metadata_cls() -> Type["FlashMLAMetadata"]: + def get_metadata_cls() -> type["FlashMLAMetadata"]: return FlashMLAMetadata @staticmethod - def get_builder_cls() -> Type["FlashMLAMetadataBuilder"]: + def get_builder_cls() -> type["FlashMLAMetadataBuilder"]: return FlashMLAMetadataBuilder @staticmethod - def get_state_cls() -> Type["FlashMLAState"]: + def get_state_cls() -> type["FlashMLAState"]: return FlashMLAState @dataclass class FlashMLAMetadata(MLACommonMetadata): - decode_tile_scheduler_metadata: Optional[Tuple[torch.Tensor, + decode_tile_scheduler_metadata: Optional[tuple[torch.Tensor, torch.Tensor]] = None decode_num_splits: Optional[torch.Tensor] = None @@ -79,7 +79,7 @@ def __init__(self, *args, **kwargs): self.num_q_heads = self.runner.model_config.get_num_attention_heads( self.runner.parallel_config) - def build(self, seq_lens: List[int], query_lens: List[int], + def build(self, seq_lens: list[int], query_lens: list[int], cuda_graph_pad_size: int, batch_size: int): m = super().build(seq_lens, query_lens, cuda_graph_pad_size, batch_size) @@ -176,10 +176,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], + blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, # MLA Specific Arguments diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 9eb533685dbd2..e5048fbef6198 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -6,7 +6,7 @@ import os from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import torch import vllm_hpu_extension.ops as ops @@ -31,15 +31,15 @@ def get_name() -> str: return "HPU_ATTN" @staticmethod - def get_impl_cls() -> Type["HPUAttentionImpl"]: + def get_impl_cls() -> type["HPUAttentionImpl"]: return HPUAttentionImpl @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return HPUAttentionMetadata @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -48,7 +48,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> 
Tuple[int, ...]: + ) -> tuple[int, ...]: return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size) @@ -56,14 +56,14 @@ def get_kv_cache_shape( def swap_blocks( src_kv_cache: torch.Tensor, dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], + src_to_dst: dict[int, int], ) -> None: HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: Dict[int, List[int]], + kv_caches: list[torch.Tensor], + src_to_dists: dict[int, list[int]], ) -> None: HPUPagedAttention.copy_blocks(kv_caches, src_to_dists) @@ -101,10 +101,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, max_seq_len: int = 4096, attn_type: str = AttentionType.DECODER, ) -> None: diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index b4879af4cf20e..b772089ad25d7 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -2,7 +2,7 @@ """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import torch @@ -24,15 +24,15 @@ def get_name() -> str: return "IPEX" @staticmethod - def get_impl_cls() -> Type["IpexAttnBackendImpl"]: + def get_impl_cls() -> type["IpexAttnBackendImpl"]: return IpexAttnBackendImpl @staticmethod - def get_metadata_cls() -> Type["IpexAttnMetadata"]: + def get_metadata_cls() -> type["IpexAttnMetadata"]: return IpexAttnMetadata @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -41,7 +41,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return PagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size) @@ -56,7 +56,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: from vllm._ipex_ops import ipex_ops as ops @@ -73,7 +73,7 @@ class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): # or all decoding. True if all sequences are prompts. is_prompt: bool slot_mapping: torch.Tensor - seq_lens: Optional[List[int]] + seq_lens: Optional[list[int]] seqlen_q: Optional[torch.Tensor] max_seqlen: Optional[int] @@ -83,7 +83,7 @@ def __post_init__(self): # when alibi slopes is used. It is because of the limitation # from xformer API. 
# will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[torch.Tensor]] = None + self.attn_bias: Optional[list[torch.Tensor]] = None @property def prefill_metadata(self) -> Optional["IpexAttnMetadata"]: @@ -112,10 +112,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: @@ -160,7 +160,7 @@ def split_kv_cache( kv_cache: torch.Tensor, num_kv_heads: int, head_size: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: x = 1 num_blocks = kv_cache.shape[1] @@ -341,8 +341,8 @@ def forward( def _make_alibi_bias( alibi_slopes: torch.Tensor, dtype: torch.dtype, - seq_lens: List[int], -) -> List[torch.Tensor]: + seq_lens: list[int], +) -> list[torch.Tensor]: attn_biases = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) @@ -366,10 +366,10 @@ def _make_alibi_bias( def _make_sliding_window_bias( - seq_lens: List[int], + seq_lens: list[int], window_size: Optional[int], dtype: torch.dtype, -) -> List[torch.Tensor]: +) -> list[torch.Tensor]: attn_biases = [] for seq_len in seq_lens: tensor = torch.full( diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 1befcb6b45dfa..2eae3aa865998 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -198,8 +198,7 @@ from contextlib import contextmanager from dataclasses import dataclass from itertools import accumulate -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple, - Type, TypeVar) +from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar import torch from compressed_tensors.quantization import QuantizationStrategy @@ -253,15 +252,15 @@ def get_name() -> str: return "TRITON_MLA" @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return MLACommonMetadata @staticmethod - def get_builder_cls() -> Type["MLACommonMetadataBuilder"]: + def get_builder_cls() -> type["MLACommonMetadataBuilder"]: return MLACommonMetadataBuilder @staticmethod - def get_state_cls() -> Type["MLACommonState"]: + def get_state_cls() -> type["MLACommonState"]: return MLACommonState @staticmethod @@ -270,7 +269,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, # assumed to be 1 for MLA head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (num_blocks, block_size, head_size) @staticmethod @@ -283,13 +282,13 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: ops.copy_blocks_mla(kv_caches, src_to_dists) @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return [576] @@ -475,7 +474,7 @@ class MLACommonMetadata(AttentionMetadata): # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] + seq_lens: Optional[list[int]] # seq_lens stored as a tensor. 
seq_lens_tensor: Optional[torch.Tensor] @@ -528,8 +527,8 @@ class MLACommonMetadata(AttentionMetadata): # For chunked prefill context_chunk_cu_seq_lens: Optional[torch.Tensor] = None context_chunk_starts: Optional[torch.Tensor] = None - context_chunk_seq_tot: Optional[List[int]] = None - context_chunk_max_seq_lens: Optional[List[int]] = None + context_chunk_seq_tot: Optional[list[int]] = None + context_chunk_max_seq_lens: Optional[list[int]] = None # Set by MLAAttentionState in `begin_forward` so it doesn't get broadcasted chunked_prefill_workspace: Optional[torch.Tensor] = None @@ -749,13 +748,13 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.page_size = self.runner.block_size def prepare(self): - self.slot_mapping: List[int] = [] - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.block_tables: List[List[int]] = [] - self.curr_seq_lens: List[int] = [] - self.input_positions: List[int] = [] - self.multimodal_placeholder_maps: Dict[ + self.slot_mapping: list[int] = [] + self.prefill_seq_lens: list[int] = [] + self.context_lens: list[int] = [] + self.block_tables: list[list[int]] = [] + self.curr_seq_lens: list[int] = [] + self.input_positions: list[int] = [] + self.multimodal_placeholder_maps: dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 @@ -820,7 +819,7 @@ def _add_seq_group( def _get_graph_runner_block_tables( self, num_seqs: int, - block_tables: List[List[int]]) -> torch.Tensor: + block_tables: list[list[int]]) -> torch.Tensor: # The shape of graph_block_tables is # [max batch size, max context len // block size]. max_batch_size, max_blocks = self.runner.graph_block_tables.shape @@ -842,7 +841,7 @@ def _get_graph_runner_block_tables( return torch.from_numpy(graph_block_tables).to( device=self.runner.device, non_blocking=True) - def build(self, seq_lens: List[int], query_lens: List[int], + def build(self, seq_lens: list[int], query_lens: list[int], cuda_graph_pad_size: int, batch_size: int): """Build attention metadata with on-device tensors. 
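A brief sketch of the `type[...]` annotation that replaces `typing.Type` in the `get_impl_cls()`/`get_metadata_cls()` accessors above: the lowercase built-in works as a generic meaning "the class object itself, not an instance". The registry and class names below are invented for illustration.

class ExampleBackend:
    name = "EXAMPLE"


_BACKEND_REGISTRY: dict[str, type[ExampleBackend]] = {}


def register_backend(cls: type[ExampleBackend]) -> type[ExampleBackend]:
    _BACKEND_REGISTRY[cls.name] = cls
    return cls


@register_backend
class ExampleFlashBackend(ExampleBackend):
    name = "EXAMPLE_FLASH"


def get_backend_cls(name: str) -> type[ExampleBackend]:
    return _BACKEND_REGISTRY[name]


backend = get_backend_cls("EXAMPLE_FLASH")()  # instantiate from the class object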
@@ -1006,10 +1005,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], + blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, # MLA Specific Arguments @@ -1102,7 +1101,7 @@ def process_weights_after_loading(self, act_dtype: torch.dtype): # # returns input_group_shape, weight_group_shape def get_scale_group_shapes_for_fp8(layer: LinearBase) -> \ - Tuple[Tuple[int, int], Tuple[int, int]]: + tuple[tuple[int, int], tuple[int, int]]: if isinstance(layer.quant_method, Fp8LinearMethod): if layer.quant_method.block_quant: weight_block_size = \ diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py index 9908620a32a23..61fcb16b7c944 100644 --- a/vllm/attention/backends/openvino.py +++ b/vllm/attention/backends/openvino.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Type +from typing import Optional import openvino as ov import torch @@ -54,7 +54,7 @@ def make_metadata(*args, **kwargs) -> "AttentionMetadata": raise NotImplementedError @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -67,22 +67,22 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (2, num_blocks, num_kv_heads, block_size, head_size) @staticmethod def swap_blocks( src_tensor: ov.Tensor, dst_tensor: ov.Tensor, - src_to_dists: List[Tuple[int, int]], + src_to_dists: list[tuple[int, int]], ) -> None: for src, dst in src_to_dists: copy_cache_block(src_tensor, dst_tensor, src, dst) @staticmethod def copy_blocks( - kv_caches: List[Tuple[ov.Tensor, ov.Tensor]], - src_to_dists: List[Tuple[int, int]], + kv_caches: list[tuple[ov.Tensor, ov.Tensor]], + src_to_dists: list[tuple[int, int]], ) -> None: for src, dst in src_to_dists: for key_cache, value_cache in kv_caches: @@ -138,7 +138,7 @@ class OpenVINOAttentionMetadata: # N.B. These aren't really related to attention and don't belong on this # type -- this is just a temporary solution to make them available to # `model_executable`. - multi_modal_placeholder_index_maps: Optional[Dict[ + multi_modal_placeholder_index_maps: Optional[dict[ str, MultiModalPlaceholderMap.IndexMap]] # Enable/disable KV scales calculation. This is so that we can disable the diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index b61dfe63ddcaa..66260fc92a9b1 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import torch import torch_xla.experimental.custom_kernel # Required to register custom ops. 
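For the `get_scale_group_shapes_for_fp8` helper in the MLA hunk above, a hedged sketch of what a nested `tuple[tuple[int, int], tuple[int, int]]` return can encode; the `(-1, -1)` convention for per-tensor scales and the block sizes used here are assumptions for illustration, not the production logic.

def example_scale_group_shapes(
        block_quant: bool,
        weight_block_size: tuple[int, int] = (128, 128),
) -> tuple[tuple[int, int], tuple[int, int]]:
    # returns (input_group_shape, weight_group_shape)
    if block_quant:
        n, k = weight_block_size
        # activations grouped along k only; weights quantized in n-by-k blocks
        return (1, k), (n, k)
    # (-1, -1) here stands for per-tensor scaling on both operands
    return (-1, -1), (-1, -1)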
@@ -19,15 +19,15 @@ def get_name() -> str: return "PALLAS" @staticmethod - def get_impl_cls() -> Type["PallasAttentionBackendImpl"]: + def get_impl_cls() -> type["PallasAttentionBackendImpl"]: return PallasAttentionBackendImpl @staticmethod - def get_metadata_cls() -> Type["PallasMetadata"]: + def get_metadata_cls() -> type["PallasMetadata"]: return PallasMetadata @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -36,7 +36,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (num_kv_heads, num_blocks, block_size, head_size) @staticmethod @@ -50,8 +50,8 @@ def swap_blocks( @torch.compile(backend="openxla") @staticmethod def copy_blocks( - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - src_to_dists: Tuple[torch.Tensor, torch.Tensor], + kv_caches: list[tuple[torch.Tensor, torch.Tensor]], + src_to_dists: tuple[torch.Tensor, torch.Tensor], ) -> None: src_indices, dst_indices = src_to_dists for k_cache, v_cache in kv_caches: @@ -98,10 +98,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: @@ -155,7 +155,7 @@ def forward( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: Tuple[torch.Tensor, torch.Tensor], + kv_cache: tuple[torch.Tensor, torch.Tensor], attn_metadata: PallasMetadata, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index f1def25c89cff..84b725473af98 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -3,7 +3,7 @@ from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Optional import torch @@ -30,19 +30,19 @@ def get_name() -> str: return "NO_ATTENTION" @staticmethod - def get_impl_cls() -> Type["PlaceholderAttentionImpl"]: + def get_impl_cls() -> type["PlaceholderAttentionImpl"]: return PlaceholderAttentionImpl @staticmethod - def get_builder_cls() -> Type["PlaceholderAttentionMetadataBuilder"]: + def get_builder_cls() -> type["PlaceholderAttentionMetadataBuilder"]: return PlaceholderAttentionMetadataBuilder @staticmethod - def get_metadata_cls() -> Type["PlaceholderAttentionMetadata"]: + def get_metadata_cls() -> type["PlaceholderAttentionMetadata"]: return PlaceholderAttentionMetadata @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -51,7 +51,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (1, 1, 1, 1, 1) @staticmethod @@ -64,7 +64,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: return @@ -75,7 +75,7 @@ class PlaceholderAttentionMetadata(AttentionMetadata): """Attention metadata 
for prefill and decode batched together.""" # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] + seq_lens: Optional[list[int]] # seq_lens stored as a tensor. seq_lens_tensor: Optional[torch.Tensor] @@ -269,10 +269,10 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.runner = input_builder.runner def prepare(self): - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ + self.prefill_seq_lens: list[int] = [] + self.context_lens: list[int] = [] + self.curr_seq_lens: list[int] = [] + self.multimodal_placeholder_maps: dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 @@ -309,7 +309,7 @@ def _add_seq_group( self.num_decode_tokens += query_len self.curr_seq_lens.append(curr_seq_len) - def build(self, seq_lens: List[int], query_lens: List[int], + def build(self, seq_lens: list[int], query_lens: list[int], cuda_graph_pad_size: int, batch_size: int): """Build attention metadata with on-device tensors. diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 3f40686ee2fda..2c908451d151d 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer ROCm GPUs.""" from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Optional import torch @@ -35,19 +35,19 @@ def get_name() -> str: return "ROCM_FLASH" @staticmethod - def get_impl_cls() -> Type["ROCmFlashAttentionImpl"]: + def get_impl_cls() -> type["ROCmFlashAttentionImpl"]: return ROCmFlashAttentionImpl @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return ROCmFlashAttentionMetadata @staticmethod - def get_builder_cls() -> Type["ROCmFlashAttentionMetadataBuilder"]: + def get_builder_cls() -> type["ROCmFlashAttentionMetadataBuilder"]: return ROCmFlashAttentionMetadataBuilder @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -56,7 +56,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return PagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size) @@ -70,7 +70,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: PagedAttention.copy_blocks(kv_caches, src_to_dists) @@ -87,7 +87,7 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): """ # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] + seq_lens: Optional[list[int]] # seq_lens stored as a tensor. seq_lens_tensor: Optional[torch.Tensor] # Maximum sequence length among prefill batch. 0 if there are decoding @@ -133,7 +133,7 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): # Begin encoder attn & enc/dec cross-attn fields... 
# Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens: Optional[list[int]] = None encoder_seq_lens_tensor: Optional[torch.Tensor] = None # Maximum sequence length among encoder sequences @@ -301,8 +301,8 @@ class ROCmFlashAttentionMetadataBuilder( def _make_alibi_bias(alibi_slopes: torch.Tensor, dtype: torch.dtype, - seq_lens: Optional[List[int]], - make_attn_mask: bool = True) -> List[torch.Tensor]: + seq_lens: Optional[list[int]], + make_attn_mask: bool = True) -> list[torch.Tensor]: attn_biases = [] if seq_lens: for seq_len in seq_lens: @@ -453,10 +453,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: @@ -846,12 +846,12 @@ def _sdpa_attention( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - seq_lens: List[int], + seq_lens: list[int], num_tokens: int, num_heads: int, head_size: int, scale: float, - attn_masks: Optional[List[torch.Tensor]] = None, + attn_masks: Optional[list[torch.Tensor]] = None, ) -> torch.Tensor: start = 0 output = torch.empty((num_tokens, num_heads, head_size), diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 25fe6ed95c5df..8e249abfa13da 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -2,7 +2,7 @@ """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import torch from torch.nn.functional import scaled_dot_product_attention @@ -29,19 +29,19 @@ def get_name() -> str: return "TORCH_SDPA" @staticmethod - def get_impl_cls() -> Type["TorchSDPABackendImpl"]: + def get_impl_cls() -> type["TorchSDPABackendImpl"]: return TorchSDPABackendImpl @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return TorchSDPAMetadata @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod - def get_builder_cls() -> Type["TorchSDPAMetadataBuilder"]: + def get_builder_cls() -> type["TorchSDPAMetadataBuilder"]: return TorchSDPAMetadataBuilder @staticmethod @@ -50,7 +50,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return PagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size) @@ -64,7 +64,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: PagedAttention.copy_blocks(kv_caches, src_to_dists) @@ -77,7 +77,7 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): # Currently, input sequences can only contain all prompts # or all decoding. True if all sequences are prompts. 
chunked_prefill: bool - seq_lens: Optional[List[int]] = None # For non-chunked prefill + seq_lens: Optional[list[int]] = None # For non-chunked prefill # For chunked prefill only max_query_len: Optional[int] = None @@ -88,7 +88,7 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): # Begin encoder attn & enc/dec cross-attn fields... # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens: Optional[list[int]] = None encoder_seq_lens_tensor: Optional[torch.Tensor] = None # Maximum sequence length among encoder sequences @@ -108,9 +108,9 @@ def __post_init__(self): # when alibi slopes is used. It is because of the limitation # from xformer API. # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[torch.Tensor]] = None - self.encoder_attn_bias: Optional[List[torch.Tensor]] = None - self.cross_attn_bias: Optional[List[torch.Tensor]] = None + self.attn_bias: Optional[list[torch.Tensor]] = None + self.encoder_attn_bias: Optional[list[torch.Tensor]] = None + self.cross_attn_bias: Optional[list[torch.Tensor]] = None @property def is_all_encoder_attn_metadata_set(self): @@ -180,7 +180,7 @@ def get_seq_lens( def get_attn_bias( self, attn_type: str, - ) -> Optional[List[torch.Tensor]]: + ) -> Optional[list[torch.Tensor]]: ''' Extract appropriate attention bias from attention metadata according to attention type. @@ -207,7 +207,7 @@ def get_attn_bias( def set_attn_bias( self, - attn_bias: List[torch.Tensor], + attn_bias: list[torch.Tensor], attn_type: str, ) -> None: ''' @@ -288,7 +288,7 @@ def __init__(self, input_builder: ModelInputForCPUBuilder) -> None: def prepare(self): self.input_data = self.input_builder.input_data - def build(self, seq_lens: List[int], query_lens: List[int], + def build(self, seq_lens: list[int], query_lens: list[int], cuda_graph_pad_size: int, batch_size: int) -> TorchSDPAMetadata: input_data = self.input_data prefill_seq_lens = seq_lens[0:input_data.num_prefills] @@ -394,10 +394,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: @@ -636,9 +636,9 @@ def _run_sdpa_forward( def _make_alibi_bias( alibi_slopes: torch.Tensor, dtype: torch.dtype, - seq_lens: List[int], -) -> List[torch.Tensor]: - attn_biases: List[torch.Tensor] = [] + seq_lens: list[int], +) -> list[torch.Tensor]: + attn_biases: list[torch.Tensor] = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses @@ -660,11 +660,11 @@ def _make_alibi_bias( def _make_sliding_window_bias( - seq_lens: List[int], + seq_lens: list[int], window_size: Optional[int], dtype: torch.dtype, -) -> List[torch.Tensor]: - attn_biases: List[torch.Tensor] = [] +) -> list[torch.Tensor]: + attn_biases: list[torch.Tensor] = [] for seq_len in seq_lens: tensor = torch.full( (1, seq_len, seq_len), diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py index 08e8226ab04c0..048ecbeee5c0d 100644 --- a/vllm/attention/backends/triton_mla.py +++ b/vllm/attention/backends/triton_mla.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional, Type +from typing import Any, Optional import torch @@ 
-18,7 +18,7 @@ def get_name() -> str: return "TRITON_MLA" @staticmethod - def get_impl_cls() -> Type["TritonMLAImpl"]: + def get_impl_cls() -> type["TritonMLAImpl"]: return TritonMLAImpl @@ -30,10 +30,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], + blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, # MLA Specific Arguments diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index baf01c9263d4f..c967f67873f4b 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -3,7 +3,7 @@ from collections import defaultdict from contextlib import contextmanager from itertools import accumulate -from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, TypeVar, Union +from typing import TYPE_CHECKING, Any, TypeVar, Union import numpy as np import torch @@ -37,7 +37,7 @@ from vllm.worker.model_runner import ModelInputForGPUBuilder -def is_block_tables_empty(block_tables: Union[None, Dict]): +def is_block_tables_empty(block_tables: Union[None, dict]): """ Check if block_tables is None or a dictionary with all None values. """ @@ -58,8 +58,8 @@ def compute_slot_mapping_start_idx(is_prompt: bool, query_len: int, return start_idx -def _compute_slot_mapping_python(slot_mapping: List[int], - block_table: List[int], range_start: int, +def _compute_slot_mapping_python(slot_mapping: list[int], + block_table: list[int], range_start: int, range_end: int, block_size: int): for i in range(range_start, range_end): block_number = block_table[i // block_size] @@ -68,8 +68,8 @@ def _compute_slot_mapping_python(slot_mapping: List[int], slot_mapping.append(slot) -def _compute_slot_mapping_numpy(slot_mapping: List[int], - block_table: List[int], range_start: int, +def _compute_slot_mapping_numpy(slot_mapping: list[int], + block_table: list[int], range_start: int, range_end: int, block_size: int): block_table_array = np.array(block_table) idx = np.arange(range_start, range_end) @@ -81,10 +81,10 @@ def _compute_slot_mapping_numpy(slot_mapping: List[int], slot_mapping.extend(seq_slot_mapping_array) -def compute_slot_mapping(is_profile_run: bool, slot_mapping: List[int], +def compute_slot_mapping(is_profile_run: bool, slot_mapping: list[int], seq_id: int, seq_len: int, context_len: int, start_idx: int, block_size: int, - block_tables: Dict[int, List[int]]): + block_tables: dict[int, list[int]]): """ Compute slot mapping. 
""" @@ -125,7 +125,7 @@ def compute_slot_mapping(is_profile_run: bool, slot_mapping: List[int], class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): - _metadata_cls: Type[TAttentionMetadata] + _metadata_cls: type[TAttentionMetadata] def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.input_builder = input_builder @@ -135,12 +135,12 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.block_size = input_builder.block_size def prepare(self): - self.slot_mapping: List[int] = [] - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.block_tables: List[List[int]] = [] - self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ + self.slot_mapping: list[int] = [] + self.prefill_seq_lens: list[int] = [] + self.context_lens: list[int] = [] + self.block_tables: list[list[int]] = [] + self.curr_seq_lens: list[int] = [] + self.multimodal_placeholder_maps: dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 @@ -202,7 +202,7 @@ def _add_seq_group( seq_len, context_len, start_idx, self.block_size, inter_data.block_tables) - def build(self, seq_lens: List[int], query_lens: List[int], + def build(self, seq_lens: list[int], query_lens: list[int], cuda_graph_pad_size: int, batch_size: int): """Build attention metadata with on-device tensors. @@ -357,7 +357,7 @@ def graph_capture_get_metadata_for_batch( def get_graph_input_buffers( self, attn_metadata, - is_encoder_decoder_model: bool = False) -> Dict[str, Any]: + is_encoder_decoder_model: bool = False) -> dict[str, Any]: input_buffers = { "slot_mapping": attn_metadata.slot_mapping, "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, @@ -426,7 +426,7 @@ def _update_captured_metadata_for_enc_dec_model(self, batch_size: int, attn_metadata.num_encoder_tokens = 0 def _add_additonal_input_buffers_for_enc_dec_model( - self, attn_metadata, input_buffers: Dict[str, Any]): + self, attn_metadata, input_buffers: dict[str, Any]): """ Saves additional input buffers specific to the encoder-decoder model from the attention metadata. @@ -445,7 +445,7 @@ def _add_additonal_input_buffers_for_enc_dec_model( attn_metadata.decode_metadata.cross_block_tables) def _prepare_input_buffers_for_enc_dec_model(self, attn_metadata, - input_buffers: Dict[str, + input_buffers: dict[str, Any]): """ Populates input buffers with data from the encoder-decoder model's @@ -543,7 +543,7 @@ def get_seq_len_block_table_args( def get_num_prefill_decode_query_kv_tokens( attn_metadata, attn_type: str, -) -> Tuple[int, int, int]: +) -> tuple[int, int, int]: """ Calculate the number of prefill and decode tokens for query, key/value based on the attention metadata and the specified attention type. @@ -552,7 +552,7 @@ def get_num_prefill_decode_query_kv_tokens( attn_metadata (FlashAttentionMetadata): Attention Metadata object. attn_type (AttentionType): The type of attention being used. Returns: - Tuple[int, int, int]: A tuple containing three integers: + tuple[int, int, int]: A tuple containing three integers: - The number of prefill query tokens. - The number of prefill key/value tokens. - The number of decode query tokens. 
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 9fa76634e1fc9..d60b4a1de5af8 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer with xFormers and PagedAttention.""" from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import torch from xformers import ops as xops @@ -31,19 +31,19 @@ def get_name() -> str: return "XFORMERS" @staticmethod - def get_impl_cls() -> Type["XFormersImpl"]: + def get_impl_cls() -> type["XFormersImpl"]: return XFormersImpl @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return XFormersMetadata @staticmethod - def get_builder_cls() -> Type["XFormersMetadataBuilder"]: + def get_builder_cls() -> type["XFormersMetadataBuilder"]: return XFormersMetadataBuilder @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -52,7 +52,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return PagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size) @@ -60,13 +60,13 @@ def get_kv_cache_shape( def swap_blocks( src_kv_cache: torch.Tensor, dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], + src_to_dst: dict[int, int], ) -> None: PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: PagedAttention.copy_blocks(kv_caches, src_to_dists) @@ -107,7 +107,7 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata): # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] = None + seq_lens: Optional[list[int]] = None # FIXME: It is for flash attn. # (batch_size + 1,). The cumulative sequence lengths of the sequences in @@ -137,7 +137,7 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata): # Begin encoder attn & enc/dec cross-attn fields... # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens: Optional[list[int]] = None encoder_seq_lens_tensor: Optional[torch.Tensor] = None # FIXME: It is for flash attn. # (batch_size + 1,). The cumulative sequence lengths of the sequences in @@ -162,9 +162,9 @@ def __post_init__(self): # when alibi slopes is used. It is because of the limitation # from xformer API. 
# will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[AttentionBias]] = None - self.encoder_attn_bias: Optional[List[AttentionBias]] = None - self.cross_attn_bias: Optional[List[AttentionBias]] = None + self.attn_bias: Optional[list[AttentionBias]] = None + self.encoder_attn_bias: Optional[list[AttentionBias]] = None + self.cross_attn_bias: Optional[list[AttentionBias]] = None @property def is_all_encoder_attn_metadata_set(self): @@ -320,7 +320,7 @@ def _get_attn_bias( def _set_attn_bias( attn_metadata: XFormersMetadata, - attn_bias: List[Optional[AttentionBias]], + attn_bias: list[Optional[AttentionBias]], attn_type: str, ) -> None: ''' @@ -383,10 +383,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: @@ -763,9 +763,9 @@ def _make_alibi_bias( alibi_slopes: torch.Tensor, num_kv_heads: int, dtype: torch.dtype, - seq_lens: List[int], -) -> List[AttentionBias]: - attn_biases: List[AttentionBias] = [] + seq_lens: list[int], +) -> list[AttentionBias]: + attn_biases: list[AttentionBias] = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index c45c83a0707fd..04923eb6b4d0b 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer.""" -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch import torch.nn as nn @@ -36,10 +36,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: Optional[int] = None, - alibi_slopes: Optional[List[float]] = None, + alibi_slopes: Optional[list[float]] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, per_layer_sliding_window: Optional[int] = None, use_mla: bool = False, diff --git a/vllm/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py index 18b69a6b3ddf8..1e7bb07c7838b 100644 --- a/vllm/attention/ops/flashmla.py +++ b/vllm/attention/ops/flashmla.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/flash_mla/flash_mla_interface.py -from typing import Optional, Tuple +from typing import Optional import torch @@ -19,7 +19,7 @@ _flashmla_C_AVAILABLE = False -def is_flashmla_supported() -> Tuple[bool, Optional[str]]: +def is_flashmla_supported() -> tuple[bool, Optional[str]]: """ Return: is_supported_flag, unsupported_reason (optional). """ @@ -39,7 +39,7 @@ def get_mla_metadata( cache_seqlens: torch.Tensor, num_heads_per_head_k: int, num_heads_k: int, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: """ Arguments: cache_seqlens: (batch_size), dtype torch.int32. @@ -66,7 +66,7 @@ def flash_mla_with_kvcache( num_splits: torch.Tensor, softmax_scale: Optional[float] = None, causal: bool = False, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: """ Arguments: q: (batch_size, seq_len_q, num_heads_q, head_dim). 
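The backend getters above return class objects rather than instances, so `Type["X"]` becomes the builtin `type["X"]`; the FlashMLA helpers likewise switch to the builtin `tuple[...]` in their return annotations. A rough illustrative sketch, with made-up backend classes that are not part of vLLM:

from typing import Optional


class ToyBackend:
    name = "base"


class EagerToyBackend(ToyBackend):
    name = "eager"


class GraphToyBackend(ToyBackend):
    name = "graph"


def get_backend_cls(use_graphs: bool) -> type[ToyBackend]:  # was: Type[ToyBackend]
    return GraphToyBackend if use_graphs else EagerToyBackend


def is_supported() -> tuple[bool, Optional[str]]:  # was: Tuple[bool, Optional[str]]
    cls = get_backend_cls(use_graphs=True)
    return cls is GraphToyBackend, None


print(is_supported())  # (True, None)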
@@ -106,10 +106,10 @@ def flash_mla_with_kvcache( # TODO: Add fake functions # # @register_fake("_flashmla_C::get_mla_metadata") -# def _get_mla_metadata_fake(....) -> Tuple[torch.Tensor, torch.Tensor]: +# def _get_mla_metadata_fake(....) -> tuple[torch.Tensor, torch.Tensor]: # return .... # # @register_fake("_flashmla_C::fwd_kvcache_mla") -# def _fwd_kvcache_mla_fake(....) -> Tuple[torch.Tensor, torch.Tensor]: +# def _fwd_kvcache_mla_fake(....) -> tuple[torch.Tensor, torch.Tensor]: # return .... # diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py index 49ea420d092cc..994a4556c115c 100644 --- a/vllm/attention/ops/hpu_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -5,7 +5,7 @@ ############################################################################### from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Optional import torch from vllm_hpu_extension import cache_ops, ops @@ -29,7 +29,7 @@ class HPUPagedAttentionMetadata: class HPUPagedAttention: @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return [64, 80, 96, 112, 128, 256] @staticmethod @@ -38,7 +38,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (num_blocks, block_size, num_kv_heads, head_size) @staticmethod @@ -46,7 +46,7 @@ def split_kv_cache( kv_cache: torch.Tensor, num_kv_heads: int, head_size: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: key_cache = kv_cache[0] value_cache = kv_cache[1] return key_cache, value_cache @@ -86,7 +86,7 @@ def forward_prefix( def swap_blocks( src_kv_cache: torch.Tensor, dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], + src_to_dst: dict[int, int], ) -> None: src_key_cache = src_kv_cache[0] dst_key_cache = dst_kv_cache[0] @@ -98,8 +98,8 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: Dict[int, List[int]], + kv_caches: list[torch.Tensor], + src_to_dists: dict[int, list[int]], ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] value_caches = [kv_cache[1] for kv_cache in kv_caches] diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 598ceea130d97..9e1274b4f1d50 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional, Tuple +from typing import Optional try: import intel_extension_for_pytorch.llm.modules as ipex_modules @@ -16,7 +16,7 @@ class _PagedAttention: @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return [32, 64, 80, 96, 112, 128, 256] @staticmethod @@ -26,7 +26,7 @@ def get_kv_cache_shape( num_kv_heads: int, head_size: int, *args, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (2, num_blocks, block_size * num_kv_heads * head_size) @staticmethod @@ -35,7 +35,7 @@ def split_kv_cache( num_kv_heads: int, head_size: int, *args, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: x = 16 // kv_cache.element_size() num_blocks = kv_cache.shape[1] @@ -117,8 +117,8 @@ def forward_decode( @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: Dict[int, List[int]], + kv_caches: list[torch.Tensor], + src_to_dists: dict[int, list[int]], *args, ) -> None: key_caches = 
[kv_cache[0] for kv_cache in kv_caches] @@ -134,7 +134,7 @@ def split_kv_cache( num_kv_heads: int, head_size: int, *args, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: num_blocks = kv_cache.shape[1] key_cache = kv_cache[0] diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index fd703413db908..2ec534d5b154f 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import List, Optional, Tuple +from typing import Optional import torch @@ -35,7 +35,7 @@ class PagedAttentionMetadata: class PagedAttention: @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return [32, 64, 80, 96, 112, 120, 128, 192, 256] @staticmethod @@ -44,7 +44,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (2, num_blocks, block_size * num_kv_heads * head_size) @staticmethod @@ -52,7 +52,7 @@ def split_kv_cache( kv_cache: torch.Tensor, num_kv_heads: int, head_size: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: x = 16 // kv_cache.element_size() num_blocks = kv_cache.shape[1] @@ -245,7 +245,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 26c6ac812a125..419d3d1327598 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import os +from collections.abc import Generator from contextlib import contextmanager from functools import cache -from typing import Generator, Optional, Type +from typing import Optional import torch @@ -86,7 +87,7 @@ def get_attn_backend( is_attention_free: bool, is_blocksparse: bool = False, use_mla: bool = False, -) -> Type[AttentionBackend]: +) -> type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" # Accessing envs.* behind an @lru_cache decorator can cause the wrong # value to be returned from the cache if the value changes between calls. @@ -114,7 +115,7 @@ def _cached_get_attn_backend( is_blocksparse: bool = False, use_v1: bool = False, use_mla: bool = False, -) -> Type[AttentionBackend]: +) -> type[AttentionBackend]: if is_blocksparse: logger.info("Using BlocksparseFlashAttention backend.") from vllm.attention.backends.blocksparse_attn import ( diff --git a/vllm/beam_search.py b/vllm/beam_search.py index 97b2b630fc3e5..5d4ebdb7acbcf 100644 --- a/vllm/beam_search.py +++ b/vllm/beam_search.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union from vllm.sequence import Logprob @@ -17,14 +17,14 @@ class BeamSearchSequence: about to be returned to the user. """ # The tokens includes the prompt. 
- tokens: List[int] - logprobs: List[Dict[int, Logprob]] + tokens: list[int] + logprobs: list[dict[int, Logprob]] cum_logprob: float = 0.0 text: Optional[str] = None finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None multi_modal_data: Optional["MultiModalDataDict"] = None - mm_processor_kwargs: Optional[Dict[str, Any]] = None + mm_processor_kwargs: Optional[dict[str, Any]] = None @dataclass @@ -33,20 +33,20 @@ class BeamSearchOutput: It contains the list of the best beam search sequences. The length of the list is equal to the beam width. """ - sequences: List[BeamSearchSequence] + sequences: list[BeamSearchSequence] class BeamSearchInstance: - def __init__(self, prompt_tokens: List[int]): - self.beams: List[BeamSearchSequence] = [ + def __init__(self, prompt_tokens: list[int]): + self.beams: list[BeamSearchSequence] = [ BeamSearchSequence(tokens=prompt_tokens, logprobs=[]) ] - self.completed: List[BeamSearchSequence] = [] + self.completed: list[BeamSearchSequence] = [] def get_beam_search_score( - tokens: List[int], + tokens: list[int], cumulative_logprob: float, eos_token_id: int, length_penalty: float = 1.0, diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index b972f03c9685b..edc0ffb31a233 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -5,8 +5,9 @@ import os import pprint import time +from collections.abc import Sequence from contextlib import ExitStack -from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple +from typing import Any, Callable, Optional from unittest.mock import patch import torch @@ -42,7 +43,7 @@ class CompilerManager: """ def __init__(self, use_inductor: bool): - self.cache: Dict[Tuple[Optional[int], int, str], Any] = dict() + self.cache: dict[tuple[Optional[int], int, str], Any] = dict() cls = InductorAdaptor if use_inductor else EagerAdaptor self.compiler = cls() @@ -75,7 +76,7 @@ def save_to_file(self): def load(self, graph: fx.GraphModule, - example_inputs: List[Any], + example_inputs: list[Any], graph_index: int, runtime_shape: Optional[int] = None) -> Optional[Callable]: if (runtime_shape, graph_index, self.compiler.name) not in self.cache: @@ -159,7 +160,7 @@ class SplitItem: def split_graph(graph: fx.GraphModule, - ops: List[str]) -> Tuple[fx.GraphModule, List[SplitItem]]: + ops: list[str]) -> tuple[fx.GraphModule, list[SplitItem]]: # split graph by ops subgraph_id = 0 node_to_subgraph_id = {} @@ -225,7 +226,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): """ def __init__(self, module: torch.fx.GraphModule, - compile_submod_names: List[str], vllm_config: VllmConfig, + compile_submod_names: list[str], vllm_config: VllmConfig, graph_pool, vllm_backend: "VllmBackend"): super().__init__(module) from torch._guards import detect_fake_mode @@ -245,8 +246,8 @@ def run(self, *args): return super().run(*fake_args) def call_module(self, target: torch.fx.node.Target, - args: Tuple[torch.fx.node.Argument, - ...], kwargs: Dict[str, Any]) -> Any: + args: tuple[torch.fx.node.Argument, + ...], kwargs: dict[str, Any]) -> Any: assert isinstance(target, str) output = super().call_module(target, args, kwargs) @@ -297,12 +298,12 @@ class VllmBackend: graph: fx.GraphModule # the stiching graph module for all the piecewise graphs split_gm: fx.GraphModule - piecewise_graphs: List[SplitItem] + piecewise_graphs: list[SplitItem] returned_callable: Callable # Inductor passes to run on the graph pre-defunctionalization post_grad_passes: Sequence[Callable] - 
sym_tensor_indices: List[int] - input_buffers: List[torch.Tensor] + sym_tensor_indices: list[int] + input_buffers: list[torch.Tensor] compiler_manager: CompilerManager def __init__( @@ -523,14 +524,14 @@ class ConcreteSizeEntry: # for cudagraph debugging, track the input addresses # during capture, and check if they are the same during replay - input_addresses: Optional[List[int]] = None + input_addresses: Optional[list[int]] = None class PiecewiseBackend: def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, graph_pool: Any, piecewise_compile_index: int, - total_piecewise_compiles: int, sym_shape_indices: List[int], + total_piecewise_compiles: int, sym_shape_indices: list[int], compiled_graph_for_general_shape: Callable, vllm_backend: VllmBackend): """ @@ -558,9 +559,9 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, self.is_last_graph = ( piecewise_compile_index == total_piecewise_compiles - 1) - self.compile_sizes: Set[int] = set( + self.compile_sizes: set[int] = set( self.compilation_config.compile_sizes) - self.cudagraph_capture_sizes: Set[int] = set( + self.cudagraph_capture_sizes: set[int] = set( self.compilation_config.cudagraph_capture_sizes ) if self.compilation_config.use_cudagraph else set() @@ -574,11 +575,11 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, # the entries for different shapes that we need to either # compile or capture cudagraph - self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {} + self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {} # to_be_compiled_sizes tracks the remaining sizes to compile, # and updates during the compilation process, so we need to copy it - self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy() + self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy() for shape in self.compile_sizes.union(self.cudagraph_capture_sizes): self.concrete_size_entries[shape] = ConcreteSizeEntry( runtime_shape=shape, diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index ac0544ad64037..d9bdac365cae5 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -3,7 +3,7 @@ import hashlib import os from contextlib import ExitStack -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Optional from unittest.mock import patch import torch @@ -43,10 +43,10 @@ def compute_hash(self, vllm_config: VllmConfig) -> str: def compile( self, graph: fx.GraphModule, - example_inputs: List[Any], - compiler_config: Dict[str, Any], + example_inputs: list[Any], + compiler_config: dict[str, Any], runtime_shape: Optional[int] = None - ) -> Tuple[Optional[Callable], Optional[Any]]: + ) -> tuple[Optional[Callable], Optional[Any]]: """ Compile the graph with the given example inputs and compiler config, with a runtime shape. 
If the `runtime_shape` is None, it means @@ -72,7 +72,7 @@ def compile( def load(self, handle: Any, graph: fx.GraphModule, - example_inputs: List[Any], + example_inputs: list[Any], graph_index: int, runtime_shape: Optional[int] = None) -> Callable: """ @@ -110,7 +110,7 @@ class AlwaysHitShapeEnv: """ def __init__(self) -> None: - self.guards: List[Any] = [] + self.guards: list[Any] = [] def evaluate_guards_expression(self, *args, **kwargs): return True @@ -129,7 +129,7 @@ class InductorAdaptor(CompilerInterface): name = "inductor" def compute_hash(self, vllm_config: VllmConfig) -> str: - factors: List[Any] = [] + factors: list[Any] = [] # summarize system state from torch._inductor.codecache import CacheBase system_factors = CacheBase.get_system() @@ -159,10 +159,10 @@ def initialize_cache(self, cache_dir: str, disable_cache: bool = False): def compile( self, graph: fx.GraphModule, - example_inputs: List[Any], - compiler_config: Dict[str, Any], + example_inputs: list[Any], + compiler_config: dict[str, Any], runtime_shape: Optional[int] = None - ) -> Tuple[Optional[Callable], Optional[Any]]: + ) -> tuple[Optional[Callable], Optional[Any]]: from torch._inductor import config current_config = config.get_config_copy() from torch._inductor.compile_fx import compile_fx @@ -273,7 +273,7 @@ def _get_shape_env() -> AlwaysHitShapeEnv: def load(self, handle: Any, graph: fx.GraphModule, - example_inputs: List[Any], + example_inputs: list[Any], graph_index: int, runtime_shape: Optional[int] = None) -> Callable: assert isinstance(handle, tuple) @@ -331,10 +331,10 @@ class EagerAdaptor(CompilerInterface): def compile( self, graph: fx.GraphModule, - example_inputs: List[Any], - compiler_config: Dict[str, Any], + example_inputs: list[Any], + compiler_config: dict[str, Any], runtime_shape: Optional[int] = None - ) -> Tuple[Optional[Callable], Optional[Any]]: + ) -> tuple[Optional[Callable], Optional[Any]]: # we don't need to compile the graph, just return the graph itself. # It does not support caching, return None for the handle. return graph, None diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 20afe6967df39..f02994c55527d 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import inspect -from typing import Callable, Dict, List, Optional, TypeVar, Union, overload +from typing import Callable, Optional, TypeVar, Union, overload from unittest.mock import patch import torch @@ -25,7 +25,7 @@ @overload def support_torch_compile( *, - dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]], + dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]], ) -> Callable[[_T], _T]: ... @@ -38,7 +38,7 @@ def support_torch_compile(cls: _T) -> _T: def support_torch_compile( cls: Optional[_T] = None, *, - dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]] = None, + dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]] = None, ) -> Union[Callable[[_T], _T], _T]: """ A decorator to add support for compiling the forward method of a class. @@ -131,7 +131,7 @@ def cls_decorator_helper(cls: _T) -> _T: def _support_torch_compile( cls: _T, - dynamic_arg_dims: Dict[str, Union[int, List[int]]], + dynamic_arg_dims: dict[str, Union[int, list[int]]], ) -> _T: """ A decorator to add support for compiling the forward method of a class. 
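Note that `Optional` and `Union` stay in the `typing` imports throughout this patch; only the container aliases are replaced, since the `X | Y` union syntax would require Python 3.10. A small sketch of the mixed style used for signatures like `dynamic_arg_dims` above (the names below are hypothetical, not from the codebase):

from typing import Optional, Union

# was: Dict[str, Union[int, List[int]]]
DynamicDims = dict[str, Union[int, list[int]]]


def describe_dims(dims: Optional[DynamicDims] = None) -> str:
    if not dims:
        return "no dynamic dimensions"
    return ", ".join(f"{name}={dim}" for name, dim in dims.items())


print(describe_dims({"input_ids": 0, "positions": [0, 1]}))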
diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index 9b0e9c5d04081..b7443cf11a331 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import operator -from typing import Dict, Iterable, List, Optional, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch._higher_order_ops.auto_functionalize import auto_functionalized @@ -27,7 +28,7 @@ def __call__(self, graph: torch.fx.Graph): self.begin() self.dump_graph(graph, "before_fix_functionalization") - self.nodes_to_remove: List[torch.fx.Node] = [] + self.nodes_to_remove: list[torch.fx.Node] = [] count = 0 for node in graph.nodes: if not is_func(node, auto_functionalized): @@ -110,8 +111,8 @@ def _remove(self, node_or_nodes: Union[torch.fx.Node, def defunctionalize(self, graph: torch.fx.Graph, node: torch.fx.Node, - mutated_args: Dict[int, Union[torch.fx.Node, str]], - args: Optional[Tuple[Union[torch.fx.Node, str], + mutated_args: dict[int, Union[torch.fx.Node, str]], + args: Optional[tuple[Union[torch.fx.Node, str], ...]] = None): """ De-functionalize a node by replacing it with a call to the original. @@ -123,7 +124,7 @@ def defunctionalize(self, self._remove(node) def replace_users_with_mutated_args(self, node: torch.fx.Node, - mutated_args: Dict[int, + mutated_args: dict[int, Union[torch.fx.Node, str]]): """ @@ -139,7 +140,7 @@ def replace_users_with_mutated_args(self, node: torch.fx.Node, user.replace_all_uses_with(arg) self._remove(user) - def getitem_users(self, node: torch.fx.Node) -> Dict[int, torch.fx.Node]: + def getitem_users(self, node: torch.fx.Node) -> dict[int, torch.fx.Node]: """ Returns the operator.getitem users of the auto-functionalized node, indexed by the index they are getting. @@ -154,7 +155,7 @@ def getitem_users(self, node: torch.fx.Node) -> Dict[int, torch.fx.Node]: def insert_defunctionalized(self, graph: torch.fx.Graph, node: torch.fx.Node, - args: Optional[Tuple[Union[torch.fx.Node, str], + args: Optional[tuple[Union[torch.fx.Node, str], ...]] = None): """ Insert a new defunctionalized node into the graph before node. 
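`fix_functionalization.py` above also shows the second half of the migration: abstract collection types such as `Iterable`, `Mapping`, and `Sequence` are imported from `collections.abc`, where their `typing` counterparts have been deprecated aliases since Python 3.9. A minimal sketch of that import style, with hypothetical helpers:

from collections.abc import Iterable, Mapping, Sequence


def total_tokens(batches: Iterable[Sequence[int]]) -> int:
    """Sum the lengths of every batch; was typing.Iterable/typing.Sequence."""
    return sum(len(batch) for batch in batches)


def first_header(headers: Mapping[str, str]) -> str:
    # Iterating a Mapping yields its keys.
    return next(iter(headers), "")


print(total_tokens([[1, 2, 3], [4, 5]]))             # 5
print(first_header({"Accept": "application/json"}))  # Accept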
diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index 0c3d8697b2375..3f77fb61dfe83 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Dict, List, NamedTuple, Optional, Tuple +from typing import Callable, NamedTuple, Optional import torch import torch._inductor.pattern_matcher as pm @@ -57,7 +57,7 @@ def __str__(self): kFp8DynamicTensorSym = QuantKey(FP8_DTYPE, False, True, True) kFp8DynamicTokenSym = QuantKey(FP8_DTYPE, False, False, True) -QUANT_OPS: Dict[QuantKey, OpOverload] = { +QUANT_OPS: dict[QuantKey, OpOverload] = { kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa @@ -80,7 +80,7 @@ def __str__(self): f"{'' if self.fused_add else 'out'} residual)") -FUSED_OPS: Dict[FusedRMSQuantKey, OpOverload] = { +FUSED_OPS: dict[FusedRMSQuantKey, OpOverload] = { FusedRMSQuantKey(kFp8StaticTensorSym, False): torch.ops._C.rms_norm_static_fp8_quant.default, # noqa FusedRMSQuantKey(kFp8StaticTensorSym, True): @@ -101,7 +101,7 @@ def __init__(self, match: pm.Match, quant_op, fused_op): self.QUANT_OP = quant_op # in-place quant op self.FUSED_OP = fused_op # in-place fused quant op - def insert_fused_node(self, fused_return_mapping: Dict[int, Tuple[fx.Node, + def insert_fused_node(self, fused_return_mapping: dict[int, tuple[fx.Node, int]], **kwargs): """ @@ -548,7 +548,7 @@ def __init__(self, config: CompilationConfig.PassConfig): "FusionPass singleton instance already exists" super().__init__(config) - self.matches: List[MultiOutputMatch] = [] + self.matches: list[MultiOutputMatch] = [] self.patterns: PatternMatcherPass = PatternMatcherPass( pass_name="fusion_pass") diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py index b9a8d3112e775..e3ca7f24fed55 100644 --- a/vllm/compilation/fx_utils.py +++ b/vllm/compilation/fx_utils.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import operator -from typing import Iterable, Optional +from collections.abc import Iterable +from typing import Optional from torch import fx from torch._higher_order_ops.auto_functionalize import auto_functionalized diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py index e6f6a60b25950..c26f148252863 100644 --- a/vllm/compilation/multi_output_match.py +++ b/vllm/compilation/multi_output_match.py @@ -3,7 +3,7 @@ import abc import operator from abc import abstractmethod -from typing import Iterable, List, Tuple +from collections.abc import Iterable from torch import fx from torch._higher_order_ops.auto_functionalize import auto_functionalized @@ -56,7 +56,7 @@ def process(self): raise NotImplementedError @property - def nodes(self) -> List[fx.Node]: + def nodes(self) -> list[fx.Node]: return self.match.nodes @property @@ -87,13 +87,13 @@ def inserting_after_match(self): return self.graph.inserting_after(last_node_in_match) def insert_getitems(self, tuple_node: fx.Node, - indices: Iterable[int]) -> Tuple[fx.Node, ...]: + indices: Iterable[int]) -> tuple[fx.Node, ...]: """ Insert operator.getitem nodes to extract elements from a tuple node. :param tuple_node: The tuple node to extract elements from. :param indices: The indices of the elements to extract. - :return: Tuple of the new getitem nodes, corresponding to the indices. + :return: tuple of the new getitem nodes, corresponding to the indices. 
""" with self.graph.inserting_after(tuple_node): return tuple( diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 52f8c3b1ec15a..7c967b59035d3 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List +from typing import Any import torch from torch import fx as fx @@ -43,7 +43,7 @@ class PostGradPassManager(Parent): """ def __init__(self): - self.passes: List[InductorPass] = [] + self.passes: list[InductorPass] = [] def __call__(self, graph: fx.Graph): for pass_ in self.passes: @@ -69,7 +69,7 @@ def add(self, pass_: InductorPass): def uuid(self): return self.__getstate__() - def __getstate__(self) -> Dict[str, List[Any]]: + def __getstate__(self) -> dict[str, list[Any]]: """ Custom pickling for the pass manager, as some passes cannot be pickled. Pickling occurs because the pass manager is set as the value of diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index a8a283ddd8c0c..1a8211f0ab7c6 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -5,7 +5,7 @@ from abc import abstractmethod from contextlib import contextmanager from types import CodeType -from typing import Callable, List, Optional +from typing import Callable, Optional import torch @@ -48,7 +48,7 @@ def __init__(self, self.compiled_callable = compiled_callable self.original_code_object = self.__class__.forward.__code__ - self.compiled_codes: List[CodeType] = [] + self.compiled_codes: list[CodeType] = [] torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook) # read the env var to determine whether to use the custom dispatcher diff --git a/vllm/config.py b/vllm/config.py index d1384c6375f30..a40f45bcbbb75 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -7,13 +7,14 @@ import json import sys import warnings +from collections import Counter +from collections.abc import Mapping from contextlib import contextmanager from dataclasses import dataclass, field, replace from importlib.util import find_spec from pathlib import Path -from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict, - Final, List, Literal, Mapping, Optional, Protocol, Set, - Tuple, Type, Union) +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Final, Literal, + Optional, Protocol, Union) import torch from pydantic import BaseModel, Field, PrivateAttr @@ -67,20 +68,20 @@ RunnerType = Literal["generate", "pooling", "draft", "transcription"] -_RUNNER_TASKS: Dict[RunnerType, List[_ResolvedTask]] = { +_RUNNER_TASKS: dict[RunnerType, list[_ResolvedTask]] = { "generate": ["generate"], "pooling": ["embed", "classify", "score", "reward"], "draft": ["draft"], "transcription": ["transcription"], } -_TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { +_TASK_RUNNER: dict[_ResolvedTask, RunnerType] = { task: runner for runner, tasks in _RUNNER_TASKS.items() for task in tasks } -HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], +HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]] @@ -92,7 +93,7 @@ def compute_hash(self) -> str: class SupportsMetricsInfo(Protocol): - def metrics_info(self) -> Dict[str, str]: + def metrics_info(self) -> dict[str, str]: ... @@ -209,7 +210,7 @@ def compute_hash(self) -> str: excluding anything before input ids/embeddings and after the final hidden states. 
""" - factors: List[Any] = [] + factors: list[Any] = [] factors.append(self.model) factors.append(self.dtype) factors.append(self.quantization) @@ -233,7 +234,7 @@ def __init__( allowed_local_media_path: str = "", revision: Optional[str] = None, code_revision: Optional[str] = None, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, rope_theta: Optional[float] = None, tokenizer_revision: Optional[str] = None, max_model_len: Optional[int] = None, @@ -244,19 +245,19 @@ def __init__( max_logprobs: int = 20, disable_sliding_window: bool = False, skip_tokenizer_init: bool = False, - served_model_name: Optional[Union[str, List[str]]] = None, + served_model_name: Optional[Union[str, list[str]]] = None, limit_mm_per_prompt: Optional[Mapping[str, int]] = None, use_async_output_proc: bool = True, config_format: ConfigFormat = ConfigFormat.AUTO, hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, disable_mm_preprocessor_cache: bool = False, - override_neuron_config: Optional[Dict[str, Any]] = None, + override_neuron_config: Optional[dict[str, Any]] = None, override_pooler_config: Optional["PoolerConfig"] = None, logits_processor_pattern: Optional[str] = None, generation_config: Optional[str] = None, enable_sleep_mode: bool = False, - override_generation_config: Optional[Dict[str, Any]] = None, + override_generation_config: Optional[dict[str, Any]] = None, model_impl: Union[str, ModelImpl] = ModelImpl.AUTO, ) -> None: self.model = model @@ -283,7 +284,7 @@ def __init__( hf_overrides_fn = None if rope_scaling is not None: - hf_override: Dict[str, Any] = {"rope_scaling": rope_scaling} + hf_override: dict[str, Any] = {"rope_scaling": rope_scaling} hf_overrides_kw.update(hf_override) msg = ("`--rope-scaling` will be removed in a future release. 
" f"'Please instead use `--hf-overrides '{hf_override!r}'`") @@ -501,8 +502,8 @@ def _verify_tokenizer_mode(self) -> None: def _get_preferred_task( self, - architectures: List[str], - supported_tasks: Set[_ResolvedTask], + architectures: list[str], + supported_tasks: set[_ResolvedTask], ) -> Optional[_ResolvedTask]: model_id = self.model if get_pooling_config(model_id, self.revision): @@ -512,7 +513,7 @@ def _get_preferred_task( if ModelRegistry.is_transcription_model(architectures): return "transcription" - suffix_to_preferred_task: List[Tuple[str, _ResolvedTask]] = [ + suffix_to_preferred_task: list[tuple[str, _ResolvedTask]] = [ # Other models follow this pattern ("ForCausalLM", "generate"), ("ForConditionalGeneration", "generate"), @@ -534,13 +535,13 @@ def _resolve_task( self, task_option: Union[TaskOption, Literal["draft"]], hf_config: PretrainedConfig, - ) -> Tuple[Set[_ResolvedTask], _ResolvedTask]: + ) -> tuple[set[_ResolvedTask], _ResolvedTask]: if task_option == "draft": return {"draft"}, "draft" architectures = getattr(hf_config, "architectures", []) - runner_support: Dict[RunnerType, bool] = { + runner_support: dict[RunnerType, bool] = { # NOTE: Listed from highest to lowest priority, # in case the model supports multiple of them "transcription": @@ -548,13 +549,13 @@ def _resolve_task( "generate": ModelRegistry.is_text_generation_model(architectures), "pooling": ModelRegistry.is_pooling_model(architectures), } - supported_runner_types_lst: List[RunnerType] = [ + supported_runner_types_lst: list[RunnerType] = [ runner_type for runner_type, is_supported in runner_support.items() if is_supported ] - supported_tasks_lst: List[_ResolvedTask] = [ + supported_tasks_lst: list[_ResolvedTask] = [ task for runner_type in supported_runner_types_lst for task in _RUNNER_TASKS[runner_type] ] @@ -765,7 +766,7 @@ def verify_with_parallel_config( self.use_async_output_proc = False def get_hf_config_sliding_window( - self) -> Union[Optional[int], List[Optional[int]]]: + self) -> Union[Optional[int], list[Optional[int]]]: """Get the sliding window size, or None if disabled.""" # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in @@ -776,7 +777,7 @@ def get_hf_config_sliding_window( return None return getattr(self.hf_text_config, "sliding_window", None) - def get_sliding_window(self) -> Optional[Union[int, List[Optional[int]]]]: + def get_sliding_window(self) -> Optional[Union[int, list[Optional[int]]]]: """Get the sliding window size, or None if disabled. """ # If user disables sliding window, return None. 
@@ -886,7 +887,7 @@ def get_num_attention_heads(self, return num_heads // parallel_config.tensor_parallel_size def get_layers_start_end_indices( - self, parallel_config: "ParallelConfig") -> Tuple[int, int]: + self, parallel_config: "ParallelConfig") -> tuple[int, int]: from vllm.distributed.utils import get_pp_indices if self.hf_text_config.model_type == "deepseek_mtp": total_num_hidden_layers = getattr(self.hf_text_config, @@ -947,7 +948,7 @@ def get_multimodal_config(self) -> "MultiModalConfig": return self.multimodal_config - def try_get_generation_config(self) -> Dict[str, Any]: + def try_get_generation_config(self) -> dict[str, Any]: if self.generation_config is None or self.generation_config == "auto": config = try_get_generation_config( self.hf_config_path or self.model, @@ -965,7 +966,7 @@ def try_get_generation_config(self) -> Dict[str, Any]: return config.to_diff_dict() - def get_diff_sampling_param(self) -> Dict[str, Any]: + def get_diff_sampling_param(self) -> dict[str, Any]: """ This method returns a dictionary containing the parameters that differ from the default sampling parameters, but only @@ -973,7 +974,7 @@ def get_diff_sampling_param(self) -> Dict[str, Any]: set, an empty dictionary is returned. Returns: - Dict[str, Any]: A dictionary with the differing sampling + dict[str, Any]: A dictionary with the differing sampling parameters if `generation_config` is set, otherwise an empty dictionary. """ @@ -1031,7 +1032,7 @@ def use_mla(self) -> bool: return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE @property - def supported_runner_types(self) -> Set[RunnerType]: + def supported_runner_types(self) -> set[RunnerType]: return {_TASK_RUNNER[task] for task in self.supported_tasks} @property @@ -1069,7 +1070,7 @@ def compute_hash(self) -> str: excluding anything before input ids/embeddings and after the final hidden states. """ - factors: List[Any] = [] + factors: list[Any] = [] factors.append(self.cache_dtype) # `cpu_offload_gb` does not use `torch.compile` yet. hash_str = hashlib.md5(str(factors).encode()).hexdigest() @@ -1177,7 +1178,7 @@ class TokenizerPoolConfig: pool type. """ pool_size: int - pool_type: Union[str, Type["BaseTokenizerGroup"]] + pool_type: Union[str, type["BaseTokenizerGroup"]] extra_config: dict def compute_hash(self) -> str: @@ -1194,7 +1195,7 @@ def compute_hash(self) -> str: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -1208,7 +1209,7 @@ def __post_init__(self): @classmethod def create_config( cls, tokenizer_pool_size: int, - tokenizer_pool_type: Union[str, Type["BaseTokenizerGroup"]], + tokenizer_pool_type: Union[str, type["BaseTokenizerGroup"]], tokenizer_pool_extra_config: Optional[Union[str, dict]] ) -> Optional["TokenizerPoolConfig"]: """Create a TokenizerPoolConfig from the given parameters. @@ -1279,7 +1280,7 @@ class LoadConfig: download_dir: Optional[str] = None model_loader_extra_config: Optional[Union[str, dict]] = field( default_factory=dict) - ignore_patterns: Optional[Union[List[str], str]] = None + ignore_patterns: Optional[Union[list[str], str]] = None def compute_hash(self) -> str: """ @@ -1295,7 +1296,7 @@ def compute_hash(self) -> str: """ # no factors to consider. # this config will not affect the computation graph. 
- factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -1353,7 +1354,7 @@ class ParallelConfig: # to "ray" if Ray is installed and fail otherwise. Note that tpu # and hpu only support Ray for distributed inference. distributed_executor_backend: Optional[Union[str, - Type["ExecutorBase"]]] = None + type["ExecutorBase"]]] = None # the full name of the worker class to use. If "auto", the worker class # will be determined based on the platform. @@ -1417,7 +1418,7 @@ def compute_hash(self): excluding anything before input ids/embeddings and after the final hidden states. """ - factors: List[Any] = [] + factors: list[Any] = [] factors.append(self.pipeline_parallel_size) factors.append(self.tensor_parallel_size) return hashlib.sha256(str(factors).encode()).hexdigest() @@ -1594,7 +1595,7 @@ class SchedulerConfig: # scheduler class or path. "vllm.core.scheduler.Scheduler" (default) # or "mod.custom_class". - scheduler_cls: Union[str, Type[object]] = "vllm.core.scheduler.Scheduler" + scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler" def compute_hash(self) -> str: """ @@ -1610,7 +1611,7 @@ def compute_hash(self) -> str: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -1746,7 +1747,7 @@ def compute_hash(self) -> str: # no factors to consider. # the device/platform information will be summarized # by torch/vllm automatically. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -1792,7 +1793,7 @@ def compute_hash(self) -> str: """ # no factors to consider. # spec decode does not use `torch.compile` yet. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2256,7 +2257,7 @@ class LoRAConfig: lora_extra_vocab_size: int = 256 # This is a constant. lora_vocab_padding_size: ClassVar[int] = 256 - long_lora_scaling_factors: Optional[Tuple[float]] = None + long_lora_scaling_factors: Optional[tuple[float]] = None bias_enabled: bool = False def compute_hash(self) -> str: @@ -2273,7 +2274,7 @@ def compute_hash(self) -> str: """ # no factors to consider. # LoRA is not compatible with `torch.compile` . - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2345,7 +2346,7 @@ def compute_hash(self) -> str: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2390,7 +2391,7 @@ def compute_hash(self) -> str: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2426,7 +2427,7 @@ class PoolerConfig: are returned. """ - returned_token_ids: Optional[List[int]] = None + returned_token_ids: Optional[list[int]] = None """ A list of indices for the vocabulary dimensions to be extracted, such as the token IDs of ``good_token`` and ``bad_token`` in the @@ -2447,7 +2448,7 @@ def compute_hash(self) -> str: """ # no factors to consider. # this config will not affect the computation graph. 
- factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2464,7 +2465,7 @@ def from_json(json_str: str) -> "PoolerConfig": "bfloat16": torch.bfloat16, } -_ROCM_NOT_SUPPORTED_DTYPE: List[str] = [] # +_ROCM_NOT_SUPPORTED_DTYPE: list[str] = [] # def _get_and_verify_dtype( @@ -2553,7 +2554,7 @@ def _get_and_verify_max_len( hf_config: PretrainedConfig, max_model_len: Optional[int], disable_sliding_window: bool, - sliding_window_len: Optional[Union[int, List[Optional[int]]]], + sliding_window_len: Optional[Union[int, list[Optional[int]]]], spec_target_max_model_len: Optional[int] = None, encoder_config: Optional[Any] = None, ) -> int: @@ -2679,7 +2680,7 @@ def _get_and_verify_max_len( def get_min_sliding_window( - sliding_window: Union[int, List[Optional[int]]]) -> int: + sliding_window: Union[int, list[Optional[int]]]) -> int: if isinstance(sliding_window, list): return min(s for s in sliding_window if s is not None) @@ -2687,7 +2688,7 @@ def get_min_sliding_window( def get_served_model_name(model: str, - served_model_name: Optional[Union[str, List[str]]]): + served_model_name: Optional[Union[str, list[str]]]): """ If the input is a non-empty list, the first model_name in `served_model_name` is taken. @@ -2724,7 +2725,7 @@ def compute_hash(self) -> str: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2767,7 +2768,7 @@ def compute_hash(self) -> str: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2826,7 +2827,7 @@ def compute_hash(self) -> str: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2923,7 +2924,7 @@ class CompilationConfig(BaseModel): torch.compile will handle cudagraph capture logic in the future. - cudagraph_capture_sizes: sizes to capture cudagraph. - None (default): capture sizes are inferred from vllm config. - - List[int]: capture sizes are specified as given. + - list[int]: capture sizes are specified as given. - cudagraph_num_of_warmups: number of warmup runs for cudagraph. It means the first several runs will be treated as warmup runs. 
Only after that, the execution will be recorded, and the recorded @@ -2965,17 +2966,17 @@ class CompilationConfig(BaseModel): debug_dump_path: str = "" cache_dir: str = "" backend: str = "" - custom_ops: List[str] = Field(default_factory=list) - splitting_ops: List[str] = Field(default=None) # type: ignore + custom_ops: list[str] = Field(default_factory=list) + splitting_ops: list[str] = Field(default=None) # type: ignore use_inductor: bool = True - compile_sizes: Optional[List[Union[int, str]]] = Field(default=None) - inductor_compile_config: Dict = Field(default_factory=dict) - inductor_passes: Dict[str, str] = Field(default_factory=dict) + compile_sizes: Optional[list[Union[int, str]]] = Field(default=None) + inductor_compile_config: dict = Field(default_factory=dict) + inductor_passes: dict[str, str] = Field(default_factory=dict) use_cudagraph: bool = False cudagraph_num_of_warmups: int = 0 - cudagraph_capture_sizes: Optional[List[int]] = None + cudagraph_capture_sizes: Optional[list[int]] = None cudagraph_copy_inputs: bool = False class PassConfig(BaseModel): @@ -2991,7 +2992,7 @@ class PassConfig(BaseModel): - enable_reshape: whether to enable the custom reshape elimination pass. TODO better pass enabling system. """ - dump_graph_stages: List[str] = Field(default_factory=list) + dump_graph_stages: list[str] = Field(default_factory=list) dump_graph_dir: Path = Field(default=Path(".")) enable_fusion: bool = True enable_reshape: bool = True @@ -3020,20 +3021,20 @@ def model_post_init(self, __context: Any) -> None: max_capture_size: int = PrivateAttr local_cache_dir: str = PrivateAttr # local cache dir for each rank # optimization: - # Intuitively, bs_to_padded_graph_size should be Dict[int, int]. + # Intuitively, bs_to_padded_graph_size should be dict[int, int]. # since we know all keys are in a range [0, max_capture_size], - # we can optimize it to List[int] for better lookup performance. - bs_to_padded_graph_size: List[int] = PrivateAttr + # we can optimize it to list[int] for better lookup performance. + bs_to_padded_graph_size: list[int] = PrivateAttr # keep track of enabled and disabled custom ops enabled_custom_ops: Counter[str] = PrivateAttr disabled_custom_ops: Counter[str] = PrivateAttr - traced_files: Set[str] = PrivateAttr + traced_files: set[str] = PrivateAttr compilation_time: float = PrivateAttr # Per-model forward context # Map from layer name to the attention cls - static_forward_context: Dict[str, Any] = PrivateAttr + static_forward_context: dict[str, Any] = PrivateAttr def compute_hash(self) -> str: """ @@ -3047,7 +3048,7 @@ def compute_hash(self) -> str: excluding anything before input ids/embeddings and after the final hidden states. """ - factors: List[Any] = [] + factors: list[Any] = [] factors.append(self.level) factors.append(self.backend) factors.append(self.custom_ops) @@ -3144,7 +3145,7 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: return VllmBackend(vllm_config) def init_with_cudagraph_sizes(self, - cudagraph_capture_sizes: List[int]) -> None: + cudagraph_capture_sizes: list[int]) -> None: """To complete the initialization of config, we need to know the cudagraph sizes.""" @@ -3237,10 +3238,10 @@ def compute_hash(self) -> str: excluding anything before input ids/embeddings and after the final hidden states. 
""" - factors: List[Any] = [] + factors: list[Any] = [] # summarize vllm config - vllm_factors: List[Any] = [] + vllm_factors: list[Any] = [] from vllm import __version__ vllm_factors.append(__version__) if self.model_config: diff --git a/vllm/connections.py b/vllm/connections.py index dc060bb6f88a7..2c259bb7c3e64 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Mapping, MutableMapping from pathlib import Path -from typing import Mapping, MutableMapping, Optional +from typing import Optional from urllib.parse import urlparse import aiohttp diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index d4d31c58dc8d4..4e7f6338d3a42 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math -from typing import List, Optional +from typing import Optional from vllm.core.block.common import BlockList from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator @@ -21,7 +21,7 @@ class BlockTable: single block. block_allocator (DeviceAwareBlockAllocator): The block allocator used to manage memory for the blocks. - _blocks (Optional[List[Block]], optional): An optional list of existing + _blocks (Optional[list[Block]], optional): An optional list of existing blocks to initialize the BlockTable with. If not provided, an empty BlockTable is created. max_block_sliding_window (Optional[int], optional): The number of @@ -34,7 +34,7 @@ class BlockTable: single block. _allocator (DeviceAwareBlockAllocator): The block allocator used to manage memory for the blocks. - _blocks (Optional[List[Block]]): The list of blocks managed by this + _blocks (Optional[list[Block]]): The list of blocks managed by this BlockTable. _num_full_slots (int): The number of tokens currently stored in the blocks. @@ -44,7 +44,7 @@ def __init__( self, block_size: int, block_allocator: DeviceAwareBlockAllocator, - _blocks: Optional[List[Block]] = None, + _blocks: Optional[list[Block]] = None, max_block_sliding_window: Optional[int] = None, ): self._block_size = block_size @@ -57,7 +57,7 @@ def __init__( self._num_full_slots = self._get_num_token_ids() @staticmethod - def get_num_required_blocks(token_ids: List[int], + def get_num_required_blocks(token_ids: list[int], block_size: int, num_lookahead_slots: int = 0) -> int: """Calculates the minimum number of blocks required to store a given @@ -68,7 +68,7 @@ def get_num_required_blocks(token_ids: List[int], allocation (e.g. ignoring prefix caching). Args: - token_ids (List[int]): The sequence of token IDs to be stored. + token_ids (list[int]): The sequence of token IDs to be stored. block_size (int): The maximum number of tokens that can be stored in a single block. num_lookahead_slots (int): look-ahead slots that the sequence may @@ -81,7 +81,7 @@ def get_num_required_blocks(token_ids: List[int], return cdiv(len(token_ids) + num_lookahead_slots, block_size) def allocate(self, - token_ids: List[int], + token_ids: list[int], device: Device = Device.GPU, extra_hash: Optional[int] = None) -> None: """Allocates memory blocks for storing the given sequence of token IDs. @@ -90,7 +90,7 @@ def allocate(self, sequence of token IDs. Args: - token_ids (List[int]): The sequence of token IDs to be stored. + token_ids (list[int]): The sequence of token IDs to be stored. device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. 
extra_hash (Optional[int]): The hash value of additional @@ -106,14 +106,14 @@ def allocate(self, self.update(blocks) self._num_full_slots = len(token_ids) - def update(self, blocks: List[Block]) -> None: + def update(self, blocks: list[Block]) -> None: """Resets the table to the newly provided blocks (with their corresponding block ids) """ self._blocks.update(blocks) def append_token_ids(self, - token_ids: List[int], + token_ids: list[int], num_lookahead_slots: int = 0, num_computed_slots: Optional[int] = None, extra_hash: Optional[int] = None) -> None: @@ -130,7 +130,7 @@ def append_token_ids(self, separate block. Args: - token_ids (List[int]): The sequence of token IDs to be appended. + token_ids (list[int]): The sequence of token IDs to be appended. num_computed_slots (Optional[int]): The number of KV cache slots that are already filled (computed). When sliding window is enabled, this is used to compute how many @@ -244,7 +244,7 @@ def free(self) -> None: self._blocks.reset() @property - def physical_block_ids(self) -> List[int]: + def physical_block_ids(self) -> list[int]: """Returns a list of physical block indices for the blocks in the BlockTable. @@ -254,23 +254,23 @@ def physical_block_ids(self) -> List[int]: occupied by the block. Returns: - List[int]: A list of physical block indices for the blocks in the + list[int]: A list of physical block indices for the blocks in the BlockTable. """ return self._blocks.ids() - def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: + def get_unseen_token_ids(self, sequence_token_ids: list[int]) -> list[int]: """Get the number of "unseen" tokens in the sequence. Unseen tokens are tokens in the sequence corresponding to this block table, but are not yet appended to this block table. Args: - sequence_token_ids (List[int]): The list of token ids in the + sequence_token_ids (list[int]): The list of token ids in the sequence. Returns: - List[int]: The postfix of sequence_token_ids that has not yet been + list[int]: The postfix of sequence_token_ids that has not yet been appended to the block table. """ @@ -281,10 +281,10 @@ def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: def _allocate_blocks_for_token_ids( self, prev_block: Optional[Block], - token_ids: List[int], + token_ids: list[int], device: Device, - extra_hash: Optional[int] = None) -> List[Block]: - blocks: List[Block] = [] + extra_hash: Optional[int] = None) -> list[Block]: + blocks: list[Block] = [] block_token_ids = [] tail_token_ids = [] @@ -315,9 +315,9 @@ def _allocate_blocks_for_token_ids( return blocks - def _get_all_token_ids(self) -> List[int]: + def _get_all_token_ids(self) -> list[int]: # NOTE: This function is O(seq_len); use sparingly. - token_ids: List[int] = [] + token_ids: list[int] = [] if not self._is_allocated: return token_ids @@ -339,7 +339,7 @@ def _is_allocated(self) -> bool: return len(self._blocks) > 0 @property - def blocks(self) -> List[Block]: + def blocks(self) -> list[Block]: return self._blocks.list() @property @@ -358,7 +358,7 @@ def num_full_slots(self) -> int: return self._num_full_slots def get_num_blocks_touched_by_append_slots( - self, token_ids: List[int], num_lookahead_slots: int) -> int: + self, token_ids: list[int], num_lookahead_slots: int) -> int: """Determine how many blocks will be "touched" by appending the token ids. 
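The `BlockTable` docstrings and signatures above now consistently use `list[int]` / `list[list[int]]`. As a sketch of the block-sized chunking idea described there (this helper is illustrative only, not the code from `block_table.py`):

def chunk_token_ids(token_ids: list[int], block_size: int) -> list[list[int]]:
    """Split token ids into block-sized chunks; was List[int] -> List[List[int]]."""
    return [
        token_ids[start:start + block_size]
        for start in range(0, len(token_ids), block_size)
    ]


print(chunk_token_ids([1, 2, 3, 4, 5], block_size=2))  # [[1, 2], [3, 4], [5]]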
@@ -378,7 +378,7 @@ def get_num_blocks_touched_by_append_slots( return num_token_blocks def _chunk_token_blocks_for_append( - self, token_ids: List[int]) -> List[List[int]]: + self, token_ids: list[int]) -> list[list[int]]: """Split the token ids into block-sized chunks so they can be easily appended to blocks. The first such "token block" may have less token ids than the block size, since the last allocated block may be partially diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index 1966eac1cf9e0..9e444ac1f7dd1 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 from collections import deque +from collections.abc import Iterable from dataclasses import dataclass -from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple +from typing import Optional, Protocol from vllm.core.block.interfaces import Block, BlockAllocator @@ -36,7 +37,7 @@ class RefCounter(RefCounterProtocol): def __init__(self, all_block_indices: Iterable[BlockId]): deduped = set(all_block_indices) - self._refcounts: Dict[BlockId, RefCount] = { + self._refcounts: dict[BlockId, RefCount] = { index: 0 for index in deduped } @@ -108,7 +109,7 @@ class CopyOnWriteTracker: """ def __init__(self, refcounter: RefCounterProtocol): - self._copy_on_writes: List[Tuple[BlockId, BlockId]] = [] + self._copy_on_writes: list[tuple[BlockId, BlockId]] = [] self._refcounter = refcounter def is_appendable(self, block: Block) -> bool: @@ -135,7 +136,7 @@ def record_cow(self, src_block_id: Optional[BlockId], assert trg_block_id is not None self._copy_on_writes.append((src_block_id, trg_block_id)) - def clear_cows(self) -> List[Tuple[BlockId, BlockId]]: + def clear_cows(self) -> list[tuple[BlockId, BlockId]]: """Clears the copy-on-write tracking information and returns the current state. @@ -144,7 +145,7 @@ def clear_cows(self) -> List[Tuple[BlockId, BlockId]]: It then clears the internal tracking information. Returns: - List[Tuple[BlockId, BlockId]]: A list mapping source + list[tuple[BlockId, BlockId]]: A list mapping source block indices to destination block indices for the current copy-on-write operations. 
""" @@ -172,7 +173,7 @@ def __init__(self, block_size: int, create_block: Block.Factory, self._pool_size = pool_size assert self._pool_size >= 0 - self._free_ids: Deque[int] = deque(range(self._pool_size)) + self._free_ids: deque[int] = deque(range(self._pool_size)) self._pool = [] for i in range(self._pool_size): self._pool.append( @@ -203,7 +204,7 @@ def increase_pool(self): def init_block(self, prev_block: Optional[Block], - token_ids: List[int], + token_ids: list[int], block_size: int, physical_block_id: Optional[int], extra_hash: Optional[int] = None) -> Block: @@ -235,9 +236,9 @@ class BlockList: list on every iteration of the block manager """ - def __init__(self, blocks: List[Block]): - self._blocks: List[Block] = [] - self._block_ids: List[int] = [] + def __init__(self, blocks: list[Block]): + self._blocks: list[Block] = [] + self._block_ids: list[int] = [] self.update(blocks) @@ -250,7 +251,7 @@ def _update_block_id(self, block_index: int, assert new_block_id is not None self._block_ids[block_index] = new_block_id - def update(self, blocks: List[Block]): + def update(self, blocks: list[Block]): self._blocks = blocks # Cache block ids for fast query @@ -258,7 +259,7 @@ def update(self, blocks: List[Block]): for block in self._blocks: self._add_block_id(block.block_id) - def append_token_ids(self, block_index: int, token_ids: List[int]) -> None: + def append_token_ids(self, block_index: int, token_ids: list[int]) -> None: block = self._blocks[block_index] prev_block_id = block.block_id @@ -286,10 +287,10 @@ def reset(self): self._blocks = [] self._block_ids = [] - def list(self) -> List[Block]: + def list(self) -> list[Block]: return self._blocks - def ids(self) -> List[int]: + def ids(self) -> list[int]: return self._block_ids @@ -345,7 +346,7 @@ def get_hit_rate(self): return (completed_block_hit + incompleted_block_hit) / total_blocks -def get_all_blocks_recursively(last_block: Block) -> List[Block]: +def get_all_blocks_recursively(last_block: Block) -> list[Block]: """Retrieves all the blocks in a sequence starting from the last block. This function recursively traverses the sequence of blocks in reverse order, @@ -356,15 +357,15 @@ def get_all_blocks_recursively(last_block: Block) -> List[Block]: last_block (Block): The last block in the sequence. Returns: - List[Block]: A list of all the blocks in the sequence, in the order they + list[Block]: A list of all the blocks in the sequence, in the order they appear. 
""" - def recurse(block: Block, lst: List[Block]) -> None: + def recurse(block: Block, lst: list[Block]) -> None: if block.prev_block is not None: recurse(block.prev_block, lst) lst.append(block) - all_blocks: List[Block] = [] + all_blocks: list[Block] = [] recurse(last_block, all_blocks) return all_blocks diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 359b5b263f689..d777b6ab1d160 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, FrozenSet, List, Optional, Tuple +from typing import Optional from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, DeviceAwareBlockAllocator) @@ -109,10 +109,10 @@ def __init__(self, cpu_block_allocator: BlockAllocator, Device.GPU: gpu_block_allocator, } - self._swap_mapping: Dict[int, int] = {} + self._swap_mapping: dict[int, int] = {} self._null_block: Optional[Block] = None - self._block_ids_to_allocator: Dict[int, BlockAllocator] = {} + self._block_ids_to_allocator: dict[int, BlockAllocator] = {} for _, allocator in self._allocators.items(): for block_id in allocator.all_block_ids: self._block_ids_to_allocator[block_id] = allocator @@ -146,16 +146,16 @@ def allocate_mutable_block(self, def allocate_immutable_blocks( self, prev_block: Optional[Block], - block_token_ids: List[List[int]], + block_token_ids: list[list[int]], device: Device, - extra_hash: Optional[int] = None) -> List[Block]: + extra_hash: Optional[int] = None) -> list[Block]: """Allocates a new group of immutable blocks with the provided block token IDs on the specified device. Args: prev_block (Optional[Block]): The previous block in the sequence. Used for prefix hashing. - block_token_ids (List[int]): The list of block token IDs to be + block_token_ids (list[int]): The list of block token IDs to be stored in the new blocks. device (Device): The device on which to allocate the new block. extra_hash (Optional[int]): The hash value of additional @@ -163,7 +163,7 @@ def allocate_immutable_blocks( in the prefix caching block. Returns: - List[Block]: The newly allocated list of immutable blocks + list[Block]: The newly allocated list of immutable blocks containing the provided block token IDs. """ return self._allocators[device].allocate_immutable_blocks( @@ -171,7 +171,7 @@ def allocate_immutable_blocks( def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: List[int], + token_ids: list[int], device: Device, extra_hash: Optional[int] = None) -> Block: """Allocates a new immutable block with the provided token IDs on the @@ -180,7 +180,7 @@ def allocate_immutable_block(self, Args: prev_block (Optional[Block]): The previous block in the sequence. Used for prefix hashing. - token_ids (List[int]): The list of token IDs to be stored in the new + token_ids (list[int]): The list of token IDs to be stored in the new block. device (Device): The device on which to allocate the new block. extra_hash (Optional[int]): The hash value of additional @@ -208,7 +208,7 @@ def free(self, block: Block) -> None: allocator = self._block_ids_to_allocator[block_id] allocator.free(block) - def fork(self, last_block: Block) -> List[Block]: + def fork(self, last_block: Block) -> list[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. 
@@ -216,7 +216,7 @@ def fork(self, last_block: Block) -> List[Block]: last_block (Block): The last block in the original sequence. Returns: - List[Block]: A new list of blocks that shares the same memory as the + list[Block]: A new list of blocks that shares the same memory as the original sequence. """ # do not attempt to fork the null block @@ -255,20 +255,20 @@ def get_physical_block_id(self, device: Device, absolute_id: int) -> int: """ return self._allocators[device].get_physical_block_id(absolute_id) - def swap(self, blocks: List[Block], src_device: Device, - dst_device: Device) -> Dict[int, int]: + def swap(self, blocks: list[Block], src_device: Device, + dst_device: Device) -> dict[int, int]: """Execute the swap for the given blocks from source_device on to dest_device, save the current swap mapping and append them to the accumulated `self._swap_mapping` for each scheduling move. Args: - blocks: List of blocks to be swapped. + blocks: list of blocks to be swapped. src_device (Device): Device to swap the 'blocks' from. dst_device (Device): Device to swap the 'blocks' to. Returns: - Dict[int, int]: Swap mapping from source_device + dict[int, int]: Swap mapping from source_device on to dest_device. """ src_block_ids = [block.block_id for block in blocks] @@ -276,20 +276,20 @@ def swap(self, blocks: List[Block], src_device: Device, self._allocators[dst_device].swap_in(blocks) dst_block_ids = [block.block_id for block in blocks] - current_swap_mapping: Dict[int, int] = {} + current_swap_mapping: dict[int, int] = {} for src_block_id, dst_block_id in zip(src_block_ids, dst_block_ids): if src_block_id is not None and dst_block_id is not None: self._swap_mapping[src_block_id] = dst_block_id current_swap_mapping[src_block_id] = dst_block_id return current_swap_mapping - def get_num_full_blocks_touched(self, blocks: List[Block], + def get_num_full_blocks_touched(self, blocks: list[Block], device: Device) -> int: """Returns the number of full blocks that will be touched by swapping in/out the given blocks on to the 'device'. Args: - blocks: List of blocks to be swapped. + blocks: list of blocks to be swapped. device (Device): Device to swap the 'blocks' on. Returns: @@ -300,40 +300,40 @@ def get_num_full_blocks_touched(self, blocks: List[Block], """ return self._allocators[device].get_num_full_blocks_touched(blocks) - def clear_copy_on_writes(self) -> List[Tuple[int, int]]: + def clear_copy_on_writes(self) -> list[tuple[int, int]]: """Clears the copy-on-write (CoW) state and returns the mapping of source to destination block IDs. Returns: - List[Tuple[int, int]]: A list mapping source block IDs to + list[tuple[int, int]]: A list mapping source block IDs to destination block IDs. """ # CoW only supported on GPU device = Device.GPU return self._allocators[device].clear_copy_on_writes() - def mark_blocks_as_accessed(self, block_ids: List[int], + def mark_blocks_as_accessed(self, block_ids: list[int], now: float) -> None: """Mark blocks as accessed, only use for prefix caching.""" # Prefix caching only supported on GPU. device = Device.GPU return self._allocators[device].mark_blocks_as_accessed(block_ids, now) - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: + def mark_blocks_as_computed(self, block_ids: list[int]) -> None: """Mark blocks as accessed, only use for prefix caching.""" # Prefix caching only supported on GPU. 
device = Device.GPU return self._allocators[device].mark_blocks_as_computed(block_ids) def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: + self, computed_seq_block_ids: list[list[int]]) -> list[int]: # Prefix caching only supported on GPU. device = Device.GPU return self._allocators[device].get_common_computed_block_ids( computed_seq_block_ids) @property - def all_block_ids(self) -> FrozenSet[int]: + def all_block_ids(self) -> frozenset[int]: return frozenset(self._block_ids_to_allocator.keys()) def get_prefix_cache_hit_rate(self, device: Device) -> float: @@ -348,13 +348,13 @@ def reset_prefix_cache(self) -> bool: success = success and allocator.reset_prefix_cache() return success - def get_and_reset_swaps(self) -> List[Tuple[int, int]]: + def get_and_reset_swaps(self) -> list[tuple[int, int]]: """Returns and clears the mapping of source to destination block IDs. Will be called after every swapping operations for now, and after every schedule when BlockManagerV2 become default. Currently not useful. Returns: - List[Tuple[int, int]]: A mapping of source to destination block IDs. + list[tuple[int, int]]: A mapping of source to destination block IDs. """ mapping = self._swap_mapping.copy() self._swap_mapping.clear() @@ -362,9 +362,9 @@ def get_and_reset_swaps(self) -> List[Tuple[int, int]]: def find_cached_blocks_prefix( self, - block_hashes: List[int], + block_hashes: list[int], device: Device = Device.GPU, - ) -> List[int]: + ) -> list[int]: return self._allocators[device].find_cached_blocks_prefix(block_hashes) @@ -381,7 +381,7 @@ def __init__(self, proxy: Block): super().__init__() self._proxy = proxy - def append_token_ids(self, token_ids: List[BlockId]): + def append_token_ids(self, token_ids: list[BlockId]): raise ValueError("null block should not be modified") @property @@ -393,7 +393,7 @@ def block_id(self, value: Optional[BlockId]): raise ValueError("null block should not be modified") @property - def token_ids(self) -> List[BlockId]: + def token_ids(self) -> list[BlockId]: return self._proxy.token_ids @property diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 0b0197deb8d47..1c47fded26afd 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple +from typing import Optional, Protocol from vllm.utils import Device @@ -11,7 +11,7 @@ class Block(ABC): @abstractmethod - def append_token_ids(self, token_ids: List[int]) -> None: + def append_token_ids(self, token_ids: list[int]) -> None: pass @property @@ -27,7 +27,7 @@ def block_id(self, value: Optional[int]) -> None: @property @abstractmethod - def token_ids(self) -> List[int]: + def token_ids(self) -> list[int]: pass @property @@ -84,7 +84,7 @@ class Factory(Protocol): def __call__( self, prev_block: Optional["Block"], - token_ids: List[int], + token_ids: list[int], block_size: int, allocator: "BlockAllocator", block_id: Optional[int] = None, @@ -114,14 +114,14 @@ def allocate_mutable_block(self, prev_block: Optional[Block], @abstractmethod def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: List[int], + token_ids: list[int], extra_hash: Optional[int]) -> Block: pass @abstractmethod def allocate_immutable_blocks(self, prev_block: Optional[Block], - block_token_ids: List[List[int]], - extra_hash: Optional[int]) -> List[Block]: + block_token_ids: 
list[list[int]], + extra_hash: Optional[int]) -> list[Block]: pass @abstractmethod @@ -129,7 +129,7 @@ def free(self, block: Block) -> None: pass @abstractmethod - def fork(self, last_block: Block) -> List[Block]: + def fork(self, last_block: Block) -> list[Block]: pass @abstractmethod @@ -145,34 +145,34 @@ def get_physical_block_id(self, absolute_id: int) -> int: pass @abstractmethod - def swap_out(self, blocks: List[Block]) -> None: + def swap_out(self, blocks: list[Block]) -> None: pass @abstractmethod - def swap_in(self, blocks: List[Block]) -> None: + def swap_in(self, blocks: list[Block]) -> None: pass @property @abstractmethod - def all_block_ids(self) -> FrozenSet[int]: + def all_block_ids(self) -> frozenset[int]: pass @abstractmethod - def clear_copy_on_writes(self) -> List[Tuple[int, int]]: + def clear_copy_on_writes(self) -> list[tuple[int, int]]: pass @abstractmethod - def mark_blocks_as_accessed(self, block_ids: List[int], + def mark_blocks_as_accessed(self, block_ids: list[int], now: float) -> None: pass @abstractmethod - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: + def mark_blocks_as_computed(self, block_ids: list[int]) -> None: pass @abstractmethod def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: + self, computed_seq_block_ids: list[list[int]]) -> list[int]: pass @abstractmethod @@ -186,7 +186,7 @@ def promote_to_immutable_block(self, block: Block) -> BlockId: pass @abstractmethod - def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: + def get_num_full_blocks_touched(self, blocks: list[Block]) -> int: pass @abstractmethod @@ -205,8 +205,8 @@ class NoFreeBlocksError(ValueError): @abstractmethod def find_cached_blocks_prefix( self, - block_hashes: List[int], - ) -> List[int]: + block_hashes: list[int], + ) -> list[int]: pass @@ -222,7 +222,7 @@ def allocate_mutable_block(self, @abstractmethod def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: List[int], + token_ids: list[int], device: Device, extra_hash: Optional[int] = None) -> Block: pass @@ -231,10 +231,10 @@ def allocate_immutable_block(self, def allocate_immutable_blocks( self, prev_block: Optional[Block], - block_token_ids: List[List[int]], + block_token_ids: list[list[int]], device: Device, extra_hash: Optional[int] = None, - ) -> List[Block]: + ) -> list[Block]: pass @abstractmethod @@ -250,40 +250,40 @@ def free(self, block: Block) -> None: pass @abstractmethod - def fork(self, last_block: Block) -> List[Block]: + def fork(self, last_block: Block) -> list[Block]: pass @property @abstractmethod - def all_block_ids(self) -> FrozenSet[int]: + def all_block_ids(self) -> frozenset[int]: pass @abstractmethod - def clear_copy_on_writes(self) -> List[Tuple[int, int]]: + def clear_copy_on_writes(self) -> list[tuple[int, int]]: pass @abstractmethod - def mark_blocks_as_accessed(self, block_ids: List[int], + def mark_blocks_as_accessed(self, block_ids: list[int], now: float) -> None: pass @abstractmethod - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: + def mark_blocks_as_computed(self, block_ids: list[int]) -> None: pass @abstractmethod def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: + self, computed_seq_block_ids: list[list[int]]) -> list[int]: pass @abstractmethod - def get_num_full_blocks_touched(self, blocks: List[Block], + def get_num_full_blocks_touched(self, blocks: list[Block], device: Device) -> int: pass @abstractmethod - def swap(self, 
blocks: List[Block], src_device: Device, - dst_device: Device) -> Dict[int, int]: + def swap(self, blocks: list[Block], src_device: Device, + dst_device: Device) -> dict[int, int]: pass @abstractmethod @@ -312,7 +312,7 @@ def reset_prefix_cache(self) -> bool: @abstractmethod def find_cached_blocks_prefix( self, - block_hashes: List[int], + block_hashes: list[int], device: Device = Device.GPU, - ) -> List[int]: + ) -> list[int]: pass diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index c388366b825f2..b7b645187a7bc 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 from collections import deque -from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter, get_all_blocks_recursively) @@ -38,7 +39,7 @@ def __init__( if block_ids is None: block_ids = range(num_blocks) - self._free_block_indices: Deque[BlockId] = deque(block_ids) + self._free_block_indices: deque[BlockId] = deque(block_ids) self._all_block_indices = frozenset(block_ids) assert len(self._all_block_indices) == num_blocks @@ -64,7 +65,7 @@ def __init__( def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: List[int], + token_ids: list[int], extra_hash: Optional[int] = None, device: Optional[Device] = None) -> Block: """Allocates a new immutable block with the given token IDs, linked to @@ -74,7 +75,7 @@ def allocate_immutable_block(self, prev_block (Optional[Block]): The previous block in the sequence. If None, then the block to be allocated is the first block in the sequence. - token_ids (List[int]): The token IDs to be stored in the new block. + token_ids (list[int]): The token IDs to be stored in the new block. Returns: Block: The newly allocated immutable block. @@ -87,9 +88,9 @@ def allocate_immutable_block(self, def allocate_immutable_blocks( self, prev_block: Optional[Block], - block_token_ids: List[List[int]], + block_token_ids: list[list[int]], extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> List[Block]: + device: Optional[Device] = None) -> list[Block]: assert device is None num_blocks = len(block_token_ids) @@ -161,7 +162,7 @@ def free(self, block: Block, keep_block_object: bool = False) -> None: def free_block_id(self, block_id: BlockId) -> None: self._free_block_id(block_id) - def fork(self, last_block: Block) -> List[Block]: + def fork(self, last_block: Block) -> list[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. @@ -169,12 +170,12 @@ def fork(self, last_block: Block) -> List[Block]: last_block (Block): The last block in the original sequence. Returns: - List[Block]: The new sequence of blocks that shares the same memory + list[Block]: The new sequence of blocks that shares the same memory as the original sequence. 
""" source_blocks = get_all_blocks_recursively(last_block) - forked_blocks: List[Block] = [] + forked_blocks: list[Block] = [] prev_block = None for block in source_blocks: @@ -218,7 +219,7 @@ def refcounter(self): return self._refcounter @property - def all_block_ids(self) -> FrozenSet[int]: + def all_block_ids(self) -> frozenset[int]: return self._all_block_indices def cow_block_if_not_appendable(self, block: Block) -> BlockId: @@ -246,16 +247,16 @@ def cow_block_if_not_appendable(self, block: Block) -> BlockId: return trg_block_id - def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]: + def clear_copy_on_writes(self) -> list[tuple[BlockId, BlockId]]: """Returns the copy-on-write source->destination mapping and clears it. Returns: - List[Tuple[BlockId, BlockId]]: A list mapping source + list[tuple[BlockId, BlockId]]: A list mapping source block indices to destination block indices. """ return self._cow_tracker.clear_cows() - def mark_blocks_as_accessed(self, block_ids: List[int], + def mark_blocks_as_accessed(self, block_ids: list[int], now: float) -> None: """Mark blocks as accessed, used in prefix caching. @@ -264,7 +265,7 @@ def mark_blocks_as_accessed(self, block_ids: List[int], """ pass - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: + def mark_blocks_as_computed(self, block_ids: list[int]) -> None: """Mark blocks as computed, used in prefix caching. Since the naive allocator does not implement prefix caching, we do @@ -273,7 +274,7 @@ def mark_blocks_as_computed(self, block_ids: List[int]) -> None: pass def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: + self, computed_seq_block_ids: list[list[int]]) -> list[int]: """Determine blocks that can be skipped in prefill. Since the naive allocator does not support prefix caching, always return @@ -284,12 +285,12 @@ def get_common_computed_block_ids( def promote_to_immutable_block(self, block: Block) -> BlockId: raise NotImplementedError("There is no promotion for naive blocks") - def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: + def get_num_full_blocks_touched(self, blocks: list[Block]) -> int: """Returns the number of full blocks that will be touched by swapping in/out. Args: - blocks: List of blocks to be swapped. + blocks: list of blocks to be swapped. Returns: int: the number of full blocks that will be touched by swapping in/out the given blocks. Non full blocks are ignored @@ -305,11 +306,11 @@ def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: old_block_set.add(block) return len(old_block_set) - def swap_out(self, blocks: List[Block]) -> None: + def swap_out(self, blocks: list[Block]) -> None: for block in blocks: self._free_block_id(block) - def swap_in(self, blocks: List[Block]) -> None: + def swap_in(self, blocks: list[Block]) -> None: for block in blocks: # Here we allocate either immutable or mutable block and then # extract its block_id. Note that the block object is released @@ -336,7 +337,7 @@ def reset_prefix_cache(self) -> bool: """No prefix cache for naive block allocator.""" return True - def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: + def find_cached_blocks_prefix(self, block_hashes: list[int]) -> list[int]: # Not applicable for naive block allocator. return [] @@ -351,7 +352,7 @@ class NaiveBlock(Block): Args: prev_block (Block): The previous block in the sequence. - token_ids (List[int]): The initial token IDs to be stored in the block. 
+ token_ids (list[int]): The initial token IDs to be stored in the block. block_size (int): The maximum number of token IDs that can be stored in the block. allocator (BlockAllocator): The block allocator associated with this @@ -365,13 +366,13 @@ class NaiveBlock(Block): def __init__(self, prev_block: Optional[Block], - token_ids: List[int], + token_ids: list[int], block_size: int, allocator: BlockAllocator, block_id: Optional[int] = None, _cow_target: Optional[Block] = None, extra_hash: Optional[int] = None): - self._token_ids: List[int] = [] + self._token_ids: list[int] = [] self._block_size = block_size self._prev_block = prev_block self._block_id = block_id @@ -380,12 +381,12 @@ def __init__(self, self._append_token_ids_no_cow(token_ids) - def append_token_ids(self, token_ids: List[int]) -> None: + def append_token_ids(self, token_ids: list[int]) -> None: """Appends the given token IDs to the block and performs a copy-on-write if necessary. Args: - token_ids (Optional[List[int]]): The token IDs to be appended + token_ids (Optional[list[int]]): The token IDs to be appended to the block. """ self._append_token_ids_no_cow(token_ids) @@ -394,11 +395,11 @@ def append_token_ids(self, token_ids: List[int]) -> None: self._block_id = (self._allocator.cow_block_if_not_appendable( self._cow_target)) - def _append_token_ids_no_cow(self, token_ids: List[int]) -> None: + def _append_token_ids_no_cow(self, token_ids: list[int]) -> None: """Appends the given token IDs to the block Args: - token_ids (List[int]): The token IDs to be appended to the block. + token_ids (list[int]): The token IDs to be appended to the block. """ if len(token_ids) == 0: return @@ -440,7 +441,7 @@ def num_empty_slots(self) -> int: return self._block_size - len(self.token_ids) @property - def token_ids(self) -> List[int]: + def token_ids(self) -> list[int]: return self._token_ids @property diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 1ca9e49dac371..f5907a69bf452 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -2,9 +2,9 @@ """Token blocks.""" import sys from bisect import bisect_left +from collections.abc import Iterable from os.path import commonprefix -from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set, - Tuple) +from typing import Callable, Optional from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker, get_all_blocks_recursively) @@ -88,15 +88,15 @@ def __init__( # A mapping of prefix hash to block index. All blocks which have a # prefix hash will be in this dict, even if they have refcount 0. - self._cached_blocks: Dict[PrefixHash, BlockId] = {} + self._cached_blocks: dict[PrefixHash, BlockId] = {} # A list of immutable block IDs that have been touched by scheduler # and should be marked as computed after an entire batch of sequences # are scheduled. 
- self._touched_blocks: Set[BlockId] = set() + self._touched_blocks: set[BlockId] = set() # Used to track status of each physical block id - self._block_tracker: Dict[BlockId, BlockTracker] = {} + self._block_tracker: dict[BlockId, BlockTracker] = {} for block_id in block_ids: self._block_tracker[block_id] = BlockTracker() @@ -134,7 +134,7 @@ def __init__( def _create_block( self, prev_block: Optional[Block], - token_ids: List[int], + token_ids: list[int], block_size: int, allocator: BlockAllocator, block_id: Optional[int] = None, @@ -156,7 +156,7 @@ def _create_block( def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: List[int], + token_ids: list[int], extra_hash: Optional[int] = None, device: Optional[Device] = None) -> Block: """Allocates an immutable block with the given token IDs, reusing cached @@ -164,7 +164,7 @@ def allocate_immutable_block(self, Args: prev_block (Optional[Block]): The previous block in the sequence. - token_ids (List[int]): The token IDs to be stored in the block. + token_ids (list[int]): The token IDs to be stored in the block. Returns: Block: The allocated immutable block. @@ -197,9 +197,9 @@ def allocate_immutable_block(self, def allocate_immutable_blocks( self, prev_block: Optional[Block], - block_token_ids: List[List[int]], + block_token_ids: list[list[int]], extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> List[Block]: + device: Optional[Device] = None) -> list[Block]: blocks = [] for token_ids in block_token_ids: prev_block = self.allocate_immutable_block(prev_block=prev_block, @@ -376,7 +376,7 @@ def free(self, block: Block, keep_block_object: bool = False) -> None: if not keep_block_object: self._block_pool.free_block(block) - def fork(self, last_block: Block) -> List[Block]: + def fork(self, last_block: Block) -> list[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. @@ -384,12 +384,12 @@ def fork(self, last_block: Block) -> List[Block]: last_block (Block): The last block in the original sequence. Returns: - List[Block]: The new sequence of blocks that shares the same memory + list[Block]: The new sequence of blocks that shares the same memory as the original sequence. """ source_blocks = get_all_blocks_recursively(last_block) - forked_blocks: List[Block] = [] + forked_blocks: list[Block] = [] prev_block = None for block in source_blocks: block_id = block.block_id @@ -435,7 +435,7 @@ def get_physical_block_id(self, absolute_id: int) -> int: return sorted(self.all_block_ids).index(absolute_id) @property - def all_block_ids(self) -> FrozenSet[int]: + def all_block_ids(self) -> frozenset[int]: return self._hashless_allocator.all_block_ids def get_prefix_cache_hit_rate(self) -> float: @@ -551,16 +551,16 @@ def cow_block_if_not_appendable(self, block: Block) -> BlockId: return trg_block_id - def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]: + def clear_copy_on_writes(self) -> list[tuple[BlockId, BlockId]]: """Returns the copy-on-write source->destination mapping and clears it. Returns: - List[Tuple[BlockId, BlockId]]: A list mapping source + list[tuple[BlockId, BlockId]]: A list mapping source block indices to destination block indices. """ return self._cow_tracker.clear_cows() - def mark_blocks_as_accessed(self, block_ids: List[int], + def mark_blocks_as_accessed(self, block_ids: list[int], now: float) -> None: """Mark blocks as accessed, used in prefix caching. 
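The prefix-caching allocator above keeps a dict from content hash to block id so an immutable block with identical contents can be reused instead of re-allocated. A hedged toy sketch of that cache-hit path; the hash values and id scheme here are made up for illustration.

from itertools import count


class PrefixCacheSketch:
    """Toy content-hash -> block-id cache with a trivial id generator."""

    def __init__(self) -> None:
        self._cached_blocks: dict[int, int] = {}
        self._next_id = count()

    def allocate_immutable(self, content_hash: int) -> tuple[int, bool]:
        """Return (block_id, was_cache_hit) for the given content hash."""
        cached = self._cached_blocks.get(content_hash)
        if cached is not None:
            return cached, True
        block_id = next(self._next_id)
        self._cached_blocks[content_hash] = block_id
        return block_id, False


cache = PrefixCacheSketch()
first_id, hit = cache.allocate_immutable(content_hash=0xBEEF)
again_id, hit_again = cache.allocate_immutable(content_hash=0xBEEF)
assert not hit and hit_again and first_id == again_id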
@@ -577,7 +577,7 @@ def mark_blocks_as_accessed(self, block_ids: List[int], raise ValueError( "Mark block as accessed which is not belonged to GPU") - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: + def mark_blocks_as_computed(self, block_ids: list[int]) -> None: # Mark all touched blocks as computed. for block_id in self._touched_blocks: self._block_tracker[block_id].computed = True @@ -600,7 +600,7 @@ def block_is_computed(self, block_id: int) -> bool: return block_id in self.evictor def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: + self, computed_seq_block_ids: list[list[int]]) -> list[int]: """Return the block ids that are common for a given sequence group. Only those blocks that are immutable and already be marked @@ -620,12 +620,12 @@ def get_common_computed_block_ids( if ids ]) - def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: + def get_num_full_blocks_touched(self, blocks: list[Block]) -> int: """Returns the number of full blocks that will be touched by swapping in/out. Args: - blocks: List of blocks to be swapped. + blocks: list of blocks to be swapped. Returns: int: the number of full blocks that will be touched by swapping in/out the given blocks. Non full blocks are ignored @@ -643,23 +643,23 @@ def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: num_touched_blocks += 1 return num_touched_blocks - def swap_out(self, blocks: List[Block]) -> None: + def swap_out(self, blocks: list[Block]) -> None: """Execute the swap out actions. Basically just free the given blocks. Args: - blocks: List of blocks to be swapped out. + blocks: list of blocks to be swapped out. """ for block in blocks: self._free_block_id(block) - def swap_in(self, blocks: List[Block]) -> None: + def swap_in(self, blocks: list[Block]) -> None: """Execute the swap in actions. Change the block id from old allocator to current allocator for each block to finish the block table update. Args: - blocks: List of blocks to be swapped in. + blocks: list of blocks to be swapped in. """ for block in blocks: # Here we allocate either immutable or mutable block and then @@ -681,7 +681,7 @@ def swap_in(self, blocks: List[Block]) -> None: block.block_id = block_id # Assign block_id - def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: + def find_cached_blocks_prefix(self, block_hashes: list[int]) -> list[int]: """ Given a list of block hashes, return the prefix of the block hashes that are all cached. @@ -692,10 +692,10 @@ def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: property, we can use binary search to find the prefix of cached blocks. Args: - block_hashes (List[int]): The list of block hashes. + block_hashes (list[int]): The list of block hashes. Returns: - List[int]: The prefix of the `block_hashes` that are cached. + list[int]: The prefix of the `block_hashes` that are cached. """ def _block_is_cached(block_hash: PrefixHash) -> bool: @@ -734,7 +734,7 @@ class PrefixCachingBlock(Block): Args: prev_block (Optional[PrefixCachingBlock]): The previous block in the sequence. - token_ids (List[int]): The initial token IDs to be stored in the block. + token_ids (list[int]): The initial token IDs to be stored in the block. block_size (int): The maximum number of token IDs that can be stored in the block. 
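`find_cached_blocks_prefix` above relies on a monotone property: a block can only be cached if every block before it is cached, so the cached/uncached boundary can be located with binary search rather than a linear scan. A stand-alone sketch of that idea over a plain set of cached hashes; the set is assumed data, not the real allocator state.

def cached_prefix(block_hashes: list[int], cached: set[int]) -> list[int]:
    """Longest prefix of block_hashes whose hashes are all in `cached`.

    Assumes the prefix property: if block_hashes[i] is cached, then every
    earlier hash is cached too, so "is cached" is monotone over the index.
    """
    lo, hi = 0, len(block_hashes)  # answer length lies in [lo, hi]
    while lo < hi:
        mid = (lo + hi) // 2       # candidate index to test
        if block_hashes[mid] in cached:
            lo = mid + 1           # prefix extends at least past mid
        else:
            hi = mid               # boundary is at or before mid
    return block_hashes[:lo]


hashes = [11, 22, 33, 44]
assert cached_prefix(hashes, cached={11, 22}) == [11, 22]
assert cached_prefix(hashes, cached=set()) == []
assert cached_prefix(hashes, cached={11, 22, 33, 44}) == hashes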
allocator (BlockAllocator): The prefix @@ -756,7 +756,7 @@ class PrefixCachingBlock(Block): def __init__( self, prev_block: Optional[Block], - token_ids: List[int], + token_ids: list[int], block_size: int, allocator: BlockAllocator, block_id: Optional[int] = None, @@ -826,12 +826,12 @@ def last_accessed(self) -> float: def last_accessed(self, last_accessed_ts: float): self._last_accessed = last_accessed_ts - def append_token_ids(self, token_ids: List[int]) -> None: + def append_token_ids(self, token_ids: list[int]) -> None: """Appends the given token IDs to the block and registers the block as immutable if the block becomes full. Args: - token_ids (List[int]): The token IDs to be appended to the block. + token_ids (list[int]): The token IDs to be appended to the block. """ # Ensure this is mutable block (not promoted) assert self.content_hash is None @@ -878,7 +878,7 @@ def block_size(self) -> int: return self._block.block_size @property - def token_ids(self) -> List[int]: + def token_ids(self) -> list[int]: return self._block.token_ids @property @@ -927,7 +927,7 @@ def content_hash(self) -> Optional[int]: def hash_block_tokens(cls, is_first_block: bool, prev_block_hash: Optional[int], - cur_block_token_ids: List[int], + cur_block_token_ids: list[int], extra_hash: Optional[int] = None) -> int: """Computes a hash value corresponding to the contents of a block and the contents of the preceding block(s). The hash value is used for @@ -938,7 +938,7 @@ def hash_block_tokens(cls, the sequence. - prev_block_hash (Optional[int]): The hash of the previous block. None if this is the first block. - - cur_block_token_ids (List[int]): A list of token ids in the current + - cur_block_token_ids (list[int]): A list of token ids in the current block. The current block is assumed to be full. - extra_hash (Optional[int]): The hash value of additional factors such as adapters that influence the block, apart from the token_ids. @@ -990,14 +990,14 @@ def __init__( # for the sequence when we need to check if the sequence is cached. # Note a block that's not full will not have its hash calculated and # recorded. - self._seq_id_to_blocks_hashes: Dict[int, List[int]] = {} + self._seq_id_to_blocks_hashes: dict[int, list[int]] = {} # A map from seq_id to the number of tokens that are cached for the # sequence. # We need this so that a sequence in continuous prefill doesn't # accidentally see its cached token count change. See comments in # `get_num_cached_tokens` for more details. 
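The `hash_block_tokens` docstring above describes chaining: a full block's hash covers its own token ids plus the previous block's hash and an optional extra factor, so identical prefixes yield identical hashes. A minimal sketch using Python's builtin `hash` purely for illustration; vLLM's actual hashing scheme is not reproduced here.

from typing import Optional


def toy_block_hash(prev_block_hash: Optional[int],
                   cur_block_token_ids: list[int],
                   extra_hash: Optional[int] = None) -> int:
    """Chain the previous block's hash with this full block's token ids."""
    is_first_block = prev_block_hash is None
    return hash((is_first_block, prev_block_hash,
                 tuple(cur_block_token_ids), extra_hash))


# Two sequences sharing the same first full block get the same chained hash.
h0 = toy_block_hash(None, [1, 2, 3, 4])
assert toy_block_hash(h0, [5, 6, 7, 8]) == toy_block_hash(h0, [5, 6, 7, 8])
assert toy_block_hash(h0, [5, 6, 7, 8]) != toy_block_hash(h0, [5, 6, 7, 9])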
- self._seq_id_to_num_tokens_computed: Dict[int, int] = {} + self._seq_id_to_num_tokens_computed: dict[int, int] = {} def _update_seq_hashes(self, seq: Sequence) -> None: """Incrementally update the sequence's block hashes and record them.""" @@ -1096,7 +1096,7 @@ class LastAccessBlocksTracker: def __init__(self, allocator): self._allocator = allocator - self._seq_last_access: Dict[int, Optional[float]] = {} + self._seq_last_access: dict[int, Optional[float]] = {} def add_seq(self, seq_id: int) -> None: """Start tracking seq_id @@ -1115,7 +1115,7 @@ def update_last_access(self, seq_id: int, time: float) -> None: self._seq_last_access[seq_id] = time def update_seq_blocks_last_access(self, seq_id: int, - block_ids: List[int]) -> None: + block_ids: list[int]) -> None: assert seq_id in self._seq_last_access ts = self._seq_last_access[seq_id] diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index c5b3b04f37ca3..b229bbb6d4391 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """A block manager that manages token blocks.""" -from typing import Dict, List, Optional -from typing import Sequence as GenericSequence -from typing import Tuple +from collections.abc import Sequence as GenericSequence +from typing import Optional from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator @@ -98,8 +97,8 @@ def __init__( block_size=block_size, ) - self.block_tables: Dict[SeqId, BlockTable] = {} - self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {} + self.block_tables: dict[SeqId, BlockTable] = {} + self.cross_block_tables: dict[EncoderSeqId, BlockTable] = {} self._computed_blocks_tracker = ComputedBlocksTracker( self.block_allocator, self.block_size, self.enable_caching) @@ -236,7 +235,7 @@ def append_slots( self, seq: Sequence, num_lookahead_slots: int, - ) -> List[Tuple[int, int]]: + ) -> list[tuple[int, int]]: block_table = self.block_tables[seq.seq_id] @@ -277,11 +276,11 @@ def free_cross(self, seq_group: SequenceGroup) -> None: self.cross_block_tables[request_id].free() del self.cross_block_tables[request_id] - def get_block_table(self, seq: Sequence) -> List[int]: + def get_block_table(self, seq: Sequence) -> list[int]: block_ids = self.block_tables[seq.seq_id].physical_block_ids return block_ids # type: ignore - def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: + def get_cross_block_table(self, seq_group: SequenceGroup) -> list[int]: request_id = seq_group.request_id assert request_id in self.cross_block_tables block_ids = self.cross_block_tables[request_id].physical_block_ids @@ -307,7 +306,7 @@ def mark_blocks_as_computed(self, seq_group: SequenceGroup, self.block_allocator.mark_blocks_as_computed([]) def get_common_computed_block_ids( - self, seqs: List[Sequence]) -> GenericSequence[int]: + self, seqs: list[Sequence]) -> GenericSequence[int]: """Determine which blocks for which we skip prefill. With prefix caching we can skip prefill for previously-generated blocks. @@ -357,7 +356,7 @@ def can_swap_in(self, seq_group: SequenceGroup, return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, num_lookahead_slots) - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + def swap_in(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: """Returns the block id mapping (from CPU to GPU) generated by swapping in the given seq_group with num_lookahead_slots. 
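The block manager hunks above key per-sequence block tables by `seq_id`, expose them as plain lists of physical block ids, and report swaps as (source, destination) id pairs. A toy sketch of that bookkeeping shape, with made-up ids and without the real `BlockTable` class.

class BlockTableBookkeepingSketch:
    def __init__(self) -> None:
        # seq_id -> physical block ids currently backing that sequence.
        self.block_tables: dict[int, list[int]] = {}

    def allocate(self, seq_id: int, physical_block_ids: list[int]) -> None:
        self.block_tables[seq_id] = list(physical_block_ids)

    def get_block_table(self, seq_id: int) -> list[int]:
        return self.block_tables[seq_id]

    def swap_in(self, seq_id: int,
                cpu_to_gpu: dict[int, int]) -> list[tuple[int, int]]:
        """Rewrite the table via the given CPU->GPU mapping, report the moves."""
        old = self.block_tables[seq_id]
        self.block_tables[seq_id] = [cpu_to_gpu[b] for b in old]
        return [(b, cpu_to_gpu[b]) for b in old]


mgr = BlockTableBookkeepingSketch()
mgr.allocate(seq_id=0, physical_block_ids=[100, 101])
assert mgr.swap_in(0, {100: 7, 101: 8}) == [(100, 7), (101, 8)]
assert mgr.get_block_table(0) == [7, 8]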
@@ -365,7 +364,7 @@ def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: seq_group (SequenceGroup): The sequence group to swap in. Returns: - List[Tuple[int, int]]: The mapping of swapping block from CPU + list[tuple[int, int]]: The mapping of swapping block from CPU to GPU. """ physical_block_id_mapping = [] @@ -410,7 +409,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: SequenceStatus.RUNNING) return alloc_status == AllocStatus.OK - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + def swap_out(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: """Returns the block id mapping (from GPU to CPU) generated by swapping out the given sequence_group with num_lookahead_slots. @@ -418,7 +417,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: sequence_group (SequenceGroup): The sequence group to swap out. Returns: - List[Tuple[int, int]]: The mapping of swapping block from + list[tuple[int, int]]: The mapping of swapping block from GPU to CPU. """ physical_block_id_mapping = [] @@ -483,7 +482,7 @@ def _can_swap(self, # swap. Then verify if there are available blocks in the device # to perform the swap. num_blocks_touched = 0 - blocks: List[Block] = [] + blocks: list[Block] = [] for seq in seq_group.get_seqs(status=status): block_table = self.block_tables[seq.seq_id] if block_table.blocks is not None: diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 0e363eddc8a5e..68a2f704def8a 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -3,7 +3,6 @@ import enum import heapq from abc import ABC, abstractmethod -from typing import Dict, List, Tuple class EvictionPolicy(enum.Enum): @@ -27,7 +26,7 @@ def __contains__(self, block_id: int) -> bool: pass @abstractmethod - def evict(self) -> Tuple[int, int]: + def evict(self) -> tuple[int, int]: """Runs the eviction algorithm and returns the evicted block's content hash along with physical block id along with physical block id """ @@ -84,13 +83,13 @@ class LRUEvictor(Evictor): CLEANUP_THRESHOLD = 50 def __init__(self): - self.free_table: Dict[int, BlockMetaData] = {} + self.free_table: dict[int, BlockMetaData] = {} self.priority_queue = [] def __contains__(self, block_id: int) -> bool: return block_id in self.free_table - def evict(self) -> Tuple[int, int]: + def evict(self) -> tuple[int, int]: if len(self.free_table) == 0: raise ValueError("No usable cache memory left") @@ -128,7 +127,7 @@ def _cleanup_if_necessary(self): self._cleanup() def _cleanup(self): - new_priority_queue: List[Tuple[float, int, int, int]] = [] + new_priority_queue: list[tuple[float, int, int, int]] = [] for block_id, block in self.free_table.items(): new_priority_queue.append( diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index b48ba87e95a0b..819f372490ec9 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -2,9 +2,7 @@ import enum from abc import ABC, abstractmethod -from typing import List -from typing import Sequence as GenericSequence -from typing import Tuple +from collections.abc import Sequence as GenericSequence from vllm.sequence import Sequence, SequenceGroup from vllm.utils import Device @@ -61,7 +59,7 @@ def append_slots( self, seq: Sequence, num_lookahead_slots: int, - ) -> List[Tuple[int, int]]: + ) -> list[tuple[int, int]]: pass @abstractmethod @@ -74,7 +72,7 @@ def can_swap_in(self, seq_group: SequenceGroup, pass @abstractmethod - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + def swap_in(self, 
seq_group: SequenceGroup) -> list[tuple[int, int]]: pass @abstractmethod @@ -82,7 +80,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: pass @abstractmethod - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + def swap_out(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: pass @abstractmethod @@ -90,7 +88,7 @@ def free(self, seq: Sequence) -> None: pass @abstractmethod - def get_block_table(self, seq: Sequence) -> List[int]: + def get_block_table(self, seq: Sequence) -> list[int]: pass @abstractmethod @@ -111,7 +109,7 @@ def access_all_blocks_in_seq( @abstractmethod def get_common_computed_block_ids( - self, seqs: List[Sequence]) -> GenericSequence[int]: + self, seqs: list[Sequence]) -> GenericSequence[int]: pass @abstractmethod diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index 70c22afa8e158..a4f721544606a 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Tuple - from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup from vllm.utils import Device @@ -42,7 +40,7 @@ def append_slots( self, seq: Sequence, num_lookahead_slots: int, - ) -> List[Tuple[int, int]]: + ) -> list[tuple[int, int]]: return [] def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: @@ -52,20 +50,20 @@ def can_swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> AllocStatus: return AllocStatus.OK - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + def swap_in(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: return None # type: ignore def can_swap_out(self, seq_group: SequenceGroup) -> bool: return True - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + def swap_out(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: return None # type: ignore def free(self, seq: Sequence) -> None: # No operation on free return - def get_block_table(self, seq: Sequence) -> List[int]: + def get_block_table(self, seq: Sequence) -> list[int]: return None # type: ignore def get_num_free_gpu_blocks(self) -> int: @@ -82,7 +80,7 @@ def access_all_blocks_in_seq( pass def get_common_computed_block_ids(self, - seq_group: List[Sequence]) -> List[int]: + seq_group: list[Sequence]) -> list[int]: return [] def mark_blocks_as_computed(self, seq_group: SequenceGroup, diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 3cdad496e8435..2a43878ea395b 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -5,10 +5,10 @@ import random import time from collections import deque +from collections.abc import Iterable +from collections.abc import Sequence as GenericSequence from dataclasses import dataclass, field -from typing import Callable, Deque, Dict, Iterable, List, Optional -from typing import Sequence as GenericSequence -from typing import Set, Tuple, Union +from typing import Callable, Optional, Union from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus, BlockSpaceManager @@ -58,8 +58,8 @@ class SchedulingBudget: token_budget: int max_num_seqs: int - _request_ids_num_batched_tokens: Set[str] = field(default_factory=set) - _request_ids_num_curr_seqs: Set[str] = field(default_factory=set) + _request_ids_num_batched_tokens: set[str] = field(default_factory=set) + _request_ids_num_curr_seqs: 
set[str] = field(default_factory=set) # Number of cached tokens in the batch. _num_cached_tokens: int = 0 # Number of actual non-cached tokens in the batch. @@ -141,14 +141,14 @@ class SchedulerOutputs: num_prefill_groups: int # Total number of batched tokens. num_batched_tokens: int - # Blocks to swap in. List of CPU -> GPU block number. - blocks_to_swap_in: List[Tuple[int, int]] - # Blocks to swap out. List of GPU -> CPU block number. - blocks_to_swap_out: List[Tuple[int, int]] + # Blocks to swap in. list of CPU -> GPU block number. + blocks_to_swap_in: list[tuple[int, int]] + # Blocks to swap out. list of GPU -> CPU block number. + blocks_to_swap_out: list[tuple[int, int]] # Blocks to copy. Source to dest block. - blocks_to_copy: List[Tuple[int, int]] + blocks_to_copy: list[tuple[int, int]] # Sequence groups that are going to be ignored. - ignored_seq_groups: List[SequenceGroup] + ignored_seq_groups: list[SequenceGroup] # The number of slots for lookahead decoding. num_lookahead_slots: int # The number of requests in the running queue @@ -185,7 +185,7 @@ def key_fn(group: ScheduledSequenceGroup): key=key_fn) @property - def lora_requests(self) -> Set[LoRARequest]: + def lora_requests(self) -> set[LoRARequest]: return { g.seq_group.lora_request for g in self.scheduled_seq_groups @@ -193,7 +193,7 @@ def lora_requests(self) -> Set[LoRARequest]: } @property - def prompt_adapter_requests(self) -> Set[PromptAdapterRequest]: + def prompt_adapter_requests(self) -> set[PromptAdapterRequest]: return { g.seq_group.prompt_adapter_request for g in self.scheduled_seq_groups @@ -210,24 +210,24 @@ class SchedulerRunningOutputs: """ # Selected sequences that are running and in a decoding phase. - decode_seq_groups: List[ScheduledSequenceGroup] + decode_seq_groups: list[ScheduledSequenceGroup] # Selected sequences that are running and in a prefill phase. # I.e., it means the prefill has been chunked. - prefill_seq_groups: List[ScheduledSequenceGroup] + prefill_seq_groups: list[ScheduledSequenceGroup] # The preempted sequences. - preempted: List[SequenceGroup] + preempted: list[SequenceGroup] # Sequences that are swapped out. - swapped_out: List[SequenceGroup] + swapped_out: list[SequenceGroup] # The blocks to swap out. - blocks_to_swap_out: List[Tuple[int, int]] + blocks_to_swap_out: list[tuple[int, int]] # The blocks to copy. - blocks_to_copy: List[Tuple[int, int]] + blocks_to_copy: list[tuple[int, int]] # The number of slots for lookahead decoding. num_lookahead_slots: int # Optimization for fast-access to seq_group lists - decode_seq_groups_list: List[SequenceGroup] - prefill_seq_groups_list: List[SequenceGroup] + decode_seq_groups_list: list[SequenceGroup] + prefill_seq_groups_list: list[SequenceGroup] @classmethod def create_empty(cls) -> "SchedulerRunningOutputs": @@ -253,18 +253,18 @@ class SchedulerSwappedInOutputs: # Selected sequences that are going to be swapped in and is in a # decoding phase. - decode_seq_groups: List[ScheduledSequenceGroup] + decode_seq_groups: list[ScheduledSequenceGroup] # Selected sequences that are going to be swapped in and in a prefill # phase. I.e., it means the prefill has been chunked. - prefill_seq_groups: List[ScheduledSequenceGroup] + prefill_seq_groups: list[ScheduledSequenceGroup] # The blocks to swap in. - blocks_to_swap_in: List[Tuple[int, int]] + blocks_to_swap_in: list[tuple[int, int]] # The blocks to copy. - blocks_to_copy: List[Tuple[int, int]] + blocks_to_copy: list[tuple[int, int]] # The number of slots for lookahead decoding. 
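The `SchedulingBudget` fields above use `set[str]` guards so a request's tokens and sequences count against the budget only once, however many times the scheduler revisits it. A hedged, simplified sketch of that pattern with a toy dataclass.

from dataclasses import dataclass, field


@dataclass
class BudgetSketch:
    token_budget: int
    num_batched_tokens: int = 0
    _request_ids_counted: set[str] = field(default_factory=set)

    def can_schedule(self, num_new_tokens: int) -> bool:
        return self.num_batched_tokens + num_new_tokens <= self.token_budget

    def add_num_batched_tokens(self, req_id: str, num_new_tokens: int) -> None:
        if req_id in self._request_ids_counted:
            return  # already accounted for; avoid double counting
        self._request_ids_counted.add(req_id)
        self.num_batched_tokens += num_new_tokens


budget = BudgetSketch(token_budget=8)
budget.add_num_batched_tokens("req-1", 5)
budget.add_num_batched_tokens("req-1", 5)  # ignored, counted once
assert budget.num_batched_tokens == 5
assert budget.can_schedule(3) and not budget.can_schedule(4)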
num_lookahead_slots: int # Infeasible sequence groups. - infeasible_seq_groups: List[SequenceGroup] + infeasible_seq_groups: list[SequenceGroup] @classmethod def create_empty(cls) -> "SchedulerSwappedInOutputs": @@ -287,9 +287,9 @@ class SchedulerPrefillOutputs: """ # Selected sequences for prefill. - seq_groups: List[ScheduledSequenceGroup] + seq_groups: list[ScheduledSequenceGroup] # Ignored sequence groups. - ignored_seq_groups: List[SequenceGroup] + ignored_seq_groups: list[SequenceGroup] num_lookahead_slots: int @classmethod @@ -372,8 +372,8 @@ def maybe_increment_partial_prefills(self, @classmethod def from_queues( cls, - running: Deque[SequenceGroup], - waiting: Deque[SequenceGroup], + running: deque[SequenceGroup], + waiting: deque[SequenceGroup], scheduler_config: SchedulerConfig, ) -> "PartialPrefillMetadata": """Create a PartialPrefillMetadata object from the current state of @@ -465,18 +465,18 @@ def __init__( # Sequence groups in the WAITING state. # Contain new prefill or preempted requests. - self.waiting: Deque[SequenceGroup] = deque() + self.waiting: deque[SequenceGroup] = deque() # Sequence groups in the RUNNING state. # Contain decode requests. - self.running: Deque[SequenceGroup] = deque() + self.running: deque[SequenceGroup] = deque() # Sequence groups in the SWAPPED state. # Contain decode requests that are swapped out. - self.swapped: Deque[SequenceGroup] = deque() + self.swapped: deque[SequenceGroup] = deque() # Sequence groups finished requests ids since last step iteration. # It lets the model know that any state associated with these requests # can and must be released after the current step. # This is used to evict the finished requests from the Mamba cache. - self._finished_requests_ids: List[str] = list() + self._finished_requests_ids: list[str] = list() # Time at previous scheduling step self.prev_time = 0.0 # Did we schedule a prompt at previous step? @@ -495,9 +495,9 @@ def __init__( self.num_cumulative_preemption: int = 0 # Used to cache python objects - self._seq_group_metadata_cache: List[PyObjectCache] = [] - self._scheduler_running_outputs_cache: List[PyObjectCache] = [] - self._scheduled_seq_group_cache: List[PyObjectCache] = [] + self._seq_group_metadata_cache: list[PyObjectCache] = [] + self._scheduler_running_outputs_cache: list[PyObjectCache] = [] + self._scheduled_seq_group_cache: list[PyObjectCache] = [] # For async output processing, we need to swap cache buffers between # iterations. I.e. since the output processing is lagged one step, @@ -520,7 +520,7 @@ def __init__( # when the request reaches max_model_len. In this case, the request # will be stopped during schedule() call and added to this stop list # for processing and deallocation by the free_finished_seq_groups() - self._async_stopped: List[SequenceGroup] = [] + self._async_stopped: list[SequenceGroup] = [] # List with the chunk sizes to hand out to each sequence depending # on how many partial prefills are running. 
This is slightly faster than @@ -578,7 +578,7 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: request_id = (request_id, ) request_ids = set(request_id) for state_queue in [self.waiting, self.running, self.swapped]: - aborted_groups: List[SequenceGroup] = [] + aborted_groups: list[SequenceGroup] = [] for seq_group in state_queue: if not request_ids: # Using 'break' here may add two extra iterations, @@ -625,7 +625,7 @@ def reset_prefix_cache(self) -> bool: def get_num_unfinished_seq_groups(self) -> int: return len(self.waiting) + len(self.running) + len(self.swapped) - def get_and_reset_finished_requests_ids(self) -> List[str]: + def get_and_reset_finished_requests_ids(self) -> list[str]: """Flushes the list of request ids of previously finished seq_groups.""" finished_requests_ids = self._finished_requests_ids self._finished_requests_ids = list() @@ -634,7 +634,7 @@ def get_and_reset_finished_requests_ids(self) -> List[str]: def _schedule_running( self, budget: SchedulingBudget, - curr_loras: Optional[Set[int]], + curr_loras: Optional[set[int]], enable_chunking: bool = False, partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, ) -> SchedulerRunningOutputs: @@ -673,14 +673,14 @@ def _schedule_running( ret.prefill_seq_groups_list.clear() # Blocks that need to be swapped or copied before model execution. - blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out - blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy + blocks_to_swap_out: list[tuple[int, int]] = ret.blocks_to_swap_out + blocks_to_copy: list[tuple[int, int]] = ret.blocks_to_copy - decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups - prefill_seq_groups: List[ + decode_seq_groups: list[ScheduledSequenceGroup] = ret.decode_seq_groups + prefill_seq_groups: list[ ScheduledSequenceGroup] = ret.prefill_seq_groups - preempted: List[SequenceGroup] = ret.preempted - swapped_out: List[SequenceGroup] = ret.swapped_out + preempted: list[SequenceGroup] = ret.preempted + swapped_out: list[SequenceGroup] = ret.swapped_out running_queue = self.running assert len(self._async_stopped) == 0 @@ -806,7 +806,7 @@ def _schedule_running( def _schedule_swapped( self, budget: SchedulingBudget, - curr_loras: Optional[Set[int]], + curr_loras: Optional[set[int]], enable_chunking: bool = False, ) -> SchedulerSwappedInOutputs: """Schedule sequence groups that are swapped out. @@ -829,15 +829,15 @@ def _schedule_swapped( SchedulerSwappedInOutputs. """ # Blocks that need to be swapped or copied before model execution. - blocks_to_swap_in: List[Tuple[int, int]] = [] - blocks_to_copy: List[Tuple[int, int]] = [] - decode_seq_groups: List[ScheduledSequenceGroup] = [] - prefill_seq_groups: List[ScheduledSequenceGroup] = [] - infeasible_seq_groups: List[SequenceGroup] = [] + blocks_to_swap_in: list[tuple[int, int]] = [] + blocks_to_copy: list[tuple[int, int]] = [] + decode_seq_groups: list[ScheduledSequenceGroup] = [] + prefill_seq_groups: list[ScheduledSequenceGroup] = [] + infeasible_seq_groups: list[SequenceGroup] = [] swapped_queue = self.swapped - leftover_swapped: Deque[SequenceGroup] = deque() + leftover_swapped: deque[SequenceGroup] = deque() while swapped_queue: seq_group = swapped_queue[0] @@ -939,7 +939,7 @@ def _get_prompt_limit(self, seq_group: SequenceGroup) -> int: return prompt_limit def _get_priority(self, - seq_group: SequenceGroup) -> Tuple[Optional[int], float]: + seq_group: SequenceGroup) -> tuple[Optional[int], float]: """Get the priority of the sequence group. 
Highest preference to user-defined priority, followed by arrival time. Args: @@ -967,7 +967,7 @@ def _schedule_priority_preemption( running_queue = deque(sorted(self.running, key=self._get_priority)) - blocks_to_swap_out: List[Tuple[int, int]] = [] + blocks_to_swap_out: list[tuple[int, int]] = [] force_preemption_count = 0 if waiting_queue: @@ -1017,7 +1017,7 @@ def _schedule_priority_preemption( def _schedule_prefills( self, budget: SchedulingBudget, - curr_loras: Optional[Set[int]], + curr_loras: Optional[set[int]], enable_chunking: bool = False, partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, ) -> SchedulerPrefillOutputs: @@ -1054,12 +1054,12 @@ def _schedule_prefills( num_lookahead_slots=self._get_num_lookahead_slots( is_prefill=True, enable_chunking=enable_chunking), ) - ignored_seq_groups: List[SequenceGroup] = [] - seq_groups: List[ScheduledSequenceGroup] = [] + ignored_seq_groups: list[SequenceGroup] = [] + seq_groups: list[ScheduledSequenceGroup] = [] waiting_queue = self.waiting - leftover_waiting_sequences: Deque[SequenceGroup] = deque() + leftover_waiting_sequences: deque[SequenceGroup] = deque() while self._passed_delay(time.time()) and waiting_queue: seq_group = waiting_queue[0] @@ -1162,7 +1162,7 @@ def _schedule_prefills( seq_group) if enable_chunking and self.scheduler_config.is_multi_step: - blocks_to_copy: List[Tuple[int, int]] = [] + blocks_to_copy: list[tuple[int, int]] = [] # init_multi_step_from_lookahead_slots happens in append_slots self._append_slots(seq_group, blocks_to_copy, enable_chunking) # This assert will trip when a copy-on-write happens. This is @@ -1325,7 +1325,7 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: token_budget=self.scheduler_config.max_num_batched_tokens, max_num_seqs=self.scheduler_config.max_num_seqs, ) - curr_loras: Set[int] = set() + curr_loras: set[int] = set() prefills = SchedulerPrefillOutputs.create_empty() swapped_in = SchedulerSwappedInOutputs.create_empty() @@ -1423,8 +1423,8 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: ) def _order_finishing_prefills_first( - self, scheduled_prefill_seqs: List[ScheduledSequenceGroup] - ) -> List[SequenceGroup]: + self, scheduled_prefill_seqs: list[ScheduledSequenceGroup] + ) -> list[SequenceGroup]: """Returns a list of prefilling SequenceGroups where sequences that are scheduled to finish prefilling are listed first""" finishing = [ @@ -1477,7 +1477,7 @@ def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool: def schedule( self - ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]: + ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, bool]: # Schedule sequence groups. # This function call changes the internal states of the scheduler # such as self.running, self.swapped, and self.waiting. @@ -1492,7 +1492,7 @@ def schedule( allow_async_output_proc: bool = self.use_async_output_proc # Create input data structures. 
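`_get_priority` above orders sequence groups by a (user priority, arrival time) tuple, so the running and waiting deques can simply be re-sorted with that key before priority preemption. A small sketch of that ordering using plain dicts as stand-ins for sequence groups; assuming lower values mean higher priority is a convention of this sketch only.

from collections import deque


def get_priority(seq_group: dict) -> tuple[int, float]:
    """Sort key: user-defined priority first, then arrival time."""
    return seq_group["priority"], seq_group["arrival_time"]


running: deque[dict] = deque([
    {"id": "a", "priority": 1, "arrival_time": 10.0},
    {"id": "b", "priority": 0, "arrival_time": 12.0},
    {"id": "c", "priority": 0, "arrival_time": 11.0},
])

running = deque(sorted(running, key=get_priority))
assert [g["id"] for g in running] == ["c", "b", "a"]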
- seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] for i, scheduled_seq_group in enumerate( scheduler_outputs.scheduled_seq_groups): seq_group = scheduled_seq_group.seq_group @@ -1505,9 +1505,9 @@ def schedule( seq_group_metadata.block_tables.clear() # seq_id -> SequenceData - seq_data: Dict[int, SequenceData] = {} + seq_data: dict[int, SequenceData] = {} # seq_id -> physical block numbers - block_tables: Dict[int, List[int]] = {} + block_tables: dict[int, list[int]] = {} if seq_group.is_encoder_decoder(): # Encoder associated with SequenceGroup @@ -1661,7 +1661,7 @@ def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None: self._free_finished_seqs(seq_group) def free_finished_seq_groups(self) -> None: - remaining: Deque[SequenceGroup] = deque() + remaining: deque[SequenceGroup] = deque() for seq_group in self.running: self._free_finished_seq_group(seq_group) if not seq_group.is_finished(): @@ -1689,7 +1689,7 @@ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None: def _append_slots( self, seq_group: SequenceGroup, - blocks_to_copy: List[Tuple[int, int]], + blocks_to_copy: list[tuple[int, int]], enable_chunking: bool = False, ) -> None: """Appends new slots to the sequences in the given sequence group. @@ -1697,7 +1697,7 @@ def _append_slots( Args: seq_group (SequenceGroup): The sequence group containing the sequences to append slots to. - blocks_to_copy (List[Tuple[int, int]]): A list of tuple of two + blocks_to_copy (list[tuple[int, int]]): A list of tuple of two ints, the first int is the source block index, and the second int is the destination block index. This list is updated with the new source and destination block indices for the appended @@ -1727,7 +1727,7 @@ def _append_slots( blocks_to_copy.extend(cows) def _preempt(self, seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode: + blocks_to_swap_out: list[tuple[int, int]]) -> PreemptionMode: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than # swapping. However, when the sequence group has multiple sequences @@ -1786,14 +1786,14 @@ def _preempt_by_recompute( def _preempt_by_swap( self, seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]], + blocks_to_swap_out: list[tuple[int, int]], ) -> None: self._swap_out(seq_group, blocks_to_swap_out) def _swap_in( self, seq_group: SequenceGroup, - blocks_to_swap_in: List[Tuple[int, int]], + blocks_to_swap_in: list[tuple[int, int]], ) -> None: mapping = self.block_manager.swap_in(seq_group) blocks_to_swap_in.extend(mapping) @@ -1803,7 +1803,7 @@ def _swap_in( def _swap_out( self, seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]], + blocks_to_swap_out: list[tuple[int, int]], ) -> None: if not self.block_manager.can_swap_out(seq_group): # FIXME(woosuk): Abort the sequence group instead of aborting the @@ -1867,7 +1867,7 @@ def _get_num_new_uncached_and_cached_tokens( enable_chunking: bool, budget: SchedulingBudget, partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> Tuple[int, int]: + ) -> tuple[int, int]: """ Returns the number of new uncached and cached tokens to schedule for a given sequence group that's in a given `status`. 
@@ -1982,7 +1982,7 @@ def _chunk_new_tokens_to_schedule( budget: SchedulingBudget, prompt_limit: int, num_new_tokens: int, - partial_prefill_budget_lookup_list: List[int], + partial_prefill_budget_lookup_list: list[int], partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, ) -> int: """ diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py index 7f63fc1437872..5aa6e114e3594 100644 --- a/vllm/device_allocator/cumem.py +++ b/vllm/device_allocator/cumem.py @@ -9,7 +9,7 @@ # the only successful approach is to call cuda driver API in C. import dataclasses from contextlib import contextmanager -from typing import Any, Callable, Dict, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union import torch @@ -61,7 +61,7 @@ def find_loaded_library(lib_name) -> Optional[str]: libcudart = None # py_device, py_alignedSize, py_d_mem, py_p_memHandle -HandleType = Tuple[int, int, int, int] +HandleType = tuple[int, int, int, int] @dataclasses.dataclass @@ -140,9 +140,9 @@ def get_instance() -> "CuMemAllocator": return CuMemAllocator.instance def __init__(self): - self.pointer_to_data: Dict[int, AllocationData] = {} + self.pointer_to_data: dict[int, AllocationData] = {} self.current_tag: str = CuMemAllocator.default_tag - self.allocator_and_pools: Dict[str, Any] = {} + self.allocator_and_pools: dict[str, Any] = {} def python_malloc_callback(self, allocation_handle: HandleType) -> None: """ @@ -164,7 +164,7 @@ def python_free_callback(self, ptr: int) -> HandleType: def sleep( self, - offload_tags: Optional[Union[Tuple[str, ...], + offload_tags: Optional[Union[tuple[str, ...], str]] = None) -> None: """ Put the allocator in sleep mode. diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 0228264f91f9a..96af7a64c2fca 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, Optional, Union +from typing import Any, Optional, Union import torch import torch.distributed @@ -26,7 +26,7 @@ def tensor_model_parallel_gather(input_: torch.Tensor, return get_tp_group().gather(input_, dst, dim) -def broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor, +def broadcast_tensor_dict(tensor_dict: Optional[dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0): if not torch.distributed.is_initialized(): diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py index 1d53b1c5b8099..6c15ef644b8c2 100644 --- a/vllm/distributed/device_communicators/cuda_wrapper.py +++ b/vllm/distributed/device_communicators/cuda_wrapper.py @@ -6,7 +6,7 @@ import ctypes from dataclasses import dataclass -from typing import Any, Dict, List, Optional +from typing import Any, Optional # this line makes it possible to directly load `libcudart.so` using `ctypes` import torch # noqa @@ -32,7 +32,7 @@ class cudaIpcMemHandle_t(ctypes.Structure): class Function: name: str restype: Any - argtypes: List[Any] + argtypes: list[Any] def find_loaded_library(lib_name) -> Optional[str]: @@ -97,11 +97,11 @@ class CudaRTLibrary: # class attribute to store the mapping from the path to the library # to avoid loading the same library multiple times - path_to_library_cache: Dict[str, Any] = {} + path_to_library_cache: dict[str, Any] = {} # class attribute to store the mapping from library path # to the corresponding dictionary - path_to_dict_mapping: Dict[str, Dict[str, 
Any]] = {} + path_to_dict_mapping: dict[str, dict[str, Any]] = {} def __init__(self, so_file: Optional[str] = None): if so_file is None: diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 90f7f2d0f9823..46efa72ed101c 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -2,7 +2,7 @@ import ctypes from contextlib import contextmanager -from typing import List, Optional, Union +from typing import Optional, Union import torch import torch.distributed as dist @@ -177,7 +177,7 @@ def __init__(self, @staticmethod def create_shared_buffer( size_in_bytes: int, - group: Optional[ProcessGroup] = None) -> List[int]: + group: Optional[ProcessGroup] = None) -> list[int]: """ Creates a shared buffer and returns a list of pointers representing the buffer on all processes in the group. @@ -190,7 +190,7 @@ def create_shared_buffer( handles = [None] * world_size dist.all_gather_object(handles, handle, group=group) - pointers: List[int] = [] + pointers: list[int] = [] for i, h in enumerate(handles): if i == rank: pointers.append(pointer.value) # type: ignore @@ -201,7 +201,7 @@ def create_shared_buffer( return pointers @staticmethod - def free_shared_buffer(pointers: List[int], + def free_shared_buffer(pointers: list[int], group: Optional[ProcessGroup] = None, rank: Optional[int] = None) -> None: if rank is None: diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index d8d6eed2dd7ec..11b8b57fe2aed 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -7,8 +7,9 @@ import subprocess import sys import tempfile +from collections.abc import Sequence from itertools import product -from typing import Dict, List, Optional, Sequence +from typing import Optional import torch.distributed as dist import torch.multiprocessing as mp @@ -149,7 +150,7 @@ def can_actually_p2p( p_src.join() p_tgt.join() assert p_src.exitcode == 0 and p_tgt.exitcode == 0 - result: List[bool] = [] + result: list[bool] = [] for src, tgt in zip(batch_src, batch_tgt): a = result_queue.get() b = result_queue.get() @@ -175,7 +176,7 @@ def can_actually_p2p( # e.g. used by different vllm engines. The device id in the cache file is a # **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number # of visible devices in the vllm engine. 
-_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None +_gpu_p2p_access_cache: Optional[dict[str, bool]] = None def gpu_p2p_access_check(src: int, tgt: int) -> bool: @@ -204,7 +205,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: # only the local master process (with local_rank == 0) can # enter this block to calculate the cache logger.info("generating GPU P2P access cache in %s", path) - cache: Dict[str, bool] = {} + cache: dict[str, bool] = {} ids = list(range(num_dev)) # batch of all pairs of GPUs batch_src, batch_tgt = zip(*list(product(ids, ids))) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 4f04899e92e6d..6f69089b61968 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -24,7 +24,7 @@ import ctypes import platform from dataclasses import dataclass -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch from torch.distributed import ReduceOp @@ -121,7 +121,7 @@ def from_torch(cls, op: ReduceOp) -> int: class Function: name: str restype: Any - argtypes: List[Any] + argtypes: list[Any] class NCCLLibrary: @@ -210,11 +210,11 @@ class NCCLLibrary: # class attribute to store the mapping from the path to the library # to avoid loading the same library multiple times - path_to_library_cache: Dict[str, Any] = {} + path_to_library_cache: dict[str, Any] = {} # class attribute to store the mapping from library path # to the corresponding dictionary - path_to_dict_mapping: Dict[str, Dict[str, Any]] = {} + path_to_dict_mapping: dict[str, dict[str, Any]] = {} def __init__(self, so_file: Optional[str] = None): @@ -238,7 +238,7 @@ def __init__(self, so_file: Optional[str] = None): raise e if so_file not in NCCLLibrary.path_to_dict_mapping: - _funcs: Dict[str, Any] = {} + _funcs: dict[str, Any] = {} for func in NCCLLibrary.exported_functions: f = getattr(self.lib, func.name) f.restype = func.restype diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 12a720d47fbba..1361207e04763 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -7,7 +7,7 @@ from contextlib import contextmanager from dataclasses import dataclass, field from multiprocessing import shared_memory -from typing import List, Optional, Tuple, Union +from typing import Optional, Union from unittest.mock import patch import torch @@ -166,9 +166,9 @@ def get_metadata(self, current_idx: int): @dataclass class Handle: - local_reader_ranks: List[int] = field(default_factory=list) + local_reader_ranks: list[int] = field(default_factory=list) - buffer_handle: Optional[Tuple[int, int, int, str]] = None + buffer_handle: Optional[tuple[int, int, int, str]] = None local_subscribe_addr: Optional[str] = None remote_subscribe_addr: Optional[str] = None remote_addr_ipv6: bool = False @@ -180,7 +180,7 @@ def __init__( self, n_reader, # number of all readers n_local_reader, # number of local readers through shared memory - local_reader_ranks: Optional[List[int]] = None, + local_reader_ranks: Optional[list[int]] = None, max_chunk_bytes: int = 1024 * 1024 * 10, max_chunks: int = 10, connect_ip: Optional[str] = None, diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py index 57c764b481c29..92a472bbe600a 100644 --- 
a/vllm/distributed/kv_transfer/kv_connector/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/base.py @@ -8,7 +8,7 @@ """ from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List, Tuple, Union +from typing import TYPE_CHECKING, Union import torch @@ -54,7 +54,7 @@ def send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: @@ -70,7 +70,7 @@ def send_kv_caches_and_hidden_states( start and end layer information. model_input (ModelInputForGPUWithSamplingMetadata): The input metadata from vLLM. - kv_caches (List[torch.Tensor]): List of KV caches (keys and values) + kv_caches (list[torch.Tensor]): list of KV caches (keys and values) for each layer. hidden_or_intermediate_states (Union[torch.Tensor, IntermediateTensors]): @@ -87,8 +87,8 @@ def send_kv_caches_and_hidden_states( def recv_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor] - ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + kv_caches: list[torch.Tensor] + ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: """ Receive KV caches and hidden states from the connector. @@ -103,8 +103,8 @@ def recv_kv_caches_and_hidden_states( The model executable from vLLM modelrunner. model_input (ModelInputForGPUWithSamplingMetadata): The model input from vLLM modelrunner. - kv_caches (List[torch.Tensor]): - List of KV caches for each layer. + kv_caches (list[torch.Tensor]): + list of KV caches for each layer. Returns: - hidden_or_intermediate_states (torch.Tensor or diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 7336c54ec8a30..4f45f29f36a07 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import importlib -from typing import TYPE_CHECKING, Callable, Dict, Type +from typing import TYPE_CHECKING, Callable from .base import KVConnectorBase @@ -10,7 +10,7 @@ class KVConnectorFactory: - _registry: Dict[str, Callable[[], Type[KVConnectorBase]]] = {} + _registry: dict[str, Callable[[], type[KVConnectorBase]]] = {} @classmethod def register_connector(cls, name: str, module_path: str, @@ -19,7 +19,7 @@ def register_connector(cls, name: str, module_path: str, if name in cls._registry: raise ValueError(f"Connector '{name}' is already registered.") - def loader() -> Type[KVConnectorBase]: + def loader() -> type[KVConnectorBase]: module = importlib.import_module(module_path) return getattr(module, class_name) diff --git a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py index bf9117133af56..4f003eefa4aa8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py @@ -7,7 +7,7 @@ (2) offload and share KV caches. 
""" -from typing import TYPE_CHECKING, List, Tuple, Union +from typing import TYPE_CHECKING, Union import torch @@ -61,8 +61,8 @@ def __init__( def recv_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor] - ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + kv_caches: list[torch.Tensor] + ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: hidden_or_intermediate_states = None @@ -80,7 +80,7 @@ def send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index 2033e9762ac0b..0f092c890ecee 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -8,7 +8,7 @@ But the logic can be extended to support other pipe and lookup buffer. """ -from typing import TYPE_CHECKING, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Union import torch @@ -132,7 +132,7 @@ def __init__( ) def select(self, input_tokens: Optional[torch.Tensor], - roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: + roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]: assert self.consumer_buffer is not None, "Please initialize the "\ "consumer buffer before calling select." @@ -151,7 +151,7 @@ def send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: @@ -202,8 +202,8 @@ def send_kv_caches_and_hidden_states( def recv_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor] - ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + kv_caches: list[torch.Tensor] + ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: # When bypass_model_exec is set to False, it means that at least for one diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index 845da7c501e88..f9a0e22f918fa 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -9,7 +9,7 @@ """ from abc import ABC, abstractmethod -from typing import List, Optional +from typing import Optional import torch @@ -71,7 +71,7 @@ def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, @abstractmethod def drop_select( self, input_tokens: Optional[torch.Tensor], - roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: + roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]: """Select and *drop* KV cache entries from the lookup buffer. The functionality is similar to the following python statements @@ -89,7 +89,7 @@ def drop_select( roi (torch.Tensor): A binary mask on top of the input tokens Returns: - List[Optional[torch.Tensor]]: A list of tensors. Can be None. + list[Optional[torch.Tensor]]: A list of tensors. 
Can be None. Raises: NotImplementedError: This method must be implemented in subclasses. diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index 3462f7de020ef..641762c199c46 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -11,7 +11,7 @@ """ import threading from collections import deque -from typing import Deque, List, Optional, Union +from typing import Optional, Union import torch @@ -38,7 +38,7 @@ def __init__(self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, data_pipe: on device (e.g. GPU) """ - self.buffer: Deque[List[torch.Tensor]] = deque() + self.buffer: deque[list[torch.Tensor]] = deque() self.buffer_size = 0 self.buffer_size_threshold = buffer_size_thresh @@ -50,8 +50,8 @@ def __init__(self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, self.normal_signal = torch.tensor([0], device="cpu") self.end_signal = None - def _matches(self, tokens_roi_sender: List[torch.Tensor], - tokens_roi_recver: List[torch.Tensor]): + def _matches(self, tokens_roi_sender: list[torch.Tensor], + tokens_roi_recver: list[torch.Tensor]): # tokens_roi_sender: tokens and roi of the producer (in the buffer) # tokens_roi_recver: tokens and roi of the consumer (query) @@ -88,7 +88,7 @@ def _send_tensor_and_dec_size(self, tensor = tensor.float() self.data_pipe.send_tensor(tensor) - def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]): + def _get_element_size(self, data: Optional[Union[list, torch.Tensor]]): if isinstance(data, torch.Tensor): return data.element_size() * data.numel() @@ -151,7 +151,7 @@ def drop_select_handler(self): tokens_roi_recver = [input_tokens, roi] def is_buffer_available( - tokens_roi_recver: List[torch.Tensor], ) -> bool: + tokens_roi_recver: list[torch.Tensor], ) -> bool: # perform input tokens and roi matching # FIXME: this matching is O(n), ideally it should be O(1) # but this buffer size won't (and shouldn't) be too large so @@ -184,7 +184,7 @@ def is_buffer_available( def drop_select( self, input_tokens: Optional[torch.Tensor], - roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: + roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]: assert self.request_handling_thread is None, \ "drop_select should be called by the KV cache consumer "\ diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index 7aa53d07a9ef2..37e1e4d8269f8 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -15,7 +15,7 @@ import threading import time from concurrent.futures import ThreadPoolExecutor -from typing import Callable, Dict, Optional, Tuple +from typing import Callable, Optional import torch @@ -35,7 +35,7 @@ def __init__(self, message): super().__init__(self.message) -Metadata = Dict[str, Optional[torch.Tensor]] +Metadata = dict[str, Optional[torch.Tensor]] class PyNcclPipe(KVPipeBase): @@ -81,7 +81,7 @@ def __init__(self, def _get_device_send_recv_impl( self, group: StatelessProcessGroup - ) -> Tuple[Callable[[torch.Tensor, int], None], Callable[ + ) -> tuple[Callable[[torch.Tensor, int], None], Callable[ [torch.Tensor, int], None]]: send: Callable[[torch.Tensor, int], None] diff --git a/vllm/distributed/kv_transfer/kv_transfer_agent.py b/vllm/distributed/kv_transfer/kv_transfer_agent.py index 1e80e0bd7de86..2873ae9a86b97 100644 --- 
a/vllm/distributed/kv_transfer/kv_transfer_agent.py +++ b/vllm/distributed/kv_transfer/kv_transfer_agent.py @@ -5,7 +5,7 @@ 1. `send_kv_caches_and_hidden_states` 2. `recv_kv_caches_and_hidden_states """ -from typing import TYPE_CHECKING, List, Tuple, Union +from typing import TYPE_CHECKING, Union if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata @@ -53,7 +53,7 @@ def send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: @@ -68,8 +68,8 @@ def close(self) -> None: def recv_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor] - ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + kv_caches: list[torch.Tensor] + ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: return self.connector.recv_kv_caches_and_hidden_states( diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 86166dd5bb831..dd38b699b39dd 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -29,8 +29,7 @@ from contextlib import contextmanager, nullcontext from dataclasses import dataclass from multiprocessing import shared_memory -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Union) +from typing import TYPE_CHECKING, Any, Callable, Optional, Union from unittest.mock import patch import torch @@ -59,15 +58,15 @@ class GraphCaptureContext: def _split_tensor_dict( - tensor_dict: Dict[str, Union[torch.Tensor, Any]] -) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]: + tensor_dict: dict[str, Union[torch.Tensor, Any]] +) -> tuple[list[tuple[str, Any]], list[torch.Tensor]]: """Split the tensor dictionary into two parts: 1. A list of (key, value) pairs. If the value is a tensor, it is replaced by its metadata. 2. A list of tensors. 
""" - metadata_list: List[Tuple[str, Any]] = [] - tensor_list: List[torch.Tensor] = [] + metadata_list: list[tuple[str, Any]] = [] + tensor_list: list[torch.Tensor] = [] for key, value in tensor_dict.items(): if isinstance(value, torch.Tensor): # Note: we cannot use `value.device` here, @@ -83,7 +82,7 @@ def _split_tensor_dict( return metadata_list, tensor_list -_group_name_counter: Dict[str, int] = {} +_group_name_counter: dict[str, int] = {} def _get_unique_name(name: str) -> str: @@ -99,7 +98,7 @@ def _get_unique_name(name: str) -> str: return newname -_groups: Dict[str, Callable[[], Optional["GroupCoordinator"]]] = {} +_groups: dict[str, Callable[[], Optional["GroupCoordinator"]]] = {} def _register_group(group: "GroupCoordinator") -> None: @@ -139,7 +138,7 @@ class GroupCoordinator: # available attributes: rank: int # global rank - ranks: List[int] # global ranks in the group + ranks: list[int] # global ranks in the group world_size: int # size of the group # difference between `local_rank` and `rank_in_group`: # if we have a group of size 4 across two nodes: @@ -158,7 +157,7 @@ class GroupCoordinator: def __init__( self, - group_ranks: List[List[int]], + group_ranks: list[list[int]], local_rank: int, torch_distributed_backend: Union[str, Backend], use_device_communicator: bool, @@ -377,7 +376,7 @@ def broadcast_object(self, obj: Optional[Any] = None, src: int = 0): return recv[0] def broadcast_object_list(self, - obj_list: List[Any], + obj_list: list[Any], src: int = 0, group: Optional[ProcessGroup] = None): """Broadcast the input object list. @@ -460,11 +459,11 @@ def recv_object(self, src: int) -> Any: def broadcast_tensor_dict( self, - tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None, + tensor_dict: Optional[dict[str, Union[torch.Tensor, Any]]] = None, src: int = 0, group: Optional[ProcessGroup] = None, metadata_group: Optional[ProcessGroup] = None - ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: + ) -> Optional[dict[str, Union[torch.Tensor, Any]]]: """Broadcast the input tensor dictionary. NOTE: `src` is the local rank of the source rank. """ @@ -478,7 +477,7 @@ def broadcast_tensor_dict( rank_in_group = self.rank_in_group if rank_in_group == src: - metadata_list: List[Tuple[Any, Any]] = [] + metadata_list: list[tuple[Any, Any]] = [] assert isinstance( tensor_dict, dict), (f"Expecting a dictionary, got {type(tensor_dict)}") @@ -545,10 +544,10 @@ def broadcast_tensor_dict( def send_tensor_dict( self, - tensor_dict: Dict[str, Union[torch.Tensor, Any]], + tensor_dict: dict[str, Union[torch.Tensor, Any]], dst: Optional[int] = None, all_gather_group: Optional["GroupCoordinator"] = None, - ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: + ) -> Optional[dict[str, Union[torch.Tensor, Any]]]: """Send the input tensor dictionary. NOTE: `dst` is the local rank of the source rank. """ @@ -568,7 +567,7 @@ def send_tensor_dict( dst = (self.rank_in_group + 1) % self.world_size assert dst < self.world_size, f"Invalid dst rank ({dst})" - metadata_list: List[Tuple[Any, Any]] = [] + metadata_list: list[tuple[Any, Any]] = [] assert isinstance( tensor_dict, dict), f"Expecting a dictionary, got {type(tensor_dict)}" @@ -603,7 +602,7 @@ def recv_tensor_dict( self, src: Optional[int] = None, all_gather_group: Optional["GroupCoordinator"] = None, - ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: + ) -> Optional[dict[str, Union[torch.Tensor, Any]]]: """Recv the input tensor dictionary. NOTE: `src` is the local rank of the source rank. 
""" @@ -624,7 +623,7 @@ def recv_tensor_dict( assert src < self.world_size, f"Invalid src rank ({src})" recv_metadata_list = self.recv_object(src=src) - tensor_dict: Dict[str, Any] = {} + tensor_dict: dict[str, Any] = {} for key, value in recv_metadata_list: if isinstance(value, TensorMetadata): tensor = torch.empty(value.size, @@ -708,7 +707,7 @@ def get_world_group() -> GroupCoordinator: return _WORLD -def init_world_group(ranks: List[int], local_rank: int, +def init_world_group(ranks: list[int], local_rank: int, backend: str) -> GroupCoordinator: return GroupCoordinator( group_ranks=[ranks], @@ -720,7 +719,7 @@ def init_world_group(ranks: List[int], local_rank: int, def init_model_parallel_group( - group_ranks: List[List[int]], + group_ranks: list[list[int]], local_rank: int, backend: str, use_message_queue_broadcaster: bool = False, @@ -1089,7 +1088,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup], - source_rank: int = 0) -> List[bool]: + source_rank: int = 0) -> list[bool]: """ This is a collective operation that returns if each rank is in the same node as the source rank. It tests if processes are attached to the same diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index d6fca4f0221b8..b518e6272aedb 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -8,7 +8,8 @@ import pickle import time from collections import deque -from typing import Any, Deque, Dict, Optional, Sequence, Tuple +from collections.abc import Sequence +from typing import Any, Optional import torch from torch.distributed import ProcessGroup, TCPStore @@ -65,7 +66,7 @@ def split_tensor_along_last_dim( def get_pp_indices(num_hidden_layers: int, pp_rank: int, - pp_size: int) -> Tuple[int, int]: + pp_size: int) -> tuple[int, int]: """Try to evenly distribute layers across partitions. If the number of layers is not divisible by the number of partitions, @@ -123,15 +124,15 @@ class StatelessProcessGroup: data_expiration_seconds: int = 3600 # 1 hour # dst rank -> counter - send_dst_counter: Dict[int, int] = dataclasses.field(default_factory=dict) + send_dst_counter: dict[int, int] = dataclasses.field(default_factory=dict) # src rank -> counter - recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict) + recv_src_counter: dict[int, int] = dataclasses.field(default_factory=dict) broadcast_send_counter: int = 0 - broadcast_recv_src_counter: Dict[int, int] = dataclasses.field( + broadcast_recv_src_counter: dict[int, int] = dataclasses.field( default_factory=dict) # A deque to store the data entries, with key and timestamp. 
- entries: Deque[Tuple[str, + entries: deque[tuple[str, float]] = dataclasses.field(default_factory=deque) def __post_init__(self): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 1a2f794c9151d..187d8ebee9d55 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -3,9 +3,9 @@ import argparse import dataclasses import json +from collections.abc import Mapping from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Mapping, Optional, - Tuple, Type, Union, cast, get_args) +from typing import TYPE_CHECKING, Any, Literal, Optional, Union, cast, get_args import torch @@ -65,7 +65,7 @@ def nullable_kvs(val: str) -> Optional[Mapping[str, int]]: if len(val) == 0: return None - out_dict: Dict[str, int] = {} + out_dict: dict[str, int] = {} for item in val.split(","): kv_parts = [part.lower().strip() for part in item.split("=")] if len(kv_parts) != 2: @@ -91,7 +91,7 @@ def nullable_kvs(val: str) -> Optional[Mapping[str, int]]: class EngineArgs: """Arguments for vLLM engine.""" model: str = 'facebook/opt-125m' - served_model_name: Optional[Union[str, List[str]]] = None + served_model_name: Optional[Union[str, list[str]]] = None tokenizer: Optional[str] = None hf_config_path: Optional[str] = None task: TaskOption = "auto" @@ -110,7 +110,7 @@ class EngineArgs: # is intended for expert use only. The API may change without # notice. distributed_executor_backend: Optional[Union[str, - Type[ExecutorBase]]] = None + type[ExecutorBase]]] = None # number of P/D disaggregation (or other disaggregation) workers pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 @@ -131,7 +131,7 @@ class EngineArgs: disable_log_stats: bool = False revision: Optional[str] = None code_revision: Optional[str] = None - rope_scaling: Optional[Dict[str, Any]] = None + rope_scaling: Optional[dict[str, Any]] = None rope_theta: Optional[float] = None hf_overrides: Optional[HfOverrides] = None tokenizer_revision: Optional[str] = None @@ -143,10 +143,10 @@ class EngineArgs: # Note: Specifying a tokenizer pool by passing a class # is intended for expert use only. The API may change without # notice. 
- tokenizer_pool_type: Union[str, Type["BaseTokenizerGroup"]] = "ray" - tokenizer_pool_extra_config: Optional[Dict[str, Any]] = None + tokenizer_pool_type: Union[str, type["BaseTokenizerGroup"]] = "ray" + tokenizer_pool_extra_config: Optional[dict[str, Any]] = None limit_mm_per_prompt: Optional[Mapping[str, int]] = None - mm_processor_kwargs: Optional[Dict[str, Any]] = None + mm_processor_kwargs: Optional[dict[str, Any]] = None disable_mm_preprocessor_cache: bool = False enable_lora: bool = False enable_lora_bias: bool = False @@ -157,7 +157,7 @@ class EngineArgs: max_prompt_adapter_token: int = 0 fully_sharded_loras: bool = False lora_extra_vocab_size: int = 256 - long_lora_scaling_factors: Optional[Tuple[float]] = None + long_lora_scaling_factors: Optional[tuple[float]] = None lora_dtype: Optional[Union[str, torch.dtype]] = 'auto' max_cpu_loras: Optional[int] = None device: str = 'auto' @@ -167,7 +167,7 @@ class EngineArgs: num_gpu_blocks_override: Optional[int] = None num_lookahead_slots: int = 0 model_loader_extra_config: Optional[dict] = None - ignore_patterns: Optional[Union[str, List[str]]] = None + ignore_patterns: Optional[Union[str, list[str]]] = None preemption_mode: Optional[str] = None scheduler_delay_factor: float = 0.0 @@ -196,9 +196,9 @@ class EngineArgs: collect_detailed_traces: Optional[str] = None disable_async_output_proc: bool = False scheduling_policy: Literal["fcfs", "priority"] = "fcfs" - scheduler_cls: Union[str, Type[object]] = "vllm.core.scheduler.Scheduler" + scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler" - override_neuron_config: Optional[Dict[str, Any]] = None + override_neuron_config: Optional[dict[str, Any]] = None override_pooler_config: Optional[PoolerConfig] = None compilation_config: Optional[CompilationConfig] = None worker_cls: str = "auto" @@ -206,13 +206,13 @@ class EngineArgs: kv_transfer_config: Optional[KVTransferConfig] = None generation_config: Optional[str] = None - override_generation_config: Optional[Dict[str, Any]] = None + override_generation_config: Optional[dict[str, Any]] = None enable_sleep_mode: bool = False model_impl: str = "auto" calculate_kv_scales: Optional[bool] = None - additional_config: Optional[Dict[str, Any]] = None + additional_config: Optional[dict[str, Any]] = None def __post_init__(self): if not self.tokenizer: diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 93d9b74d8e1e8..1f7041eff149e 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -4,9 +4,9 @@ import copy import time import weakref +from collections.abc import AsyncGenerator, Coroutine, Iterable, Mapping from functools import partial -from typing import (Any, AsyncGenerator, Callable, Coroutine, Dict, Iterable, - List, Mapping, Optional, Set, Tuple, Type, Union, overload) +from typing import Any, Callable, Optional, Union, overload from weakref import ReferenceType from typing_extensions import deprecated @@ -93,7 +93,7 @@ def put(self, item: Union[RequestOutput, PoolingRequestOutput, def finish( self, - exception: Optional[Union[BaseException, Type[BaseException]]] = None, + exception: Optional[Union[BaseException, type[BaseException]]] = None, ) -> None: if not self._finished: self._finished = True @@ -130,9 +130,9 @@ class RequestTracker: """Synchronous abstraction for tracking requests.""" def __init__(self) -> None: - self._request_streams: Dict[str, AsyncStream] = {} + self._request_streams: dict[str, AsyncStream] = {} self._aborted_requests: asyncio.Queue[str] 
= asyncio.Queue() - self._new_requests: asyncio.Queue[Tuple[AsyncStream, + self._new_requests: asyncio.Queue[tuple[AsyncStream, dict]] = asyncio.Queue() self.new_requests_event = asyncio.Event() @@ -216,7 +216,7 @@ def abort_request(self, request_id: str, *, exception: Optional[Union[BaseException, - Type[BaseException]]] = None, + type[BaseException]]] = None, verbose: bool = False) -> None: """Abort a request during next background loop iteration.""" if verbose: @@ -228,11 +228,11 @@ def abort_request(self, if stream is not None: stream.finish(exception=exception) - def get_new_and_aborted_requests(self) -> Tuple[List[Dict], Set[str]]: + def get_new_and_aborted_requests(self) -> tuple[list[dict], set[str]]: """Get the new requests and finished requests to be sent to the engine.""" - new_requests: List[Dict] = [] - finished_requests: Set[str] = set() + new_requests: list[dict] = [] + finished_requests: set[str] = set() while not self._aborted_requests.empty(): request_id = self._aborted_requests.get_nowait() @@ -268,7 +268,7 @@ def __init__(self, *args, **kwargs): async def step_async( self, virtual_engine: int - ) -> List[Union[RequestOutput, PoolingRequestOutput]]: + ) -> list[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. The workers are ran asynchronously if possible. @@ -583,7 +583,7 @@ class AsyncLLMEngine(EngineClient): **kwargs: Arguments for :class:`LLMEngine`. """ - _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine + _engine_class: type[_AsyncLLMEngine] = _AsyncLLMEngine def __init__(self, *args, @@ -621,7 +621,7 @@ def __del__(self): @classmethod def _get_executor_cls(cls, - engine_config: VllmConfig) -> Type[ExecutorBase]: + engine_config: VllmConfig) -> type[ExecutorBase]: return LLMEngine._get_executor_cls(engine_config) @classmethod @@ -631,7 +631,7 @@ def from_engine_args( engine_config: Optional[VllmConfig] = None, start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, ) -> "AsyncLLMEngine": """Creates an async LLM engine from the engine arguments.""" # Create the engine configs. 
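Class-object annotations follow the same rule: `typing.Type[X]` becomes the builtin `type[X]`, as in `_engine_class`, `_get_executor_cls` and the connector factory earlier in the patch. A small sketch of that pattern with a hypothetical registry, not vLLM's actual factory:

    from typing import Callable, Optional

    class ConnectorBase:
        """Stand-in base class for the sketch."""

    # Loader callables return a class object, hence type[ConnectorBase].
    _registry: dict[str, Callable[[], type[ConnectorBase]]] = {}

    def register(name: str, loader: Callable[[], type[ConnectorBase]]) -> None:
        _registry[name] = loader

    def create(name: str) -> Optional[ConnectorBase]:
        loader = _registry.get(name)
        return loader()() if loader is not None else None

`typing.Callable` is left untouched here, matching the surrounding hunks, although `collections.abc.Callable` would also work on Python 3.9+.
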
@@ -1156,7 +1156,7 @@ async def get_lora_config(self) -> LoRAConfig: async def do_log_stats( self, scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None) -> None: + model_output: Optional[list[SamplerOutput]] = None) -> None: self.engine.do_log_stats() async def check_health(self) -> None: diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py index aa54c0693941f..88b4a3e9e7e81 100644 --- a/vllm/engine/async_timeout.py +++ b/vllm/engine/async_timeout.py @@ -10,7 +10,7 @@ import sys import warnings from types import TracebackType -from typing import Any, Optional, Type +from typing import Any, Optional if sys.version_info[:2] >= (3, 11): from asyncio import timeout as asyncio_timeout @@ -77,7 +77,7 @@ def __enter__(self) -> "Timeout": def __exit__( self, - exc_type: Optional[Type[BaseException]], + exc_type: Optional[type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> Optional[bool]: @@ -90,7 +90,7 @@ async def __aenter__(self) -> "Timeout": async def __aexit__( self, - exc_type: Optional[Type[BaseException]], + exc_type: Optional[type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> Optional[bool]: @@ -173,7 +173,7 @@ def _do_enter(self) -> None: self._state = _State.ENTER self._reschedule() - def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None: + def _do_exit(self, exc_type: Optional[type[BaseException]]) -> None: if exc_type is asyncio.CancelledError and \ self._state == _State.TIMEOUT: self._timeout_handler = None diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3dee4dab4c47e..9e4f6e7535063 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -4,13 +4,13 @@ import time from collections import Counter as collectionsCounter from collections import deque +from collections.abc import Iterable, Mapping +from collections.abc import Sequence as GenericSequence from contextlib import contextmanager from dataclasses import dataclass from functools import partial -from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable, - List, Mapping, NamedTuple, Optional) -from typing import Sequence as GenericSequence -from typing import Set, Type, Union, cast, overload +from typing import (TYPE_CHECKING, Callable, ClassVar, NamedTuple, Optional, + Union, cast, overload) import torch from typing_extensions import TypeVar, deprecated @@ -72,15 +72,15 @@ @dataclass class SchedulerOutputState: """Caches the scheduler outputs for a virtual engine. Used for Multi-Step""" - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None + seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] = None scheduler_outputs: Optional[SchedulerOutputs] = None allow_async_output_proc: bool = False last_output: Optional[SamplerOutput] = None class OutputData(NamedTuple): - outputs: List[SamplerOutput] - seq_group_metadata_list: List[SequenceGroupMetadata] + outputs: list[SamplerOutput] + seq_group_metadata_list: list[SequenceGroupMetadata] scheduler_outputs: SchedulerOutputs is_async: bool is_last_step: bool @@ -90,23 +90,23 @@ class OutputData(NamedTuple): # is_first_step_output is invalid when `outputs` has # outputs from multiple steps. 
is_first_step_output: Optional[bool] - skip: List[int] + skip: list[int] class SchedulerContext: def __init__(self, multi_step_stream_outputs: bool = False): - self.output_queue: Deque[OutputData] = deque() - self.request_outputs: List[Union[RequestOutput, + self.output_queue: deque[OutputData] = deque() + self.request_outputs: list[Union[RequestOutput, PoolingRequestOutput]] = [] self.seq_group_metadata_list: Optional[ - List[SequenceGroupMetadata]] = None + list[SequenceGroupMetadata]] = None self.scheduler_outputs: Optional[SchedulerOutputs] = None self.multi_step_stream_outputs: bool = multi_step_stream_outputs - def append_output(self, outputs: List[SamplerOutput], - seq_group_metadata_list: List[SequenceGroupMetadata], + def append_output(self, outputs: list[SamplerOutput], + seq_group_metadata_list: list[SequenceGroupMetadata], scheduler_outputs: SchedulerOutputs, is_async: bool, is_last_step: bool, is_first_step_output: Optional[bool]): @@ -170,7 +170,7 @@ def enable_output_validation(cls): def validate_output( cls, output: object, - output_type: Type[_O], + output_type: type[_O], ) -> _O: do_validate = cls.DO_VALIDATE_OUTPUT @@ -185,11 +185,11 @@ def validate_output( def validate_outputs( cls, outputs: GenericSequence[object], - output_type: Type[_O], - ) -> List[_O]: + output_type: type[_O], + ) -> list[_O]: do_validate = cls.DO_VALIDATE_OUTPUT - outputs_: List[_O] + outputs_: list[_O] if TYPE_CHECKING or do_validate: outputs_ = [] for output in outputs: @@ -208,10 +208,10 @@ def validate_outputs( def __init__( self, vllm_config: VllmConfig, - executor_class: Type[ExecutorBase], + executor_class: type[ExecutorBase], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, @@ -409,7 +409,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: ), )) - self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {} + self.seq_id_to_seq_group: dict[str, SequenceGroupBase] = {} # Flag to set when an input fails to process and the engine should run # the next step without re-scheduling. @@ -443,7 +443,7 @@ def _initialize_kv_caches(self) -> None: @classmethod def _get_executor_cls(cls, - engine_config: VllmConfig) -> Type[ExecutorBase]: + engine_config: VllmConfig) -> type[ExecutorBase]: # distributed_executor_backend must be set in VllmConfig.__post_init__ distributed_executor_backend = ( engine_config.parallel_config.distributed_executor_backend) @@ -484,7 +484,7 @@ def from_engine_args( cls, engine_args: EngineArgs, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" # Create the engine configs. 
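The other recurring change is that abstract container types now come from `collections.abc` rather than `typing`, as with the `Iterable`, `Mapping` and `Sequence` imports above, while `Deque[...]` becomes a subscripted `collections.deque`. A brief sketch under the same convention, with hypothetical helpers rather than engine code:

    from collections import deque
    from collections.abc import Iterable, Mapping, Sequence

    def recent_tokens(items: Sequence[int], limits: Mapping[str, int]) -> deque[int]:
        # Keep only the most recent `max` items, defaulting to 8.
        window: deque[int] = deque(maxlen=limits.get("max", 8))
        window.extend(items)
        return window

    def count_tokens(batches: Iterable[Sequence[int]]) -> int:
        return sum(len(batch) for batch in batches)
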
@@ -514,7 +514,7 @@ def __del__(self): def get_tokenizer_group( self, - group_type: Type[_G] = BaseTokenizerGroup, + group_type: type[_G] = BaseTokenizerGroup, ) -> _G: tokenizer_group = self.tokenizer @@ -937,7 +937,7 @@ def reset_prefix_cache(self) -> bool: @staticmethod def _process_sequence_group_outputs( seq_group: SequenceGroup, - outputs: List[PoolingSequenceGroupOutput], + outputs: list[PoolingSequenceGroupOutput], ) -> None: seq_group.pooled_data = outputs[0].data @@ -1016,7 +1016,7 @@ def _process_model_outputs(self, scheduler_outputs.scheduled_seq_groups) has_multiple_outputs: bool = len(outputs) > 1 - outputs_by_sequence_group: List[List[SequenceGroupOutput]] + outputs_by_sequence_group: list[list[SequenceGroupOutput]] if has_multiple_outputs: assert self.scheduler_config.is_multi_step or \ self.speculative_config @@ -1062,8 +1062,8 @@ def _process_model_outputs(self, else: indices = range(len(seq_group_metadata_list)) # type: ignore - finished_before: List[int] = [] - finished_now: List[int] = [] + finished_before: list[int] = [] + finished_now: list[int] = [] for i in indices: if i in skip: continue @@ -1077,7 +1077,7 @@ def _process_model_outputs(self, finished_before.append(i) continue - output: List[SequenceGroupOutput] + output: list[SequenceGroupOutput] if has_multiple_outputs: output = outputs_by_sequence_group[i] else: @@ -1221,9 +1221,9 @@ def _process_model_outputs(self, return None def _advance_to_next_step( - self, output: List[SamplerOutput], - seq_group_metadata_list: List[SequenceGroupMetadata], - scheduled_seq_groups: List[ScheduledSequenceGroup]) -> None: + self, output: list[SamplerOutput], + seq_group_metadata_list: list[SequenceGroupMetadata], + scheduled_seq_groups: list[ScheduledSequenceGroup]) -> None: """Given model output from a single run, append the tokens to the sequences. This is normally done inside output processor, but it is required if the worker is to perform async forward pass to next step. @@ -1264,7 +1264,7 @@ def _advance_to_next_step( else: seq.append_token_id(sample.output_token, sample.logprobs) - def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: + def step(self) -> list[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. .. 
figure:: https://i.imgur.com/sv2HssD.png @@ -1490,7 +1490,7 @@ def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: def _abort_and_cache_schedule( self, request_id: str, virtual_engine: int, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], scheduler_outputs: SchedulerOutputs, allow_async_output_proc: bool) -> None: """Aborts a single request, and caches the scheduler outputs minus that @@ -1521,7 +1521,7 @@ def _abort_and_cache_schedule( allow_async_output_proc=allow_async_output_proc) def _has_remaining_steps( - self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] + self, seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] ) -> bool: if (not self.scheduler_config.is_multi_step or not seq_group_metadata_list): @@ -1542,7 +1542,7 @@ def _has_remaining_steps( def _cache_scheduler_outputs_for_multi_step( self, virtual_engine: int, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + seq_group_metadata_list: Optional[list[SequenceGroupMetadata]], scheduler_outputs: SchedulerOutputs, allow_async_output_proc: bool) -> None: co = self.cached_scheduler_outputs[virtual_engine] @@ -1554,7 +1554,7 @@ def _cache_scheduler_outputs_for_multi_step( def _update_cached_scheduler_output( self, virtual_engine: int, - output: List[Optional[SamplerOutput]]) -> None: + output: list[Optional[SamplerOutput]]) -> None: if (self.parallel_config.pipeline_parallel_size > 1 and len(output) > 0 and output[0] is not None): last_output = output[-1] @@ -1596,9 +1596,9 @@ def remove_logger(self, logger_name: str) -> None: def do_log_stats(self, scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None, - finished_before: Optional[List[int]] = None, - skip: Optional[List[int]] = None) -> None: + model_output: Optional[list[SamplerOutput]] = None, + finished_before: Optional[list[int]] = None, + skip: Optional[list[int]] = None) -> None: """Forced log when no requests active.""" if self.log_stats: stats = self._get_stats(scheduler_outputs, model_output, @@ -1608,9 +1608,9 @@ def do_log_stats(self, def _get_stats(self, scheduler_outputs: Optional[SchedulerOutputs], - model_output: Optional[List[SamplerOutput]] = None, - finished_before: Optional[List[int]] = None, - skip: Optional[List[int]] = None) -> Stats: + model_output: Optional[list[SamplerOutput]] = None, + finished_before: Optional[list[int]] = None, + skip: Optional[list[int]] = None) -> Stats: """Get Stats to be Logged to Prometheus. 
Args: @@ -1662,28 +1662,28 @@ def _get_stats(self, num_prompt_tokens_iter = 0 num_generation_tokens_iter = 0 num_tokens_iter = 0 - time_to_first_tokens_iter: List[float] = [] - time_per_output_tokens_iter: List[float] = [] + time_to_first_tokens_iter: list[float] = [] + time_per_output_tokens_iter: list[float] = [] num_preemption_iter = (0 if scheduler_outputs is None else scheduler_outputs.preempted) # Request stats # Latency - time_e2e_requests: List[float] = [] - time_queue_requests: List[float] = [] - time_inference_requests: List[float] = [] - time_prefill_requests: List[float] = [] - time_decode_requests: List[float] = [] - time_in_queue_requests: List[float] = [] - model_forward_time_requests: List[float] = [] - model_execute_time_requests: List[float] = [] + time_e2e_requests: list[float] = [] + time_queue_requests: list[float] = [] + time_inference_requests: list[float] = [] + time_prefill_requests: list[float] = [] + time_decode_requests: list[float] = [] + time_in_queue_requests: list[float] = [] + model_forward_time_requests: list[float] = [] + model_execute_time_requests: list[float] = [] # Metadata - num_prompt_tokens_requests: List[int] = [] - num_generation_tokens_requests: List[int] = [] - n_requests: List[int] = [] - max_num_generation_tokens_requests: List[int] = [] - max_tokens_requests: List[int] = [] - finished_reason_requests: List[str] = [] + num_prompt_tokens_requests: list[int] = [] + num_generation_tokens_requests: list[int] = [] + n_requests: list[int] = [] + max_num_generation_tokens_requests: list[int] = [] + max_tokens_requests: list[int] = [] + finished_reason_requests: list[str] = [] # LoRA requests running_lora_adapters = dict( @@ -1882,7 +1882,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.model_executor.remove_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self.model_executor.list_loras() def pin_lora(self, lora_id: int) -> bool: @@ -1895,7 +1895,7 @@ def add_prompt_adapter( def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: return self.model_executor.remove_prompt_adapter(prompt_adapter_id) - def list_prompt_adapters(self) -> List[int]: + def list_prompt_adapters(self) -> list[int]: return self.model_executor.list_prompt_adapters() def start_profile(self) -> None: @@ -1924,7 +1924,7 @@ def is_tracing_enabled(self) -> bool: def do_tracing(self, scheduler_outputs: SchedulerOutputs, - finished_before: Optional[List[int]] = None) -> None: + finished_before: Optional[list[int]] = None) -> None: if self.tracer is None: return diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index cb3ca7a118819..efa4a805a81c7 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import time -from typing import TYPE_CHECKING -from typing import Counter as CollectionsCounter -from typing import Dict, List, Optional, Type, Union, cast +from collections import Counter as CollectionsCounter +from typing import TYPE_CHECKING, Optional, Union, cast import numpy as np import prometheus_client @@ -46,7 +45,7 @@ class Metrics: _counter_cls = prometheus_client.Counter _histogram_cls = prometheus_client.Histogram - def __init__(self, labelnames: List[str], vllm_config: VllmConfig): + def __init__(self, labelnames: list[str], vllm_config: VllmConfig): # Unregister any existing vLLM collectors (for CI/CD) self._unregister_vllm_metrics() @@ -276,7 +275,7 @@ class _RayGaugeWrapper: def 
__init__(self, name: str, documentation: str = "", - labelnames: Optional[List[str]] = None, + labelnames: Optional[list[str]] = None, multiprocess_mode: str = ""): del multiprocess_mode labelnames_tuple = tuple(labelnames) if labelnames else None @@ -303,7 +302,7 @@ class _RayCounterWrapper: def __init__(self, name: str, documentation: str = "", - labelnames: Optional[List[str]] = None): + labelnames: Optional[list[str]] = None): labelnames_tuple = tuple(labelnames) if labelnames else None self._counter = ray_metrics.Counter(name=name, description=documentation, @@ -326,8 +325,8 @@ class _RayHistogramWrapper: def __init__(self, name: str, documentation: str = "", - labelnames: Optional[List[str]] = None, - buckets: Optional[List[float]] = None): + labelnames: Optional[list[str]] = None, + buckets: Optional[list[float]] = None): labelnames_tuple = tuple(labelnames) if labelnames else None boundaries = buckets if buckets else [] self._histogram = ray_metrics.Histogram(name=name, @@ -348,14 +347,14 @@ class RayMetrics(Metrics): RayMetrics is used by RayPrometheusStatLogger to log to Ray metrics. Provides the same metrics as Metrics but uses Ray's util.metrics library. """ - _gauge_cls: Type[prometheus_client.Gauge] = cast( - Type[prometheus_client.Gauge], _RayGaugeWrapper) - _counter_cls: Type[prometheus_client.Counter] = cast( - Type[prometheus_client.Counter], _RayCounterWrapper) - _histogram_cls: Type[prometheus_client.Histogram] = cast( - Type[prometheus_client.Histogram], _RayHistogramWrapper) - - def __init__(self, labelnames: List[str], vllm_config: VllmConfig): + _gauge_cls: type[prometheus_client.Gauge] = cast( + type[prometheus_client.Gauge], _RayGaugeWrapper) + _counter_cls: type[prometheus_client.Counter] = cast( + type[prometheus_client.Counter], _RayCounterWrapper) + _histogram_cls: type[prometheus_client.Histogram] = cast( + type[prometheus_client.Histogram], _RayHistogramWrapper) + + def __init__(self, labelnames: list[str], vllm_config: VllmConfig): if ray_metrics is None: raise ImportError("RayMetrics requires Ray to be installed.") super().__init__(labelnames, vllm_config) @@ -365,14 +364,14 @@ def _unregister_vllm_metrics(self) -> None: pass -def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: +def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]: """ Builds a list of buckets with increasing powers of 10 multiplied by mantissa values until the value exceeds the specified maximum. 
""" exponent = 0 - buckets: List[int] = [] + buckets: list[int] = [] while True: for m in mantissa_lst: value = m * 10**exponent @@ -383,7 +382,7 @@ def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: exponent += 1 -def build_1_2_5_buckets(max_value: int) -> List[int]: +def build_1_2_5_buckets(max_value: int) -> list[int]: """ Example: >>> build_1_2_5_buckets(100) @@ -392,7 +391,7 @@ def build_1_2_5_buckets(max_value: int) -> List[int]: return build_buckets([1, 2, 5], max_value) -def build_1_2_3_5_8_buckets(max_value: int) -> List[int]: +def build_1_2_3_5_8_buckets(max_value: int) -> list[int]: """ Example: >>> build_1_2_3_5_8_buckets(100) @@ -407,7 +406,7 @@ def local_interval_elapsed(now: float, last_log: float, return elapsed_time > local_interval -def get_throughput(tracked_stats: List[int], now: float, +def get_throughput(tracked_stats: list[int], now: float, last_log: float) -> float: return float(np.sum(tracked_stats) / (now - last_log)) @@ -508,7 +507,7 @@ class PrometheusStatLogger(StatLoggerBase): _metrics_cls = Metrics _gauge_cls = prometheus_client.Gauge - def __init__(self, local_interval: float, labels: Dict[str, str], + def __init__(self, local_interval: float, labels: dict[str, str], vllm_config: VllmConfig) -> None: super().__init__(local_interval, vllm_config) # Prometheus metrics @@ -540,13 +539,13 @@ def _log_counter_labels(self, counter, data: CollectionsCounter, for label, count in data.items(): counter.labels(**{**self.labels, label_key: label}).inc(count) - def _log_histogram(self, histogram, data: Union[List[int], - List[float]]) -> None: + def _log_histogram(self, histogram, data: Union[list[int], + list[float]]) -> None: # Convenience function for logging list to histogram. for datum in data: histogram.labels(**self.labels).observe(datum) - def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None: + def _log_gauge_string(self, gauge, data: dict[str, str]) -> None: gauge.labels(**data).set_to_current_time() def _log_prometheus(self, stats: Stats) -> None: diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 9e6d5ef29bedb..541f9fb9ff793 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -15,7 +15,7 @@ import time from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import List, Optional +from typing import Optional from vllm.config import SupportsMetricsInfo, VllmConfig from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics @@ -42,29 +42,29 @@ class Stats: num_prompt_tokens_iter: int num_generation_tokens_iter: int num_tokens_iter: int - time_to_first_tokens_iter: List[float] - time_per_output_tokens_iter: List[float] + time_to_first_tokens_iter: list[float] + time_per_output_tokens_iter: list[float] num_preemption_iter: int # Request stats (should have _requests suffix) # Latency - time_e2e_requests: List[float] - time_queue_requests: List[float] - time_inference_requests: List[float] - time_prefill_requests: List[float] - time_decode_requests: List[float] - time_in_queue_requests: List[float] - model_forward_time_requests: List[float] - model_execute_time_requests: List[float] + time_e2e_requests: list[float] + time_queue_requests: list[float] + time_inference_requests: list[float] + time_prefill_requests: list[float] + time_decode_requests: list[float] + time_in_queue_requests: list[float] + model_forward_time_requests: list[float] + model_execute_time_requests: list[float] # Metadata - num_prompt_tokens_requests: List[int] - 
num_generation_tokens_requests: List[int] - n_requests: List[int] - max_num_generation_tokens_requests: List[int] - max_tokens_requests: List[int] - finished_reason_requests: List[str] - waiting_lora_adapters: List[str] - running_lora_adapters: List[str] + num_prompt_tokens_requests: list[int] + num_generation_tokens_requests: list[int] + n_requests: list[int] + max_num_generation_tokens_requests: list[int] + max_tokens_requests: list[int] + finished_reason_requests: list[str] + waiting_lora_adapters: list[str] + running_lora_adapters: list[str] max_lora: str spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None @@ -75,8 +75,8 @@ class StatLoggerBase(ABC): def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None: # Tracked stats over current local logging interval. - self.num_prompt_tokens: List[int] = [] - self.num_generation_tokens: List[int] = [] + self.num_prompt_tokens: list[int] = [] + self.num_generation_tokens: list[int] = [] self.last_local_log = time.time() self.local_interval = local_interval self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index 26dfb63c3dbf3..ea9742a6d38bc 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import uuid +from collections.abc import Mapping from dataclasses import dataclass, field from enum import Enum -from typing import List, Mapping, Optional, Union, overload +from typing import Optional, Union, overload from typing_extensions import deprecated @@ -153,7 +154,7 @@ class RPCAdapterLoadedResponse: RPCResetPrefixCacheRequest, RPCSleepRequest, RPCWakeUpRequest] -REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse, +REQUEST_OUTPUTS_T = Union[list[RequestOutput], RPCAdapterLoadedResponse, RPCError] diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index c12fe242082bf..e8b830c2a38be 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -3,9 +3,9 @@ import asyncio import copy import pickle +from collections.abc import AsyncGenerator, Iterator, Mapping from contextlib import contextmanager, suppress -from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping, - Optional, Union, cast, overload) +from typing import Any, Optional, Union, cast, overload import cloudpickle import psutil @@ -76,7 +76,7 @@ class MQLLMEngineClient(EngineClient): - Pulls RequestOutputs from its queue and yields them MQLLMEngine runs two background loops: - - output_loop: the output loop pulls List[RequestOutput] + - output_loop: the output loop pulls list[RequestOutput] from the MQLLMEngine via zmq (each list is the output of one engine_step in the LLMEngine). It then parses the list and pushes individual request_outputs into @@ -120,7 +120,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig, self.data_ipc_path = f"{ipc_path}{IPC_DATA_EXT}" # Stream for each individual request. - self.output_queues: Dict[str, asyncio.Queue] = {} + self.output_queues: dict[str, asyncio.Queue] = {} # Loop to handle output of the LLMEngine periodically. 
# Started after the MQLLMEngine is ready so that we can @@ -401,7 +401,7 @@ async def abort(self, request_id: str): async def do_log_stats( self, scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None, + model_output: Optional[list[SamplerOutput]] = None, ) -> None: """ Ignore do_log_stats (handled on MQLLMEngine polling) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index efea6ee2c69aa..897ccdf127fab 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -2,8 +2,9 @@ import pickle import signal +from collections.abc import Iterator from contextlib import contextmanager -from typing import Iterator, List, Optional, Union +from typing import Optional, Union import cloudpickle import zmq @@ -205,7 +206,7 @@ def run_engine_loop(self): if not self.use_async_sockets: self._send_outputs(request_outputs) - def engine_step(self) -> List[RequestOutput]: + def engine_step(self) -> list[RequestOutput]: """Engine step wrapper with error handling.""" try: return self.engine.step() diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 4c8e295c13815..db32a145c8626 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Callable, List +from typing import Callable from vllm.config import SchedulerConfig from vllm.core.scheduler import Scheduler @@ -28,7 +28,7 @@ class SequenceGroupOutputProcessor(ABC): def create_output_processor( scheduler_config: SchedulerConfig, detokenizer: Detokenizer, - scheduler: List[Scheduler], + scheduler: list[Scheduler], seq_counter: Counter, get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], stop_checker: "StopChecker", @@ -59,7 +59,7 @@ def create_output_processor( @abstractmethod def process_outputs(self, sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], + outputs: list[SequenceGroupOutput], is_async: bool) -> None: """Process new token ids for the sequence group. 
Handles logic such as detokenization, stop checking, and freeing/forking sequences in the @@ -69,6 +69,6 @@ def process_outputs(self, sequence_group: SequenceGroup, @abstractmethod def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: + outputs: list[SequenceGroupOutput]) -> None: """Update prompt logprobs received from outputs to seq_group.""" pass diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 8ceef855e020f..4c7d29c4a77e4 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import functools -from typing import Callable, List, cast +from typing import Callable, cast from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.interfaces import ( @@ -39,7 +39,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): def __init__( self, detokenizer: Detokenizer, - scheduler: List[Scheduler], + scheduler: list[Scheduler], seq_counter: Counter, get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], stop_checker: StopChecker, @@ -51,7 +51,7 @@ def __init__( self.stop_checker = stop_checker def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: + outputs: list[SequenceGroupOutput]) -> None: """Process prompt logprobs associated with each step of a multi-step- scheduled computation. @@ -75,7 +75,7 @@ def _log_prompt_logprob_unsupported_warning_once(): def process_outputs(self, sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], + outputs: list[SequenceGroupOutput], is_async: bool = False) -> None: """Append new tokens in the outputs to sequences in the sequence group. 
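For readers unfamiliar with the pattern driving every hunk in this patch: since Python 3.9, PEP 585 makes the builtin containers generic, so the ``typing.List``/``Dict``/``Tuple``/``Type`` aliases are deprecated in favour of ``list``/``dict``/``tuple``/``type``. A minimal before/after sketch with toy names (illustrative only, not vLLM code):

from typing import Optional

# Before (Python 3.8 style):
#   from typing import Dict, List, Optional
#   def tally(outputs: List[str]) -> Optional[Dict[str, int]]: ...
#
# After (Python 3.9+, PEP 585): the builtins are subscriptable, so only
# Optional/Union still need to be imported from typing.
def tally(outputs: list[str]) -> Optional[dict[str, int]]:
    counts: dict[str, int] = {}
    for item in outputs:
        counts[item] = counts.get(item, 0) + 1
    return counts or None

assert tally(["stop", "stop", "eos"]) == {"stop": 2, "eos": 1}
assert tally([]) is None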
@@ -112,7 +112,7 @@ def process_outputs(self, isinstance(output, CompletionSequenceGroupOutput) for output in outputs ]) - compl_outputs = cast(List[CompletionSequenceGroupOutput], outputs) + compl_outputs = cast(list[CompletionSequenceGroupOutput], outputs) assert all([ seq_id == output.samples[0].parent_seq_id for output in compl_outputs @@ -158,7 +158,7 @@ def _process_decode_and_stop(self, seq: Sequence, ) def _process_seq_outputs(self, seq: Sequence, - valid_samples: List[SequenceOutput], + valid_samples: list[SequenceOutput], sampling_params: SamplingParams) -> None: output_token_ids = [sample.output_token for sample in valid_samples] output_logprobs = [sample.logprobs for sample in valid_samples] diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index 4d96791a1f8a3..270f062cce13e 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - from vllm.config import SchedulerConfig from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.interfaces import ( @@ -69,7 +67,7 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor): """ def __init__(self, scheduler_config: SchedulerConfig, - detokenizer: Detokenizer, scheduler: List[Scheduler], + detokenizer: Detokenizer, scheduler: list[Scheduler], seq_counter: Counter, stop_checker: StopChecker): self.scheduler_config = scheduler_config self.detokenizer = detokenizer @@ -78,7 +76,7 @@ def __init__(self, scheduler_config: SchedulerConfig, self.stop_checker = stop_checker def process_outputs(self, sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], + outputs: list[SequenceGroupOutput], is_async: bool) -> None: """Append all new tokens to sequences in the sequence group. Fork any surviving beam candidates; free any unsurviving ones. @@ -98,7 +96,7 @@ def process_outputs(self, sequence_group: SequenceGroup, is_async) def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: + outputs: list[SequenceGroupOutput]) -> None: """Process prompt logprobs associated with one step of a single-step- scheduled computation. diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 6cad9ec8f327f..c757d8eded9f5 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Optional, Tuple +from typing import Callable, Optional from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams @@ -94,9 +94,9 @@ def maybe_stop_sequence( def check_stop_strings( output_text: str, new_char_count: int, - stop: List[str], + stop: list[str], include_in_output: bool, - ) -> Optional[Tuple[str, int]]: + ) -> Optional[tuple[str, int]]: """Check if any stop strings are matched and truncate sequence output text accordingly. 
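The ``check_stop_strings`` hunk above applies the same rewrite to a return type, turning ``Optional[Tuple[str, int]]`` into ``Optional[tuple[str, int]]``. A rough, self-contained sketch of such a contract with the builtin generics (a simplified illustration, not vLLM's actual stop-checking logic):

from typing import Optional

def find_stop_string(output_text: str, stop: list[str]) -> Optional[tuple[str, int]]:
    # Return the matched stop string and the index at which the output
    # should be truncated, or None if no stop string matched.
    for stop_str in stop:
        idx = output_text.find(stop_str)
        if idx != -1:
            return stop_str, idx
    return None

assert find_stop_string("hello<eot>tail", ["<eot>"]) == ("<eot>", 5)
assert find_stop_string("hello", ["<eot>"]) is None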
diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index 0d2b58c109e32..09ed76b2a8023 100644 --- a/vllm/engine/output_processor/util.py +++ b/vllm/engine/output_processor/util.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List -from typing import Sequence as GenericSequence +from collections.abc import Sequence as GenericSequence from typing import cast from vllm.model_executor.layers.sampler import SamplerOutput @@ -10,11 +9,11 @@ def create_output_by_sequence_group( outputs: GenericSequence[SamplerOutput], - num_seq_groups: int) -> List[List[SequenceGroupOutput]]: + num_seq_groups: int) -> list[list[SequenceGroupOutput]]: """Helper method which transforms a 2d list organized by [step][sequence group] into [sequence group][step]. """ - output_by_sequence_group: List[List[CompletionSequenceGroupOutput]] = [ + output_by_sequence_group: list[list[CompletionSequenceGroupOutput]] = [ [] for _ in range(num_seq_groups) ] for step in outputs: @@ -24,4 +23,4 @@ def create_output_by_sequence_group( # Cast to the more generic type that CompletionSequenceGroupOutput # inherits from. - return cast(List[List[SequenceGroupOutput]], output_by_sequence_group) + return cast(list[list[SequenceGroupOutput]], output_by_sequence_group) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index ee9accd32f218..e45b3facfc980 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -2,7 +2,8 @@ import asyncio from abc import ABC, abstractmethod -from typing import AsyncGenerator, List, Mapping, Optional +from collections.abc import AsyncGenerator, Mapping +from typing import Optional from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function from vllm.config import DecodingConfig, ModelConfig @@ -254,7 +255,7 @@ async def is_tracing_enabled(self) -> bool: async def do_log_stats( self, scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None, + model_output: Optional[list[SamplerOutput]] = None, ) -> None: ... 
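The ``create_output_by_sequence_group`` hunk above also shows the second half of this migration: abstract container types such as ``Sequence`` now come from ``collections.abc`` rather than ``typing``. A toy transposition with the same [step][sequence group] -> [sequence group][step] shape (illustrative only, not the vLLM helper):

from collections.abc import Sequence

def by_sequence_group(outputs: Sequence[list[str]], num_seq_groups: int) -> list[list[str]]:
    # Transpose a [step][sequence group] structure into [sequence group][step].
    grouped: list[list[str]] = [[] for _ in range(num_seq_groups)]
    for step in outputs:
        for group_idx, item in enumerate(step):
            grouped[group_idx].append(item)
    return grouped

# Two steps, three sequence groups.
steps = [["a0", "b0", "c0"], ["a1", "b1", "c1"]]
assert by_sequence_group(steps, 3) == [["a0", "a1"], ["b0", "b1"], ["c0", "c1"]]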
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 28b8c847c0fdf..c81ff958531bd 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -10,7 +10,8 @@ import json import ssl from argparse import Namespace -from typing import Any, AsyncGenerator, Optional +from collections.abc import AsyncGenerator +from typing import Any, Optional from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c50c631dafccc..b05842dd27d3b 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -5,10 +5,11 @@ import json from abc import ABC, abstractmethod from collections import defaultdict, deque +from collections.abc import Awaitable, Iterable from functools import cache, lru_cache, partial from pathlib import Path -from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, - Literal, Optional, Tuple, TypeVar, Union, cast) +from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union, + cast) import jinja2.nodes import transformers.utils.chat_template_utils as hf_chat_utils @@ -117,7 +118,7 @@ class CustomChatCompletionMessageParam(TypedDict, total=False): role: Required[str] """The role of the message's author.""" - content: Union[str, List[ChatCompletionContentPartParam]] + content: Union[str, list[ChatCompletionContentPartParam]] """The contents of the message.""" name: str @@ -143,7 +144,7 @@ class ConversationMessage(TypedDict, total=False): role: Required[str] """The role of the message's author.""" - content: Union[Optional[str], List[Dict[str, str]]] + content: Union[Optional[str], list[dict[str, str]]] """The contents of the message""" tool_call_id: Optional[str] @@ -495,13 +496,13 @@ def __init__(self) -> None: super().__init__() # multimodal placeholder_string : count - self._placeholder_counts: Dict[str, int] = defaultdict(lambda: 0) + self._placeholder_counts: dict[str, int] = defaultdict(lambda: 0) def _add_placeholder(self, placeholder: Optional[str]): if placeholder: self._placeholder_counts[placeholder] += 1 - def mm_placeholder_counts(self) -> Dict[str, int]: + def mm_placeholder_counts(self) -> dict[str, int]: return dict(self._placeholder_counts) @abstractmethod @@ -652,12 +653,12 @@ def load_chat_template( # TODO: Let user specify how to insert multimodal tokens into prompt # (similar to chat template) -def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], +def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int], text_prompt: str) -> str: """Combine multimodal prompts for a multimodal language model.""" # Look through the text prompt to check for missing placeholders - missing_placeholders: List[str] = [] + missing_placeholders: list[str] = [] for placeholder in placeholder_counts: # For any existing placeholder in the text prompt, we leave it as is @@ -684,10 +685,10 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) _VideoParser = partial(cast, ChatCompletionContentPartVideoParam) -_ContentPart: TypeAlias = Union[str, Dict[str, str], InputAudio] +_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio] # Define a mapping from part types to their corresponding parsing functions. 
-MM_PARSER_MAP: Dict[ +MM_PARSER_MAP: dict[ str, Callable[[ChatCompletionContentPartParam], _ContentPart], ] = { @@ -749,7 +750,7 @@ def _parse_chat_message_content_mm_part( part) return "audio_url", audio_params.get("audio_url", "") if part.get("input_audio") is not None: - input_audio_params = cast(Dict[str, str], part) + input_audio_params = cast(dict[str, str], part) return "input_audio", input_audio_params if part.get("video_url") is not None: video_params = cast(CustomChatCompletionContentSimpleVideoParam, @@ -773,7 +774,7 @@ def _parse_chat_message_content_parts( mm_tracker: BaseMultiModalItemTracker, *, wrap_dicts: bool, -) -> List[ConversationMessage]: +) -> list[ConversationMessage]: content = list[_ContentPart]() mm_parser = mm_tracker.create_parser() @@ -791,7 +792,7 @@ def _parse_chat_message_content_parts( # Parsing wraps images and texts as interleaved dictionaries return [ConversationMessage(role=role, content=content)] # type: ignore - texts = cast(List[str], content) + texts = cast(list[str], content) text_prompt = "\n".join(texts) mm_placeholder_counts = mm_parser.mm_placeholder_counts() if mm_placeholder_counts: @@ -866,7 +867,7 @@ def _parse_chat_message_content( message: ChatCompletionMessageParam, mm_tracker: BaseMultiModalItemTracker, content_format: _ChatTemplateContentFormat, -) -> List[ConversationMessage]: +) -> list[ConversationMessage]: role = message["role"] content = message.get("content") @@ -900,7 +901,7 @@ def _parse_chat_message_content( return result -def _postprocess_messages(messages: List[ConversationMessage]) -> None: +def _postprocess_messages(messages: list[ConversationMessage]) -> None: # per the Transformers docs & maintainers, tool call arguments in # assistant-role messages with tool_calls need to be dicts not JSON str - # this is how tool-use chat templates will expect them moving forwards @@ -916,12 +917,12 @@ def _postprocess_messages(messages: List[ConversationMessage]) -> None: def parse_chat_messages( - messages: List[ChatCompletionMessageParam], + messages: list[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, -) -> Tuple[List[ConversationMessage], Optional[MultiModalDataDict]]: - conversation: List[ConversationMessage] = [] +) -> tuple[list[ConversationMessage], Optional[MultiModalDataDict]]: + conversation: list[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) for msg in messages: @@ -939,12 +940,12 @@ def parse_chat_messages( def parse_chat_messages_futures( - messages: List[ChatCompletionMessageParam], + messages: list[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, -) -> Tuple[List[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: - conversation: List[ConversationMessage] = [] +) -> tuple[list[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: + conversation: list[ConversationMessage] = [] mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) for msg in messages: @@ -963,7 +964,7 @@ def parse_chat_messages_futures( def apply_hf_chat_template( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - conversation: List[ConversationMessage], + conversation: list[ConversationMessage], chat_template: Optional[str], *, tokenize: bool = False, # Different from HF's default @@ -985,10 +986,10 @@ def apply_hf_chat_template( def apply_mistral_chat_template( tokenizer: MistralTokenizer, - messages: 
List[ChatCompletionMessageParam], + messages: list[ChatCompletionMessageParam], chat_template: Optional[str] = None, **kwargs: Any, -) -> List[int]: +) -> list[int]: if chat_template is not None: logger.warning_once( "'chat_template' cannot be overridden for mistral tokenizer.") diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index 73df900f610f2..21a7d48b75c18 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -5,7 +5,7 @@ import os import signal import sys -from typing import List, Optional, Tuple +from typing import Optional from openai import OpenAI from openai.types.chat import ChatCompletionMessageParam @@ -23,7 +23,7 @@ def signal_handler(sig, frame): signal.signal(signal.SIGTSTP, signal_handler) -def _interactive_cli(args: argparse.Namespace) -> Tuple[str, OpenAI]: +def _interactive_cli(args: argparse.Namespace) -> tuple[str, OpenAI]: _register_signal_handlers() base_url = args.url @@ -43,7 +43,7 @@ def _interactive_cli(args: argparse.Namespace) -> Tuple[str, OpenAI]: def chat(system_prompt: Optional[str], model_name: str, client: OpenAI) -> None: - conversation: List[ChatCompletionMessageParam] = [] + conversation: list[ChatCompletionMessageParam] = [] if system_prompt is not None: conversation.append({"role": "system", "content": system_prompt}) @@ -100,7 +100,7 @@ def __init__(self): def cmd(args: argparse.Namespace) -> None: model_name, client = _interactive_cli(args) system_prompt = args.system_prompt - conversation: List[ChatCompletionMessageParam] = [] + conversation: list[ChatCompletionMessageParam] = [] if system_prompt is not None: conversation.append({"role": "system", "content": system_prompt}) @@ -168,5 +168,5 @@ def subparser_init( return complete_parser -def cmd_init() -> List[CLISubcommand]: +def cmd_init() -> list[CLISubcommand]: return [ChatCommand(), CompleteCommand()] diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 1afead8a120d4..c345ece4dada9 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import argparse -from typing import List import uvloop @@ -59,5 +58,5 @@ def subparser_init( return make_arg_parser(serve_parser) -def cmd_init() -> List[CLISubcommand]: +def cmd_init() -> list[CLISubcommand]: return [ServeSubcommand()] diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 3f3262f6e72c0..122e2ed86cb64 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -2,9 +2,9 @@ import itertools import warnings +from collections.abc import Sequence from contextlib import contextmanager -from typing import (Any, Callable, ClassVar, Dict, List, Optional, Sequence, - Tuple, Type, Union, cast, overload) +from typing import Any, Callable, ClassVar, Optional, Union, cast, overload import cloudpickle import torch.nn as nn @@ -177,11 +177,11 @@ def __init__( disable_custom_all_reduce: bool = False, disable_async_output_proc: bool = False, hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, # After positional args are removed, move this right below `model` task: TaskOption = "auto", override_pooler_config: Optional[PoolerConfig] = None, - compilation_config: Optional[Union[int, Dict[str, Any]]] = None, + compilation_config: Optional[Union[int, dict[str, Any]]] = None, **kwargs, ) -> None: ''' @@ -246,7 +246,7 @@ def __init__( self.request_counter = Counter() @staticmethod 
- def get_engine_class() -> Type[LLMEngine]: + def get_engine_class() -> type[LLMEngine]: if envs.VLLM_USE_V1: # Lazy import: the v1 package isn't distributed from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine @@ -283,11 +283,11 @@ def generate( Sequence[SamplingParams]]] = None, *, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - ) -> List[RequestOutput]: + ) -> list[RequestOutput]: ... @overload # LEGACY: single (prompt + optional token ids) @@ -296,30 +296,30 @@ def generate( self, prompts: str, sampling_params: Optional[Union[SamplingParams, - List[SamplingParams]]] = None, - prompt_token_ids: Optional[List[int]] = None, + list[SamplingParams]]] = None, + prompt_token_ids: Optional[list[int]] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - ) -> List[RequestOutput]: + ) -> list[RequestOutput]: ... @overload # LEGACY: multi (prompt + optional token ids) @deprecated("'prompt_token_ids' will become part of 'prompts'") def generate( self, - prompts: List[str], + prompts: list[str], sampling_params: Optional[Union[SamplingParams, - List[SamplingParams]]] = None, - prompt_token_ids: Optional[List[List[int]]] = None, + list[SamplingParams]]] = None, + prompt_token_ids: Optional[list[list[int]]] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - ) -> List[RequestOutput]: + ) -> list[RequestOutput]: ... @overload # LEGACY: single (token ids + optional prompt) @@ -328,32 +328,32 @@ def generate( self, prompts: Optional[str] = None, sampling_params: Optional[Union[SamplingParams, - List[SamplingParams]]] = None, + list[SamplingParams]]] = None, *, - prompt_token_ids: List[int], + prompt_token_ids: list[int], use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - ) -> List[RequestOutput]: + ) -> list[RequestOutput]: ... 
@overload # LEGACY: multi (token ids + optional prompt) @deprecated("'prompt_token_ids' will become part of 'prompts'") def generate( self, - prompts: Optional[List[str]] = None, + prompts: Optional[list[str]] = None, sampling_params: Optional[Union[SamplingParams, - List[SamplingParams]]] = None, + list[SamplingParams]]] = None, *, - prompt_token_ids: List[List[int]], + prompt_token_ids: list[list[int]], use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - ) -> List[RequestOutput]: + ) -> list[RequestOutput]: ... @overload # LEGACY: single or multi token ids [pos-only] @@ -362,13 +362,13 @@ def generate( self, prompts: None, sampling_params: None, - prompt_token_ids: Union[List[int], List[List[int]]], + prompt_token_ids: Union[list[int], list[list[int]]], use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - ) -> List[RequestOutput]: + ) -> list[RequestOutput]: ... @deprecate_kwargs( @@ -379,17 +379,17 @@ def generate( def generate( self, prompts: Union[Union[PromptType, Sequence[PromptType]], - Optional[Union[str, List[str]]]] = None, + Optional[Union[str, list[str]]]] = None, sampling_params: Optional[Union[SamplingParams, Sequence[SamplingParams]]] = None, - prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None, + prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - priority: Optional[List[int]] = None, - ) -> List[RequestOutput]: + priority: Optional[list[int]] = None, + ) -> list[RequestOutput]: """Generates the completions for the input prompts. This class automatically batches the given prompts, considering @@ -440,7 +440,7 @@ def generate( if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( - prompts=cast(Optional[Union[str, List[str]]], prompts), + prompts=cast(Optional[Union[str, list[str]]], prompts), prompt_token_ids=prompt_token_ids, ) else: @@ -473,8 +473,8 @@ def generate( def collective_rpc(self, method: Union[str, Callable[..., _R]], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: """ Execute an RPC call on all workers. @@ -510,9 +510,9 @@ def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: def beam_search( self, - prompts: List[Union[TokensPrompt, TextPrompt]], + prompts: list[Union[TokensPrompt, TextPrompt]], params: BeamSearchParams, - ) -> List[BeamSearchOutput]: + ) -> list[BeamSearchOutput]: """ Generate sequences using beam search. 
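The ``collective_rpc`` hunk above shows the same rewrite applied to ``Tuple``/``Dict`` defaults and to a generic return type. A self-contained sketch of that signature style (the helper below is purely illustrative and not the vLLM API):

from typing import Any, Callable, Optional, TypeVar

_R = TypeVar("_R")

def call_on_workers(method: Callable[..., _R],
                    args: tuple = (),
                    kwargs: Optional[dict[str, Any]] = None,
                    num_workers: int = 2) -> list[_R]:
    # Bare ``tuple`` and parameterised ``dict[str, Any]`` replace the old
    # Tuple/Dict aliases; the result uses builtin ``list`` instead of typing.List.
    kwargs = kwargs or {}
    return [method(*args, **kwargs) for _ in range(num_workers)]

assert call_on_workers(lambda x, y=1: x + y, args=(2,), kwargs={"y": 3}) == [5, 5]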
@@ -543,7 +543,7 @@ def sort_beams_key(x: BeamSearchSequence) -> float: beam_search_params = SamplingParams(logprobs=2 * beam_width, max_tokens=1, temperature=temperature) - instances: List[BeamSearchInstance] = [] + instances: list[BeamSearchInstance] = [] for prompt in prompts: if is_token_prompt(prompt): @@ -553,12 +553,12 @@ def sort_beams_key(x: BeamSearchSequence) -> float: instances.append(BeamSearchInstance(prompt_tokens)) for _ in range(max_tokens): - all_beams: List[BeamSearchSequence] = list( + all_beams: list[BeamSearchSequence] = list( sum((instance.beams for instance in instances), [])) pos = [0] + list( itertools.accumulate( len(instance.beams) for instance in instances)) - instance_start_and_end: List[Tuple[int, int]] = list( + instance_start_and_end: list[tuple[int, int]] = list( zip(pos[:-1], pos[1:])) if len(all_beams) == 0: @@ -620,19 +620,19 @@ def sort_beams_key(x: BeamSearchSequence) -> float: def chat( self, - messages: Union[List[ChatCompletionMessageParam], - List[List[ChatCompletionMessageParam]]], + messages: Union[list[ChatCompletionMessageParam], + list[list[ChatCompletionMessageParam]]], sampling_params: Optional[Union[SamplingParams, - List[SamplingParams]]] = None, + list[SamplingParams]]] = None, use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, chat_template: Optional[str] = None, chat_template_content_format: ChatTemplateContentFormatOption = "auto", add_generation_prompt: bool = True, continue_final_message: bool = False, - tools: Optional[List[Dict[str, Any]]] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, - ) -> List[RequestOutput]: + tools: Optional[list[dict[str, Any]]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, + ) -> list[RequestOutput]: """ Generate responses for a chat conversation. @@ -678,17 +678,17 @@ def chat( A list of ``RequestOutput`` objects containing the generated responses in the same order as the input messages. """ - list_of_messages: List[List[ChatCompletionMessageParam]] + list_of_messages: list[list[ChatCompletionMessageParam]] # Handle multi and single conversations if is_list_of(messages, list): - # messages is List[List[...]] - list_of_messages = cast(List[List[ChatCompletionMessageParam]], + # messages is list[list[...]] + list_of_messages = cast(list[list[ChatCompletionMessageParam]], messages) else: - # messages is List[...] + # messages is list[...] list_of_messages = [ - cast(List[ChatCompletionMessageParam], messages) + cast(list[ChatCompletionMessageParam], messages) ] tokenizer = self.get_tokenizer() @@ -699,7 +699,7 @@ def chat( tokenizer, ) - prompts: List[Union[TokensPrompt, TextPrompt]] = [] + prompts: list[Union[TokensPrompt, TextPrompt]] = [] for msgs in list_of_messages: # NOTE: _parse_chat_message_content_parts() currently doesn't @@ -712,7 +712,7 @@ def chat( content_format=resolved_content_format, ) - prompt_data: Union[str, List[int]] + prompt_data: Union[str, list[int]] if isinstance(tokenizer, MistralTokenizer): prompt_data = apply_mistral_chat_template( tokenizer, @@ -762,9 +762,9 @@ def encode( Sequence[PoolingParams]]] = None, *, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: ... 
@overload # LEGACY: single (prompt + optional token ids) @@ -774,25 +774,25 @@ def encode( prompts: str, pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, - prompt_token_ids: Optional[List[int]] = None, + prompt_token_ids: Optional[list[int]] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: ... @overload # LEGACY: multi (prompt + optional token ids) @deprecated("'prompt_token_ids' will become part of 'prompts'") def encode( self, - prompts: List[str], + prompts: list[str], pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, - prompt_token_ids: Optional[List[List[int]]] = None, + prompt_token_ids: Optional[list[list[int]]] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: ... @overload # LEGACY: single (token ids + optional prompt) @@ -803,26 +803,26 @@ def encode( pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, *, - prompt_token_ids: List[int], + prompt_token_ids: list[int], use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: ... @overload # LEGACY: multi (token ids + optional prompt) @deprecated("'prompt_token_ids' will become part of 'prompts'") def encode( self, - prompts: Optional[List[str]] = None, + prompts: Optional[list[str]] = None, pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, *, - prompt_token_ids: List[List[int]], + prompt_token_ids: list[list[int]], use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: ... @overload # LEGACY: single or multi token ids [pos-only] @@ -831,11 +831,11 @@ def encode( self, prompts: None, pooling_params: None, - prompt_token_ids: Union[List[int], List[List[int]]], + prompt_token_ids: Union[list[int], list[list[int]]], use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: ... 
@deprecate_kwargs( @@ -846,14 +846,14 @@ def encode( def encode( self, prompts: Union[Union[PromptType, Sequence[PromptType]], - Optional[Union[str, List[str]]]] = None, + Optional[Union[str, list[str]]]] = None, pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, - prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None, + prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: """Apply pooling to the hidden states corresponding to the input prompts. @@ -898,7 +898,7 @@ def encode( if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( - prompts=cast(Optional[Union[str, List[str]]], prompts), + prompts=cast(Optional[Union[str, list[str]]], prompts), prompt_token_ids=prompt_token_ids, ) else: @@ -926,9 +926,9 @@ def embed( /, *, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> list[EmbeddingRequestOutput]: """ Generate an embedding vector for each prompt. @@ -966,9 +966,9 @@ def classify( /, *, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[ClassificationRequestOutput]: + ) -> list[ClassificationRequestOutput]: """ Generate class logits for each prompt. 
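Several of the ``encode``/``embed``/``classify`` signatures above accept either a single item or a list via ``Optional[Union[list[LoRARequest], LoRARequest]]``. A hedged sketch of how such a "one or many" parameter is typically normalised with the builtin generics (hypothetical helper, not part of vLLM):

from typing import Optional, TypeVar, Union

T = TypeVar("T")

def as_list(value: Optional[Union[list[T], T]]) -> list[T]:
    # Normalise "none, one, or many" into a plain list[T].
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]

assert as_list(None) == []
assert as_list("adapter-a") == ["adapter-a"]
assert as_list(["adapter-a", "adapter-b"]) == ["adapter-a", "adapter-b"]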
@@ -1003,29 +1003,29 @@ def classify( def _embedding_score( self, tokenizer: AnyTokenizer, - text_1: List[Union[str, TextPrompt, TokensPrompt]], - text_2: List[Union[str, TextPrompt, TokensPrompt]], + text_1: list[Union[str, TextPrompt, TokensPrompt]], + text_2: list[Union[str, TextPrompt, TokensPrompt]], truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[ScoringRequestOutput]: + ) -> list[ScoringRequestOutput]: - encoded_output: List[PoolingRequestOutput] = self.encode( + encoded_output: list[PoolingRequestOutput] = self.encode( text_1 + text_2, use_tqdm=use_tqdm, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - encoded_output_1: List[PoolingRequestOutput] = encoded_output[ + encoded_output_1: list[PoolingRequestOutput] = encoded_output[ 0:len(text_1)] - encoded_output_2: List[PoolingRequestOutput] = encoded_output[ + encoded_output_2: list[PoolingRequestOutput] = encoded_output[ len(text_1):] if len(encoded_output_1) == 1: encoded_output_1 = encoded_output_1 * len(encoded_output_2) - scores: List[PoolingRequestOutput] = [] + scores: list[PoolingRequestOutput] = [] scores = _cosine_similarity(tokenizer=tokenizer, embed_1=encoded_output_1, @@ -1038,13 +1038,13 @@ def _embedding_score( def _cross_encoding_score( self, tokenizer: AnyTokenizer, - text_1: List[str], - text_2: List[str], + text_1: list[str], + text_2: list[str], truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[ScoringRequestOutput]: + ) -> list[ScoringRequestOutput]: if isinstance(tokenizer, MistralTokenizer): raise ValueError( @@ -1057,7 +1057,7 @@ def _cross_encoding_score( pooling_params = PoolingParams() - tokenization_kwargs: Dict[str, Any] = {} + tokenization_kwargs: dict[str, Any] = {} if truncate_prompt_tokens is not None: tokenization_kwargs["truncation"] = True tokenization_kwargs["max_length"] = truncate_prompt_tokens @@ -1094,9 +1094,9 @@ def score( *, truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[ScoringRequestOutput]: + ) -> list[ScoringRequestOutput]: """Generate similarity scores for all pairs ````. The inputs can be ``1 -> 1``, ``1 -> N`` or ``N -> N``. @@ -1162,12 +1162,12 @@ def ensure_str(prompt: SingletonPrompt): if isinstance(text_1, (str, dict)): # Convert a single prompt to a list. text_1 = [text_1] - input_text_1: List[str] = [ensure_str(t) for t in text_1] + input_text_1: list[str] = [ensure_str(t) for t in text_1] if isinstance(text_2, (str, dict)): # Convert a single prompt to a list. 
text_2 = [text_2] - input_text_2: List[str] = [ensure_str(t) for t in text_2] + input_text_2: list[str] = [ensure_str(t) for t in text_2] _validate_score_input_lens(input_text_1, input_text_2) @@ -1226,8 +1226,8 @@ def wake_up(self): # LEGACY def _convert_v1_inputs( self, - prompts: Optional[Union[str, List[str]]], - prompt_token_ids: Optional[Union[List[int], List[List[int]]]], + prompts: Optional[Union[str, list[str]]], + prompt_token_ids: Optional[Union[list[int], list[list[int]]]], ): # skip_tokenizer_init is now checked in engine @@ -1252,7 +1252,7 @@ def _convert_v1_inputs( raise ValueError("Either prompts or prompt_token_ids must be " "provided.") - parsed_prompts: List[PromptType] = [] + parsed_prompts: list[PromptType] = [] for i in range(num_requests): item: PromptType @@ -1275,7 +1275,7 @@ def _validate_and_add_requests( lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]], prompt_adapter_request: Optional[PromptAdapterRequest], guided_options: Optional[GuidedDecodingRequest] = None, - priority: Optional[List[int]] = None, + priority: Optional[list[int]] = None, ) -> None: if guided_options is not None: warnings.warn( @@ -1357,7 +1357,7 @@ def _add_guided_params( def _run_engine( self, *, use_tqdm: bool - ) -> List[Union[RequestOutput, PoolingRequestOutput]]: + ) -> list[Union[RequestOutput, PoolingRequestOutput]]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() @@ -1370,7 +1370,7 @@ def _run_engine( ) # Run the engine. - outputs: List[Union[RequestOutput, PoolingRequestOutput]] = [] + outputs: list[Union[RequestOutput, PoolingRequestOutput]] = [] total_in_toks = 0 total_out_toks = 0 while self.llm_engine.has_unfinished_requests(): diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py index e82b6ba6c7bae..ea5759152a226 100644 --- a/vllm/entrypoints/logger.py +++ b/vllm/entrypoints/logger.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Union +from typing import Optional, Union from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -22,7 +22,7 @@ def log_inputs( self, request_id: str, prompt: Optional[str], - prompt_token_ids: Optional[List[int]], + prompt_token_ids: Optional[list[int]], params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]], lora_request: Optional[LoRARequest], diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1b65484c446a5..ec2099d4cebf0 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -13,10 +13,11 @@ import tempfile import uuid from argparse import Namespace +from collections.abc import AsyncIterator from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import Annotated, AsyncIterator, Dict, Optional, Set, Tuple, Union +from typing import Annotated, Optional, Union import uvloop from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request @@ -93,7 +94,7 @@ # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765) logger = init_logger('vllm.entrypoints.openai.api_server') -_running_tasks: Set[asyncio.Task] = set() +_running_tasks: set[asyncio.Task] = set() @asynccontextmanager @@ -587,7 +588,7 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request): return await do_rerank(request, raw_request) -TASK_HANDLERS: Dict[str, Dict[str, tuple]] = { +TASK_HANDLERS: dict[str, dict[str, tuple]] = { "generate": { 
"messages": (ChatCompletionRequest, create_chat_completion), "default": (CompletionRequest, create_completion), @@ -894,7 +895,7 @@ async def init_app_state( state.task = model_config.task -def create_server_socket(addr: Tuple[str, int]) -> socket.socket: +def create_server_socket(addr: tuple[str, int]) -> socket.socket: family = socket.AF_INET if is_valid_ipv6_address(addr[0]): family = socket.AF_INET6 diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index ba953c219708e..3cfcd53e8e234 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -8,7 +8,8 @@ import argparse import json import ssl -from typing import List, Optional, Sequence, Union, get_args +from collections.abc import Sequence +from typing import Optional, Union, get_args from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, @@ -34,7 +35,7 @@ def __call__( if isinstance(values, str): raise TypeError("Expected values to be a list") - lora_list: List[LoRAModulePath] = [] + lora_list: list[LoRAModulePath] = [] for item in values: if item in [None, '']: # Skip if item is None or empty string continue @@ -70,7 +71,7 @@ def __call__( if isinstance(values, str): raise TypeError("Expected values to be a list") - adapter_list: List[PromptAdapterPath] = [] + adapter_list: list[PromptAdapterPath] = [] for item in values: name, path = item.split('=') adapter_list.append(PromptAdapterPath(name, path)) diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py index 41e5eef40eaf8..04d5091a96811 100644 --- a/vllm/entrypoints/openai/logits_processors.py +++ b/vllm/entrypoints/openai/logits_processors.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Iterable from functools import lru_cache, partial -from typing import Dict, FrozenSet, Iterable, List, Optional, Union +from typing import Optional, Union import torch @@ -14,10 +15,10 @@ class AllowedTokenIdsLogitsProcessor: specific set of token ids.""" def __init__(self, allowed_ids: Iterable[int]): - self.allowed_ids: Optional[List[int]] = list(allowed_ids) + self.allowed_ids: Optional[list[int]] = list(allowed_ids) self.mask: Optional[torch.Tensor] = None - def __call__(self, token_ids: List[int], + def __call__(self, token_ids: list[int], logits: torch.Tensor) -> torch.Tensor: if self.mask is None: self.mask = torch.ones((logits.shape[-1], ), @@ -31,7 +32,7 @@ def __call__(self, token_ids: List[int], @lru_cache(maxsize=32) def _get_allowed_token_ids_logits_processor( - allowed_token_ids: FrozenSet[int], + allowed_token_ids: frozenset[int], vocab_size: int, ) -> LogitsProcessor: if not allowed_token_ids: @@ -43,8 +44,8 @@ def _get_allowed_token_ids_logits_processor( def logit_bias_logits_processor( - logit_bias: Dict[int, float], - token_ids: List[int], + logit_bias: dict[int, float], + token_ids: list[int], logits: torch.Tensor, ) -> torch.Tensor: for token_id, bias in logit_bias.items(): @@ -53,16 +54,16 @@ def logit_bias_logits_processor( def get_logits_processors( - logit_bias: Optional[Union[Dict[int, float], Dict[str, float]]], - allowed_token_ids: Optional[List[int]], + logit_bias: Optional[Union[dict[int, float], dict[str, float]]], + allowed_token_ids: Optional[list[int]], tokenizer: AnyTokenizer, -) -> List[LogitsProcessor]: - logits_processors: List[LogitsProcessor] = [] +) -> list[LogitsProcessor]: + logits_processors: list[LogitsProcessor] = 
[] if logit_bias: try: # Convert token_id to integer # Clamp the bias between -100 and 100 per OpenAI API spec - clamped_logit_bias: Dict[int, float] = { + clamped_logit_bias: dict[int, float] = { int(token_id): min(100.0, max(-100.0, bias)) for token_id, bias in logit_bias.items() } diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 31214211cfc48..14ce71cd3c2e7 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -5,13 +5,13 @@ import re import time from argparse import Namespace -from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union +from typing import Annotated, Any, ClassVar, Literal, Optional, Union import torch from fastapi import UploadFile from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, ValidationInfo, field_validator, model_validator) -from typing_extensions import Annotated, TypeAlias +from typing_extensions import TypeAlias from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.logger import init_logger @@ -47,7 +47,7 @@ class OpenAIBaseModel(BaseModel): model_config = ConfigDict(extra="allow") # Cache class field names - field_names: ClassVar[Optional[Set[str]]] = None + field_names: ClassVar[Optional[set[str]]] = None @model_validator(mode="wrap") @classmethod @@ -105,12 +105,12 @@ class ModelCard(OpenAIBaseModel): root: Optional[str] = None parent: Optional[str] = None max_model_len: Optional[int] = None - permission: List[ModelPermission] = Field(default_factory=list) + permission: list[ModelPermission] = Field(default_factory=list) class ModelList(OpenAIBaseModel): object: str = "list" - data: List[ModelCard] = Field(default_factory=list) + data: list[ModelCard] = Field(default_factory=list) class PromptTokenUsageInfo(OpenAIBaseModel): @@ -134,7 +134,7 @@ class JsonSchemaResponseFormat(OpenAIBaseModel): description: Optional[str] = None # schema is the field in openai but that causes conflicts with pydantic so # instead use json_schema with an alias - json_schema: Optional[Dict[str, Any]] = Field(default=None, alias='schema') + json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema') strict: Optional[bool] = None @@ -152,7 +152,7 @@ class StreamOptions(OpenAIBaseModel): class FunctionDefinition(OpenAIBaseModel): name: str description: Optional[str] = None - parameters: Optional[Dict[str, Any]] = None + parameters: Optional[dict[str, Any]] = None class ChatCompletionToolsParam(OpenAIBaseModel): @@ -171,15 +171,15 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel): class LogitsProcessorConstructor(BaseModel): qualname: str - args: Optional[List[Any]] = None - kwargs: Optional[Dict[str, Any]] = None + args: Optional[list[Any]] = None + kwargs: Optional[dict[str, Any]] = None -LogitsProcessors = List[Union[str, LogitsProcessorConstructor]] +LogitsProcessors = list[Union[str, LogitsProcessorConstructor]] def get_logits_processors(processors: Optional[LogitsProcessors], - pattern: Optional[str]) -> Optional[List[Any]]: + pattern: Optional[str]) -> Optional[list[Any]]: if processors and pattern: logits_processors = [] for processor in processors: @@ -212,10 +212,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors], class ChatCompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/chat/create - messages: List[ChatCompletionMessageParam] + messages: list[ChatCompletionMessageParam] model: Optional[str] = None 
frequency_penalty: Optional[float] = 0.0 - logit_bias: Optional[Dict[str, float]] = None + logit_bias: Optional[dict[str, float]] = None logprobs: Optional[bool] = False top_logprobs: Optional[int] = 0 # TODO(#9845): remove max_tokens when field is removed from OpenAI API @@ -228,12 +228,12 @@ class ChatCompletionRequest(OpenAIBaseModel): presence_penalty: Optional[float] = 0.0 response_format: Optional[ResponseFormat] = None seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - stop: Optional[Union[str, List[str]]] = Field(default_factory=list) + stop: Optional[Union[str, list[str]]] = Field(default_factory=list) stream: Optional[bool] = False stream_options: Optional[StreamOptions] = None temperature: Optional[float] = None top_p: Optional[float] = None - tools: Optional[List[ChatCompletionToolsParam]] = None + tools: Optional[list[ChatCompletionToolsParam]] = None tool_choice: Optional[Union[Literal["none"], Literal["auto"], ChatCompletionNamedToolChoiceParam]] = "none" @@ -248,7 +248,7 @@ class ChatCompletionRequest(OpenAIBaseModel): min_p: Optional[float] = None repetition_penalty: Optional[float] = None length_penalty: float = 1.0 - stop_token_ids: Optional[List[int]] = Field(default_factory=list) + stop_token_ids: Optional[list[int]] = Field(default_factory=list) include_stop_str_in_output: bool = False ignore_eos: bool = False min_tokens: int = 0 @@ -290,7 +290,7 @@ class ChatCompletionRequest(OpenAIBaseModel): "special tokens so this should be set to false (as is the " "default)."), ) - documents: Optional[List[Dict[str, str]]] = Field( + documents: Optional[list[dict[str, str]]] = Field( default=None, description= ("A list of dicts representing documents that will be accessible to " @@ -307,12 +307,12 @@ class ChatCompletionRequest(OpenAIBaseModel): "allowed, so you must provide a chat template if the tokenizer " "does not define one."), ) - chat_template_kwargs: Optional[Dict[str, Any]] = Field( + chat_template_kwargs: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to the template renderer. 
" "Will be accessible by the chat template."), ) - mm_processor_kwargs: Optional[Dict[str, Any]] = Field( + mm_processor_kwargs: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to the HF processor."), ) @@ -325,7 +325,7 @@ class ChatCompletionRequest(OpenAIBaseModel): description=( "If specified, the output will follow the regex pattern."), ) - guided_choice: Optional[List[str]] = Field( + guided_choice: Optional[list[str]] = Field( default=None, description=( "If specified, the output will be exactly one of the choices."), @@ -643,17 +643,17 @@ class CompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/completions/create model: Optional[str] = None - prompt: Union[List[int], List[List[int]], str, List[str]] + prompt: Union[list[int], list[list[int]], str, list[str]] best_of: Optional[int] = None echo: Optional[bool] = False frequency_penalty: Optional[float] = 0.0 - logit_bias: Optional[Dict[str, float]] = None + logit_bias: Optional[dict[str, float]] = None logprobs: Optional[int] = None max_tokens: Optional[int] = 16 n: int = 1 presence_penalty: Optional[float] = 0.0 seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - stop: Optional[Union[str, List[str]]] = Field(default_factory=list) + stop: Optional[Union[str, list[str]]] = Field(default_factory=list) stream: Optional[bool] = False stream_options: Optional[StreamOptions] = None suffix: Optional[str] = None @@ -667,14 +667,14 @@ class CompletionRequest(OpenAIBaseModel): min_p: Optional[float] = None repetition_penalty: Optional[float] = None length_penalty: float = 1.0 - stop_token_ids: Optional[List[int]] = Field(default_factory=list) + stop_token_ids: Optional[list[int]] = Field(default_factory=list) include_stop_str_in_output: bool = False ignore_eos: bool = False min_tokens: int = 0 skip_special_tokens: bool = True spaces_between_special_tokens: bool = True truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None - allowed_token_ids: Optional[List[int]] = None + allowed_token_ids: Optional[list[int]] = None prompt_logprobs: Optional[int] = None # doc: end-completion-sampling-params @@ -701,7 +701,7 @@ class CompletionRequest(OpenAIBaseModel): description=( "If specified, the output will follow the regex pattern."), ) - guided_choice: Optional[List[str]] = Field( + guided_choice: Optional[list[str]] = Field( default=None, description=( "If specified, the output will be exactly one of the choices."), @@ -908,7 +908,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/embeddings model: Optional[str] = None - input: Union[List[int], List[List[int]], str, List[str]] + input: Union[list[int], list[list[int]], str, list[str]] encoding_format: Literal["float", "base64"] = "float" dimensions: Optional[int] = None user: Optional[str] = None @@ -940,7 +940,7 @@ def to_pooling_params(self): class EmbeddingChatRequest(OpenAIBaseModel): model: Optional[str] = None - messages: List[ChatCompletionMessageParam] + messages: list[ChatCompletionMessageParam] encoding_format: Literal["float", "base64"] = "float" dimensions: Optional[int] = None @@ -969,12 +969,12 @@ class EmbeddingChatRequest(OpenAIBaseModel): "allowed, so you must provide a chat template if the tokenizer " "does not define one."), ) - chat_template_kwargs: Optional[Dict[str, Any]] = Field( + chat_template_kwargs: Optional[dict[str, Any]] 
= Field( default=None, description=("Additional kwargs to pass to the template renderer. " "Will be accessible by the chat template."), ) - mm_processor_kwargs: Optional[Dict[str, Any]] = Field( + mm_processor_kwargs: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to the HF processor."), ) @@ -1008,8 +1008,8 @@ def to_pooling_params(self): class ScoreRequest(OpenAIBaseModel): model: Optional[str] = None - text_1: Union[List[str], str] - text_2: Union[List[str], str] + text_1: Union[list[str], str] + text_2: Union[list[str], str] truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None # doc: begin-score-pooling-params @@ -1033,7 +1033,7 @@ def to_pooling_params(self): class RerankRequest(OpenAIBaseModel): model: Optional[str] = None query: str - documents: List[str] + documents: list[str] top_n: int = Field(default_factory=lambda: 0) truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None @@ -1073,14 +1073,14 @@ class RerankResponse(OpenAIBaseModel): id: str model: str usage: RerankUsage - results: List[RerankResult] + results: list[RerankResult] class CompletionLogProbs(OpenAIBaseModel): - text_offset: List[int] = Field(default_factory=list) - token_logprobs: List[Optional[float]] = Field(default_factory=list) - tokens: List[str] = Field(default_factory=list) - top_logprobs: List[Optional[Dict[str, + text_offset: list[int] = Field(default_factory=list) + token_logprobs: list[Optional[float]] = Field(default_factory=list) + tokens: list[str] = Field(default_factory=list) + top_logprobs: list[Optional[dict[str, float]]] = Field(default_factory=list) @@ -1096,7 +1096,7 @@ class CompletionResponseChoice(OpenAIBaseModel): "to stop, None if the completion finished for some other reason " "including encountering the EOS token"), ) - prompt_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None + prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None class CompletionResponse(OpenAIBaseModel): @@ -1104,7 +1104,7 @@ class CompletionResponse(OpenAIBaseModel): object: str = "text_completion" created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[CompletionResponseChoice] + choices: list[CompletionResponseChoice] usage: UsageInfo @@ -1127,14 +1127,14 @@ class CompletionStreamResponse(OpenAIBaseModel): object: str = "text_completion" created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[CompletionResponseStreamChoice] + choices: list[CompletionResponseStreamChoice] usage: Optional[UsageInfo] = Field(default=None) class EmbeddingResponseData(OpenAIBaseModel): index: int object: str = "embedding" - embedding: Union[List[float], str] + embedding: Union[list[float], str] class EmbeddingResponse(OpenAIBaseModel): @@ -1142,14 +1142,14 @@ class EmbeddingResponse(OpenAIBaseModel): object: str = "list" created: int = Field(default_factory=lambda: int(time.time())) model: str - data: List[EmbeddingResponseData] + data: list[EmbeddingResponseData] usage: UsageInfo class PoolingResponseData(OpenAIBaseModel): index: int object: str = "pooling" - data: Union[List[List[float]], List[float], str] + data: Union[list[list[float]], list[float], str] class PoolingResponse(OpenAIBaseModel): @@ -1157,7 +1157,7 @@ class PoolingResponse(OpenAIBaseModel): object: str = "list" created: int = Field(default_factory=lambda: int(time.time())) model: str - data: List[PoolingResponseData] + data: list[PoolingResponseData] usage: UsageInfo @@ -1172,7 +1172,7 @@ class 
ScoreResponse(OpenAIBaseModel): object: str = "list" created: int = Field(default_factory=lambda: int(time.time())) model: str - data: List[ScoreResponseData] + data: list[ScoreResponseData] usage: UsageInfo @@ -1205,7 +1205,7 @@ class ExtractedToolCallInformation(BaseModel): tools_called: bool # extracted tool calls - tool_calls: List[ToolCall] + tool_calls: list[ToolCall] # content - per OpenAI spec, content AND tool calls can be returned rarely # But some models will do this intentionally @@ -1216,21 +1216,21 @@ class ChatMessage(OpenAIBaseModel): role: str reasoning_content: Optional[str] = None content: Optional[str] = None - tool_calls: List[ToolCall] = Field(default_factory=list) + tool_calls: list[ToolCall] = Field(default_factory=list) class ChatCompletionLogProb(OpenAIBaseModel): token: str logprob: float = -9999.0 - bytes: Optional[List[int]] = None + bytes: Optional[list[int]] = None class ChatCompletionLogProbsContent(ChatCompletionLogProb): - top_logprobs: List[ChatCompletionLogProb] = Field(default_factory=list) + top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list) class ChatCompletionLogProbs(OpenAIBaseModel): - content: Optional[List[ChatCompletionLogProbsContent]] = None + content: Optional[list[ChatCompletionLogProbsContent]] = None class ChatCompletionResponseChoice(OpenAIBaseModel): @@ -1248,16 +1248,16 @@ class ChatCompletionResponse(OpenAIBaseModel): object: Literal["chat.completion"] = "chat.completion" created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[ChatCompletionResponseChoice] + choices: list[ChatCompletionResponseChoice] usage: UsageInfo - prompt_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None + prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None class DeltaMessage(OpenAIBaseModel): role: Optional[str] = None content: Optional[str] = None reasoning_content: Optional[str] = None - tool_calls: List[DeltaToolCall] = Field(default_factory=list) + tool_calls: list[DeltaToolCall] = Field(default_factory=list) class ChatCompletionResponseStreamChoice(OpenAIBaseModel): @@ -1273,7 +1273,7 @@ class ChatCompletionStreamResponse(OpenAIBaseModel): object: Literal["chat.completion.chunk"] = "chat.completion.chunk" created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[ChatCompletionResponseStreamChoice] + choices: list[ChatCompletionResponseStreamChoice] usage: Optional[UsageInfo] = Field(default=None) @@ -1358,7 +1358,7 @@ class TokenizeCompletionRequest(OpenAIBaseModel): class TokenizeChatRequest(OpenAIBaseModel): model: Optional[str] = None - messages: List[ChatCompletionMessageParam] + messages: list[ChatCompletionMessageParam] add_generation_prompt: bool = Field( default=True, @@ -1393,12 +1393,12 @@ class TokenizeChatRequest(OpenAIBaseModel): "allowed, so you must provide a chat template if the tokenizer " "does not define one."), ) - chat_template_kwargs: Optional[Dict[str, Any]] = Field( + chat_template_kwargs: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to the template renderer. 
" "Will be accessible by the chat template."), ) - mm_processor_kwargs: Optional[Dict[str, Any]] = Field( + mm_processor_kwargs: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to the HF processor."), ) @@ -1419,12 +1419,12 @@ def check_generation_prompt(cls, data): class TokenizeResponse(OpenAIBaseModel): count: int max_model_len: int - tokens: List[int] + tokens: list[int] class DetokenizeRequest(OpenAIBaseModel): model: Optional[str] = None - tokens: List[int] + tokens: list[int] class DetokenizeResponse(OpenAIBaseModel): @@ -1492,7 +1492,7 @@ class TranscriptionRequest(OpenAIBaseModel): to automatically increase the temperature until certain thresholds are hit. """ - timestamp_granularities: List[Literal["word", "segment"]] = Field( + timestamp_granularities: list[Literal["word", "segment"]] = Field( alias="timestamp_granularities[]", default=[]) """The timestamp granularities to populate for this transcription. @@ -1580,7 +1580,7 @@ class TranscriptionSegment(OpenAIBaseModel): text: str """Text content of the segment.""" - tokens: List[int] + tokens: list[int] """Array of token IDs for the text content.""" @@ -1594,8 +1594,8 @@ class TranscriptionResponseVerbose(OpenAIBaseModel): text: str """The transcribed text.""" - segments: Optional[List[TranscriptionSegment]] = None + segments: Optional[list[TranscriptionSegment]] = None """Segments of the transcribed text and their corresponding details.""" - words: Optional[List[TranscriptionWord]] = None + words: Optional[list[TranscriptionWord]] = None """Extracted words and their corresponding timestamps.""" diff --git a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py index b5df7e47446b7..b3bc0e836d4cc 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py +++ b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import os +from collections.abc import Sequence from functools import cached_property -from typing import Callable, Dict, List, Optional, Sequence, Tuple, Type, Union +from typing import Callable, Optional, Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage) @@ -25,14 +26,14 @@ def __init__(self, tokenizer: AnyTokenizer): self.model_tokenizer = tokenizer @cached_property - def vocab(self) -> Dict[str, int]: + def vocab(self) -> dict[str, int]: # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab # whereas all tokenizers have .get_vocab() return self.model_tokenizer.get_vocab() def extract_reasoning_content( self, model_output: str, request: ChatCompletionRequest - ) -> Tuple[Optional[str], Optional[str]]: + ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content from a complete model-generated string. @@ -47,7 +48,7 @@ def extract_reasoning_content( The request object that was used to generate the model_output. Returns: - Tuple[Optional[str], Optional[str]] + tuple[Optional[str], Optional[str]] A tuple containing the reasoning content and the content. """ @@ -77,10 +78,10 @@ def extract_reasoning_content_streaming( class ReasoningParserManager: - reasoning_parsers: Dict[str, Type] = {} + reasoning_parsers: dict[str, type] = {} @classmethod - def get_reasoning_parser(cls, name) -> Type: + def get_reasoning_parser(cls, name) -> type: """ Get reasoning parser by name which is registered by `register_module`. 
@@ -94,8 +95,8 @@ def get_reasoning_parser(cls, name) -> Type: @classmethod def _register_module(cls, - module: Type, - module_name: Optional[Union[str, List[str]]] = None, + module: type, + module_name: Optional[Union[str, list[str]]] = None, force: bool = True) -> None: if not issubclass(module, ReasoningParser): raise TypeError("module must be subclass of ReasoningParser, " @@ -114,9 +115,9 @@ def _register_module(cls, @classmethod def register_module( cls, - name: Optional[Union[str, List[str]]] = None, + name: Optional[Union[str, list[str]]] = None, force: bool = True, - module: Union[Type, None] = None) -> Union[type, Callable]: + module: Union[type, None] = None) -> Union[type, Callable]: """ Register module with the given name or name list. it can be used as a decoder(with module as None) or normal function(with module as not diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py index e5ab6e6b2339d..1a2c66a60e966 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py +++ b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import Optional, Sequence, Tuple, Union +from collections.abc import Sequence +from typing import Optional, Union from transformers import PreTrainedTokenizerBase @@ -122,7 +123,7 @@ def extract_reasoning_content_streaming( def extract_reasoning_content( self, model_output: str, request: ChatCompletionRequest - ) -> Tuple[Optional[str], Optional[str]]: + ) -> tuple[Optional[str], Optional[str]]: # DeepSeek R1 doesn't generate now. # Thus we assume the reasoning content is always at the start. diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index e4496f61e6074..0d06ba3df23f9 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -2,9 +2,10 @@ import asyncio import tempfile +from collections.abc import Awaitable from http import HTTPStatus from io import StringIO -from typing import Awaitable, Callable, List, Optional +from typing import Callable, Optional import aiohttp import torch @@ -143,7 +144,7 @@ async def read_file(path_or_url: str) -> str: async def write_local_file(output_path: str, - batch_outputs: List[BatchRequestOutput]) -> None: + batch_outputs: list[BatchRequestOutput]) -> None: """ Write the responses to a local file. output_path: The path to write the responses to. @@ -204,7 +205,7 @@ async def upload_data(output_url: str, data_or_file: str, f"Error message: {str(e)}.") from e -async def write_file(path_or_url: str, batch_outputs: List[BatchRequestOutput], +async def write_file(path_or_url: str, batch_outputs: list[BatchRequestOutput], output_tmp_dir: str) -> None: """ Write batch_outputs to a file or upload to a URL. @@ -353,7 +354,7 @@ async def main(args): logger.info("Reading batch from %s...", args.input_file) # Submit all requests in the file to the engine "concurrently". - response_futures: List[Awaitable[BatchRequestOutput]] = [] + response_futures: list[Awaitable[BatchRequestOutput]] = [] for request_json in (await read_file(args.input_file)).strip().split("\n"): # Skip empty lines. 
request_json = request_json.strip() diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 02dd2c4881c62..98e9ea0fc61a2 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -3,10 +3,9 @@ import asyncio import json import time -from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, Final, List, - Optional) -from typing import Sequence as GenericSequence -from typing import Union +from collections.abc import AsyncGenerator, AsyncIterator +from collections.abc import Sequence as GenericSequence +from typing import Callable, Final, Optional, Union from fastapi import Request @@ -205,7 +204,7 @@ async def create_chat_completion( raw_request.state.request_metadata = request_metadata # Schedule the request and get the result generator. - generators: List[AsyncGenerator[RequestOutput, None]] = [] + generators: list[AsyncGenerator[RequestOutput, None]] = [] try: for i, engine_prompt in enumerate(engine_prompts): sampling_params: Union[SamplingParams, BeamSearchParams] @@ -282,7 +281,7 @@ async def chat_completion_stream_generator( result_generator: AsyncIterator[RequestOutput], request_id: str, model_name: str, - conversation: List[ConversationMessage], + conversation: list[ConversationMessage], tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, ) -> AsyncGenerator[str, None]: @@ -310,7 +309,7 @@ async def chat_completion_stream_generator( should_stream_with_reasoning_parsing = ( self._should_stream_with_reasoning_parsing(request)) - all_previous_token_ids: Optional[List[List[int]]] + all_previous_token_ids: Optional[list[list[int]]] # Only one of these will be used, thus previous_texts and # all_previous_token_ids will not be used twice in the same iteration. 
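A self-contained sketch (placeholder payloads, outside vLLM) of the collections.abc.AsyncGenerator annotation that this file now uses for its streaming generators:

from collections.abc import AsyncGenerator


async def stream_sse(chunks: list[str]) -> AsyncGenerator[str, None]:
    # Yields server-sent-event style frames; the ABC annotation replaces
    # typing.AsyncGenerator, which has been deprecated since Python 3.9.
    for chunk in chunks:
        yield f"data: {chunk}\n\n"
    yield "data: [DONE]\n\n"

# Usage:  async for frame in stream_sse(["a", "b"]): ...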
@@ -339,7 +338,7 @@ async def chat_completion_stream_generator( # Prepare the tool parser if it's needed try: if tool_choice_auto and self.tool_parser: - tool_parsers: List[Optional[ToolParser]] = [ + tool_parsers: list[Optional[ToolParser]] = [ self.tool_parser(tokenizer) ] * num_choices else: @@ -406,7 +405,7 @@ async def chat_completion_stream_generator( # Send response to echo the input portion of the # last message if request.echo: - last_msg_content: Union[str, List[Dict[str, str]]] = "" + last_msg_content: Union[str, list[dict[str, str]]] = "" if conversation and "content" in conversation[ -1] and conversation[-1].get("role") == role: last_msg_content = conversation[-1]["content"] or "" @@ -674,7 +673,7 @@ async def chat_completion_full_generator( result_generator: AsyncIterator[RequestOutput], request_id: str, model_name: str, - conversation: List[ConversationMessage], + conversation: list[ConversationMessage], tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, ) -> Union[ErrorResponse, ChatCompletionResponse]: @@ -693,7 +692,7 @@ async def chat_completion_full_generator( assert final_res is not None - choices: List[ChatCompletionResponseChoice] = [] + choices: list[ChatCompletionResponseChoice] = [] role = self.get_chat_request_role(request) for output in final_res.outputs: @@ -812,7 +811,7 @@ async def chat_completion_full_generator( choices.append(choice_data) if request.echo: - last_msg_content: Union[str, List[Dict[str, str]]] = "" + last_msg_content: Union[str, list[dict[str, str]]] = "" if conversation and "content" in conversation[-1] and conversation[ -1].get("role") == role: last_msg_content = conversation[-1]["content"] or "" @@ -853,8 +852,8 @@ async def chat_completion_full_generator( return response def _get_top_logprobs( - self, logprobs: Dict[int, Logprob], top_logprobs: Optional[int], - tokenizer: AnyTokenizer) -> List[ChatCompletionLogProb]: + self, logprobs: dict[int, Logprob], top_logprobs: Optional[int], + tokenizer: AnyTokenizer) -> list[ChatCompletionLogProb]: return [ ChatCompletionLogProb(token=(token := self._get_decoded_token( p[1], @@ -871,12 +870,12 @@ def _get_top_logprobs( def _create_chat_logprobs( self, token_ids: GenericSequence[int], - top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]], + top_logprobs: GenericSequence[Optional[dict[int, Logprob]]], tokenizer: AnyTokenizer, num_output_top_logprobs: Optional[int] = None, ) -> ChatCompletionLogProbs: """Create OpenAI-style logprobs.""" - logprobs_content: List[ChatCompletionLogProbsContent] = [] + logprobs_content: list[ChatCompletionLogProbsContent] = [] for i, token_id in enumerate(token_ids): step_top_logprobs = top_logprobs[i] diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 840f0f9b8448b..ed09af84f64ba 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -2,9 +2,9 @@ import asyncio import time -from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional -from typing import Sequence as GenericSequence -from typing import Tuple, Union, cast +from collections.abc import AsyncGenerator, AsyncIterator +from collections.abc import Sequence as GenericSequence +from typing import Optional, Union, cast from fastapi import Request @@ -113,7 +113,7 @@ async def create_completion( return self.create_error_response(str(e)) # Schedule the request and get the result generator. 
- generators: List[AsyncGenerator[RequestOutput, None]] = [] + generators: list[AsyncGenerator[RequestOutput, None]] = [] try: for i, engine_prompt in enumerate(engine_prompts): sampling_params: Union[SamplingParams, BeamSearchParams] @@ -189,7 +189,7 @@ async def create_completion( request_metadata=request_metadata) # Non-streaming response - final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts + final_res_batch: list[Optional[RequestOutput]] = [None] * num_prompts try: async for i, res in result_generator: final_res_batch[i] = res @@ -203,7 +203,7 @@ async def create_completion( if final_res.prompt is None: final_res.prompt = request_prompts[i]["prompt"] - final_res_batch_checked = cast(List[RequestOutput], + final_res_batch_checked = cast(list[RequestOutput], final_res_batch) response = self.request_output_to_completion_response( @@ -237,7 +237,7 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]: async def completion_stream_generator( self, request: CompletionRequest, - result_generator: AsyncIterator[Tuple[int, RequestOutput]], + result_generator: AsyncIterator[tuple[int, RequestOutput]], request_id: str, created_time: int, model_name: str, @@ -270,7 +270,7 @@ async def completion_stream_generator( num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids) delta_token_ids: GenericSequence[int] - out_logprobs: Optional[GenericSequence[Optional[Dict[ + out_logprobs: Optional[GenericSequence[Optional[dict[ int, Logprob]]]] for output in res.outputs: @@ -381,7 +381,7 @@ async def completion_stream_generator( def request_output_to_completion_response( self, - final_res_batch: List[RequestOutput], + final_res_batch: list[RequestOutput], request: CompletionRequest, request_id: str, created_time: int, @@ -389,7 +389,7 @@ def request_output_to_completion_response( tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, ) -> CompletionResponse: - choices: List[CompletionResponseChoice] = [] + choices: list[CompletionResponseChoice] = [] num_prompt_tokens = 0 num_generated_tokens = 0 @@ -406,7 +406,7 @@ def request_output_to_completion_response( prompt_text = final_res.prompt token_ids: GenericSequence[int] - out_logprobs: Optional[GenericSequence[Optional[Dict[int, + out_logprobs: Optional[GenericSequence[Optional[dict[int, Logprob]]]] for output in final_res.outputs: @@ -480,16 +480,16 @@ def request_output_to_completion_response( def _create_completion_logprobs( self, token_ids: GenericSequence[int], - top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]], + top_logprobs: GenericSequence[Optional[dict[int, Logprob]]], num_output_top_logprobs: int, tokenizer: AnyTokenizer, initial_text_offset: int = 0, ) -> CompletionLogProbs: """Create logprobs for OpenAI Completion API.""" - out_text_offset: List[int] = [] - out_token_logprobs: List[Optional[float]] = [] - out_tokens: List[str] = [] - out_top_logprobs: List[Optional[Dict[str, float]]] = [] + out_text_offset: list[int] = [] + out_token_logprobs: list[Optional[float]] = [] + out_tokens: list[str] = [] + out_top_logprobs: list[Optional[dict[str, float]]] = [] last_token_len = 0 diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 607dbd96b1945..5f6e06e6f79f0 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -3,7 +3,8 @@ import asyncio import base64 import time -from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast +from collections.abc import 
AsyncGenerator +from typing import Final, Literal, Optional, Union, cast import numpy as np from fastapi import Request @@ -31,7 +32,7 @@ def _get_embedding( output: EmbeddingOutput, encoding_format: Literal["float", "base64"], -) -> Union[List[float], str]: +) -> Union[list[float], str]: if encoding_format == "float": return output.embedding elif encoding_format == "base64": @@ -143,7 +144,7 @@ async def create_embedding( return self.create_error_response(str(e)) # Schedule the request and get the result generator. - generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] try: pooling_params = request.to_pooling_params() @@ -178,7 +179,7 @@ async def create_embedding( num_prompts = len(engine_prompts) # Non-streaming response - final_res_batch: List[Optional[PoolingRequestOutput]] + final_res_batch: list[Optional[PoolingRequestOutput]] final_res_batch = [None] * num_prompts try: async for i, res in result_generator: @@ -186,7 +187,7 @@ async def create_embedding( assert all(final_res is not None for final_res in final_res_batch) - final_res_batch_checked = cast(List[PoolingRequestOutput], + final_res_batch_checked = cast(list[PoolingRequestOutput], final_res_batch) response = self.request_output_to_embedding_response( @@ -206,13 +207,13 @@ async def create_embedding( def request_output_to_embedding_response( self, - final_res_batch: List[PoolingRequestOutput], + final_res_batch: list[PoolingRequestOutput], request_id: str, created_time: int, model_name: str, encoding_format: Literal["float", "base64"], ) -> EmbeddingResponse: - items: List[EmbeddingResponseData] = [] + items: list[EmbeddingResponseData] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index d097bfcfc5ab7..59333dbfd24e2 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,15 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 import json +from collections.abc import Iterable, Iterator, Mapping, Sequence from concurrent.futures.thread import ThreadPoolExecutor from http import HTTPStatus -from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping, - Optional, Sequence, Tuple, TypedDict, Union) +from typing import Annotated, Any, Callable, Optional, TypedDict, Union from fastapi import Request from pydantic import Field from starlette.datastructures import Headers -from typing_extensions import Annotated from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient @@ -64,10 +63,10 @@ class TextTokensPrompt(TypedDict): prompt: str - prompt_token_ids: List[int] + prompt_token_ids: list[int] -RequestPrompt = Union[List[int], str, TextTokensPrompt] +RequestPrompt = Union[list[int], str, TextTokensPrompt] class OpenAIServing: @@ -144,7 +143,7 @@ async def _check_model( def _maybe_get_adapters( self, request: AnyRequest - ) -> Union[Tuple[None, None], Tuple[LoRARequest, None], Tuple[ + ) -> Union[tuple[None, None], tuple[LoRARequest, None], tuple[ None, PromptAdapterRequest]]: if self._is_model_supported(request.model): return None, None @@ -188,7 +187,7 @@ def _normalize_prompt_tokens_to_input( self, request: AnyRequest, tokenizer: AnyTokenizer, - prompt_ids: List[int], + prompt_ids: list[int], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]], ) -> TextTokensPrompt: if truncate_prompt_tokens is None: @@ -203,7 +202,7 @@ def 
_normalize_prompt_tokens_to_input( def _validate_input( self, request: AnyRequest, - input_ids: List[int], + input_ids: list[int], input_text: str, ) -> TextTokensPrompt: token_num = len(input_ids) @@ -259,7 +258,7 @@ def _tokenize_prompt_input( self, request: AnyRequest, tokenizer: AnyTokenizer, - prompt_input: Union[str, List[int]], + prompt_input: Union[str, list[int]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, ) -> TextTokensPrompt: @@ -280,7 +279,7 @@ def _tokenize_prompt_inputs( self, request: AnyRequest, tokenizer: AnyTokenizer, - prompt_inputs: Iterable[Union[str, List[int]]], + prompt_inputs: Iterable[Union[str, list[int]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, ) -> Iterator[TextTokensPrompt]: @@ -309,10 +308,10 @@ def _tokenize_prompt_input_or_inputs( self, request: AnyRequest, tokenizer: AnyTokenizer, - input_or_inputs: Union[str, List[str], List[int], List[List[int]]], + input_or_inputs: Union[str, list[str], list[int], list[list[int]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, - ) -> List[TextTokensPrompt]: + ) -> list[TextTokensPrompt]: """ Tokenize/detokenize depending on the input format. @@ -344,10 +343,10 @@ async def _preprocess_completion( self, request: CompletionLikeRequest, tokenizer: AnyTokenizer, - input_or_inputs: Union[str, List[str], List[int], List[List[int]]], + input_or_inputs: Union[str, list[str], list[int], list[list[int]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, - ) -> Tuple[List[TextTokensPrompt], List[TokensPrompt]]: + ) -> tuple[list[TextTokensPrompt], list[TokensPrompt]]: request_prompts = await self._tokenize_prompt_input_or_inputs_async( request, tokenizer, @@ -367,19 +366,19 @@ async def _preprocess_chat( self, request: ChatLikeRequest, tokenizer: AnyTokenizer, - messages: List[ChatCompletionMessageParam], + messages: list[ChatCompletionMessageParam], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, add_generation_prompt: bool = True, continue_final_message: bool = False, - tool_dicts: Optional[List[Dict[str, Any]]] = None, - documents: Optional[List[Dict[str, str]]] = None, - chat_template_kwargs: Optional[Dict[str, Any]] = None, + tool_dicts: Optional[list[dict[str, Any]]] = None, + documents: Optional[list[dict[str, str]]] = None, + chat_template_kwargs: Optional[dict[str, Any]] = None, tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None, truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = False, - ) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt], - List[TokensPrompt]]: + ) -> tuple[list[ConversationMessage], Sequence[RequestPrompt], + list[TokensPrompt]]: resolved_content_format = resolve_chat_template_content_format( chat_template, chat_template_content_format, @@ -392,7 +391,7 @@ async def _preprocess_chat( content_format=resolved_content_format, ) - _chat_template_kwargs: Dict[str, Any] = dict( + _chat_template_kwargs: dict[str, Any] = dict( chat_template=chat_template, add_generation_prompt=add_generation_prompt, continue_final_message=continue_final_message, @@ -401,7 +400,7 @@ async def _preprocess_chat( ) _chat_template_kwargs.update(chat_template_kwargs or {}) - request_prompt: Union[str, List[int]] + request_prompt: Union[str, list[int]] if isinstance(tokenizer, 
MistralTokenizer): request_prompt = apply_mistral_chat_template( tokenizer, diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 0f4a174a8c15a..38a66583022a2 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -4,7 +4,7 @@ import pathlib from dataclasses import dataclass from http import HTTPStatus -from typing import List, Optional, Union +from typing import Optional, Union from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient @@ -53,10 +53,10 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + base_model_paths: list[BaseModelPath], *, - lora_modules: Optional[List[LoRAModulePath]] = None, - prompt_adapters: Optional[List[PromptAdapterPath]] = None, + lora_modules: Optional[list[LoRAModulePath]] = None, + prompt_adapters: Optional[list[PromptAdapterPath]] = None, ): super().__init__() @@ -65,7 +65,7 @@ def __init__( self.engine_client = engine_client self.static_lora_modules = lora_modules - self.lora_requests: List[LoRARequest] = [] + self.lora_requests: list[LoRARequest] = [] self.lora_id_counter = AtomicCounter(0) self.prompt_adapter_requests = [] diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index bbf5aed1a33c8..0a3ca2aa7c5bf 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -3,7 +3,8 @@ import asyncio import base64 import time -from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast +from collections.abc import AsyncGenerator +from typing import Final, Literal, Optional, Union, cast import numpy as np from fastapi import Request @@ -29,7 +30,7 @@ def _get_data( output: PoolingOutput, encoding_format: Literal["float", "base64"], -) -> Union[List[float], str]: +) -> Union[list[float], str]: if encoding_format == "float": return output.data.tolist() elif encoding_format == "base64": @@ -139,7 +140,7 @@ async def create_pooling( return self.create_error_response(str(e)) # Schedule the request and get the result generator. 
- generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] try: pooling_params = request.to_pooling_params() @@ -174,7 +175,7 @@ async def create_pooling( num_prompts = len(engine_prompts) # Non-streaming response - final_res_batch: List[Optional[PoolingRequestOutput]] + final_res_batch: list[Optional[PoolingRequestOutput]] final_res_batch = [None] * num_prompts try: async for i, res in result_generator: @@ -182,7 +183,7 @@ async def create_pooling( assert all(final_res is not None for final_res in final_res_batch) - final_res_batch_checked = cast(List[PoolingRequestOutput], + final_res_batch_checked = cast(list[PoolingRequestOutput], final_res_batch) response = self.request_output_to_pooling_response( @@ -202,13 +203,13 @@ async def create_pooling( def request_output_to_pooling_response( self, - final_res_batch: List[PoolingRequestOutput], + final_res_batch: list[PoolingRequestOutput], request_id: str, created_time: int, model_name: str, encoding_format: Literal["float", "base64"], ) -> PoolingResponse: - items: List[PoolingResponseData] = [] + items: list[PoolingResponseData] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index a087a8d9ba0f9..73b4288cbb0d8 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio import time -from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Union +from collections.abc import AsyncGenerator, Mapping +from typing import Any, Optional, Union from fastapi import Request @@ -48,8 +49,8 @@ def __init__( async def _embedding_score( self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - texts_1: List[str], - texts_2: List[str], + texts_1: list[str], + texts_2: list[str], request: Union[RerankRequest, ScoreRequest], request_id=str, tokenization_kwargs: Optional[dict[str, Any]] = None, @@ -57,11 +58,11 @@ async def _embedding_score( prompt_adapter_request: Optional[Union[PromptAdapterRequest, None]] = None, trace_headers: Optional[Mapping[str, str]] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: input_texts = texts_1 + texts_2 - engine_prompts: List[TokensPrompt] = [] + engine_prompts: list[TokensPrompt] = [] tokenize_async = make_async(tokenizer.__call__, executor=self._tokenizer_executor) @@ -82,7 +83,7 @@ async def _embedding_score( prompt_token_ids=text_token_prompt["prompt_token_ids"])) # Schedule the request and get the result generator. 
- generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] pooling_params = request.to_pooling_params() for i, engine_prompt in enumerate(engine_prompts): @@ -108,16 +109,16 @@ async def _embedding_score( result_generator = merge_async_iterators(*generators) # Non-streaming response - final_res_batch: List[PoolingRequestOutput] = [] + final_res_batch: list[PoolingRequestOutput] = [] - embeddings: List[Optional[PoolingRequestOutput]] =\ + embeddings: list[Optional[PoolingRequestOutput]] =\ [None] * len(engine_prompts) async for i, res in result_generator: embeddings[i] = res - emb_texts_1: List[PoolingRequestOutput] = [] - emb_texts_2: List[PoolingRequestOutput] = [] + emb_texts_1: list[PoolingRequestOutput] = [] + emb_texts_2: list[PoolingRequestOutput] = [] for i in range(0, len(texts_1)): assert (emb := embeddings[i]) is not None @@ -139,8 +140,8 @@ async def _embedding_score( async def _cross_encoding_score( self, tokenizer: Union[AnyTokenizer], - texts_1: List[str], - texts_2: List[str], + texts_1: list[str], + texts_2: list[str], request: Union[RerankRequest, ScoreRequest], request_id=str, tokenization_kwargs: Optional[dict[str, Any]] = None, @@ -148,10 +149,10 @@ async def _cross_encoding_score( prompt_adapter_request: Optional[Union[PromptAdapterRequest, None]] = None, trace_headers: Optional[Mapping[str, str]] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: - request_prompts: List[str] = [] - engine_prompts: List[TokensPrompt] = [] + request_prompts: list[str] = [] + engine_prompts: list[TokensPrompt] = [] if len(texts_1) == 1: texts_1 = texts_1 * len(texts_2) @@ -185,7 +186,7 @@ async def _cross_encoding_score( engine_prompts.append(engine_prompt) # Schedule the request and get the result generator. 
- generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] pooling_params = request.to_pooling_params() @@ -212,7 +213,7 @@ async def _cross_encoding_score( result_generator = merge_async_iterators(*generators) # Non-streaming response - final_res_batch: List[ + final_res_batch: list[ Optional[PoolingRequestOutput]] = [None] * len(engine_prompts) async for i, res in result_generator: @@ -228,9 +229,9 @@ async def _run_scoring( request_id: str, raw_request: Optional[Request] = None, truncate_prompt_tokens: Optional[int] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: - tokenization_kwargs: Dict[str, Any] = {} + tokenization_kwargs: dict[str, Any] = {} if truncate_prompt_tokens is not None: tokenization_kwargs["truncation"] = True tokenization_kwargs["max_length"] = truncate_prompt_tokens @@ -372,12 +373,12 @@ async def do_rerank( def request_output_to_score_response( self, - final_res_batch: List[PoolingRequestOutput], + final_res_batch: list[PoolingRequestOutput], request_id: str, created_time: int, model_name: str, ) -> ScoreResponse: - items: List[ScoreResponseData] = [] + items: list[ScoreResponseData] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): @@ -406,13 +407,13 @@ def request_output_to_score_response( ) def request_output_to_rerank_response( - self, final_res_batch: List[PoolingRequestOutput], request_id: str, - model_name: str, documents: List[str], + self, final_res_batch: list[PoolingRequestOutput], request_id: str, + model_name: str, documents: list[str], top_n: int) -> RerankResponse: """ Convert the output of do_rank to a RerankResponse """ - results: List[RerankResult] = [] + results: list[RerankResult] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): classify_res = ScoringRequestOutput.from_base(final_res) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 6c79adf90c8ad..4e95ef59e80eb 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Final, List, Optional, Union +from typing import Final, Optional, Union from fastapi import Request @@ -92,7 +92,7 @@ async def create_tokenize( logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) - input_ids: List[int] = [] + input_ids: list[int] = [] for i, engine_prompt in enumerate(engine_prompts): self._log_inputs(request_id, request_prompts[i], diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 0bedb5718a4b4..77f016a5e0a4a 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio import io -from typing import AsyncGenerator, Optional, Union, cast +from collections.abc import AsyncGenerator +from typing import Optional, Union, cast from fastapi import Request diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 7cdd6d4c4f2ba..931d5aab9bd9d 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import os +from 
collections.abc import Sequence from functools import cached_property -from typing import Callable, Dict, List, Optional, Sequence, Type, Union +from typing import Callable, Optional, Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, @@ -22,16 +23,16 @@ class ToolParser: """ def __init__(self, tokenizer: AnyTokenizer): - self.prev_tool_call_arr: List[Dict] = [] + self.prev_tool_call_arr: list[dict] = [] # the index of the tool call that is currently being parsed self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.streamed_args_for_tool: List[str] = [] + self.streamed_args_for_tool: list[str] = [] self.model_tokenizer = tokenizer @cached_property - def vocab(self) -> Dict[str, int]: + def vocab(self) -> dict[str, int]: # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab # whereas all tokenizers have .get_vocab() return self.model_tokenizer.get_vocab() @@ -79,10 +80,10 @@ def extract_tool_calls_streaming( class ToolParserManager: - tool_parsers: Dict[str, Type] = {} + tool_parsers: dict[str, type] = {} @classmethod - def get_tool_parser(cls, name) -> Type: + def get_tool_parser(cls, name) -> type: """ Get tool parser by name which is registered by `register_module`. @@ -95,8 +96,8 @@ def get_tool_parser(cls, name) -> Type: @classmethod def _register_module(cls, - module: Type, - module_name: Optional[Union[str, List[str]]] = None, + module: type, + module_name: Optional[Union[str, list[str]]] = None, force: bool = True) -> None: if not issubclass(module, ToolParser): raise TypeError( @@ -116,9 +117,9 @@ def _register_module(cls, @classmethod def register_module( cls, - name: Optional[Union[str, List[str]]] = None, + name: Optional[Union[str, list[str]]] = None, force: bool = True, - module: Union[Type, None] = None) -> Union[type, Callable]: + module: Union[type, None] = None) -> Union[type, Callable]: """ Register module with the given name or name list. it can be used as a decoder(with module as None) or normal function(with module as not diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 002bf17388308..76da63c580082 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -2,8 +2,9 @@ import json import re +from collections.abc import Sequence from json import JSONDecoder -from typing import Dict, Sequence, Union +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -145,7 +146,7 @@ def extract_tool_calls_streaming( return None # select as the current tool call the one we're on the state at - current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + current_tool_call: dict = tool_call_arr[self.current_tool_id] \ if len(tool_call_arr) > 0 else {} # case -- if no tokens have been streamed for the tool, e.g. 
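For readers unfamiliar with the registry touched above, a sketch of how a parser is typically attached to ToolParserManager via the decorator form of register_module (the parser name and trivial body are hypothetical, and the abstract method signature is assumed to match the classes shown in this diff):

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              ExtractedToolCallInformation)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
    ToolParser, ToolParserManager)


@ToolParserManager.register_module("example")
class ExampleToolParser(ToolParser):

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        # Trivial parser: never reports a tool call, passes the text through.
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)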
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index c948ed78f503b..91afc88ef3dde 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Dict, Sequence, Union +from collections.abc import Sequence +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -136,7 +137,7 @@ def extract_tool_calls_streaming( return None # select as the current tool call the one we're on the state at - current_tool_call: Dict = tool_call_arr[self.current_tool_id] + current_tool_call: dict = tool_call_arr[self.current_tool_id] delta = None # case: we are starting a new tool in the array diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index 4841b28703ee3..4c39e9b0c61f1 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -2,7 +2,8 @@ import json import re -from typing import Dict, List, Sequence, Union +from collections.abc import Sequence +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -33,9 +34,9 @@ def __init__(self, tokenizer: AnyTokenizer): self.model_tokenizer = self.model_tokenizer.tokenizer self.current_tool_name_sent: bool = False - self.prev_tool_call_arr: List[Dict] = [] + self.prev_tool_call_arr: list[dict] = [] self.current_tool_id: int = -1 - self.streamed_args_for_tool: List[str] = [ + self.streamed_args_for_tool: list[str] = [ ] # map what has been streamed for each tool so far to a list self.tool_call_start_token: str = "" diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index b9215e7979bf5..57d7c77c64f74 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Dict, Sequence, Union +from collections.abc import Sequence +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -90,7 +91,7 @@ def extract_tool_calls_streaming( # tool calls are generated in an object in inernlm2 # it's not support parallel tool calls try: - tool_call_arr: Dict = partial_json_parser.loads( + tool_call_arr: dict = partial_json_parser.loads( parsable_arr, flags) except partial_json_parser.core.exceptions.MalformedJSON: logger.debug('not enough tokens to parse into JSON yet') diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 7c4d63e188653..8df106bf27185 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -2,7 +2,8 @@ import json import re -from typing import Dict, List, Sequence, Union +from collections.abc import Sequence +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -35,9 +36,9 @@ def __init__(self, tokenizer: AnyTokenizer): ) self.current_tool_name_sent: bool = False - self.prev_tool_call_arr: List[Dict] = [] + self.prev_tool_call_arr: list[dict] = [] 
self.current_tool_id: int = -1 - self.streamed_args_for_tool: List[str] = [ + self.streamed_args_for_tool: list[str] = [ ] # map what has been streamed for each tool so far to a list self.tool_calls_start_token: str = "" @@ -157,7 +158,7 @@ def extract_tool_calls_streaming( # tool calls are generated in an array, so do partial JSON # parsing on the entire array try: - tool_call_arr: List[Dict] = partial_json_parser.loads( + tool_call_arr: list[dict] = partial_json_parser.loads( parsable_arr, flags) except partial_json_parser.core.exceptions.MalformedJSON: logger.debug('not enough tokens to parse into JSON yet') @@ -165,7 +166,7 @@ def extract_tool_calls_streaming( # select as the current tool call the one we're on the state at - current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + current_tool_call: dict = tool_call_arr[self.current_tool_id] \ if len(tool_call_arr) > 0 else {} # case -- if no tokens have been streamed for the tool, e.g. diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 6a7b113623e65..20c3238fb3dfe 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -2,8 +2,9 @@ import json import re +from collections.abc import Sequence from json import JSONDecoder -from typing import Dict, List, Sequence, Union +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -40,10 +41,10 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase): # initialize properties used for state when parsing tool calls in # streaming mode - self.prev_tool_call_arr: List[Dict] = [] + self.prev_tool_call_arr: list[dict] = [] self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.streamed_args_for_tool: List[str] = [ + self.streamed_args_for_tool: list[str] = [ ] # map what has been streamed for each tool so far to a list self.bot_token = "<|python_tag|>" self.bot_token_id = tokenizer.encode(self.bot_token, @@ -78,7 +79,7 @@ def extract_tool_calls( start_idx += end_idx + len('; ') function_call_arr.append(obj) - tool_calls: List[ToolCall] = [ + tool_calls: list[ToolCall] = [ ToolCall( type="function", function=FunctionCall( @@ -152,7 +153,7 @@ def extract_tool_calls_streaming( return None # select as the current tool call the one we're on the state at - current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + current_tool_call: dict = tool_call_arr[self.current_tool_id] \ if len(tool_call_arr) > 0 else {} # case -- if no tokens have been streamed for the tool, e.g. 
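The streaming parsers above all share the same partial-JSON trick; here is a standalone sketch of that pattern with a toy input string (not vLLM code):

import partial_json_parser
from partial_json_parser.core.options import Allow

# A tool-call array that is still being streamed token by token.
partial = '[{"name": "get_weather", "arguments": {"city": "Par'

# Same flag logic as the parsers above: allow partial objects everywhere,
# but do not surface half-streamed string values.
flags = Allow.ALL & ~Allow.STR
tool_call_arr: list[dict] = partial_json_parser.loads(partial, flags)
# The complete prefix parses; the truncated "city" value is simply dropped.
print(tool_call_arr)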
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index 4f04808829925..0661445639d74 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -2,9 +2,10 @@ import json import re +from collections.abc import Sequence from random import choices from string import ascii_letters, digits -from typing import Dict, List, Sequence, Union +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -56,10 +57,10 @@ def __init__(self, tokenizer: AnyTokenizer): # initialize properties used for state when parsing tool calls in # streaming mode - self.prev_tool_call_arr: List[Dict] = [] + self.prev_tool_call_arr: list[dict] = [] self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.streamed_args_for_tool: List[str] = [ + self.streamed_args_for_tool: list[str] = [ ] # map what has been streamed for each tool so far to a list self.bot_token = "[TOOL_CALLS]" self.bot_token_id = self.vocab.get(self.bot_token) @@ -104,7 +105,7 @@ def extract_tool_calls( function_call_arr = json.loads(raw_tool_call) # Tool Call - tool_calls: List[MistralToolCall] = [ + tool_calls: list[MistralToolCall] = [ MistralToolCall( type="function", function=FunctionCall( @@ -172,7 +173,7 @@ def extract_tool_calls_streaming( # tool calls are generated in an array, so do partial JSON # parsing on the entire array try: - tool_call_arr: List[Dict] = partial_json_parser.loads( + tool_call_arr: list[dict] = partial_json_parser.loads( parsable_arr, flags) except partial_json_parser.core.exceptions.MalformedJSON: logger.debug('not enough tokens to parse into JSON yet') @@ -180,7 +181,7 @@ def extract_tool_calls_streaming( # select as the current tool call the one we're on the state at - current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + current_tool_call: dict = tool_call_arr[self.current_tool_id] \ if len(tool_call_arr) > 0 else {} # case -- if no tokens have been streamed for the tool, e.g. 
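The choices/ascii_letters/digits imports kept above feed Mistral's tool-call-id generation; a toy version of that helper follows (the 9-character default is an assumption for illustration, not taken from this patch):

from random import choices
from string import ascii_letters, digits

ALPHANUMERIC = ascii_letters + digits


def random_tool_call_id(length: int = 9) -> str:
    # Short alphanumeric id in the style Mistral-format tool calls expect.
    return "".join(choices(ALPHANUMERIC, k=length))


print(random_tool_call_id())  # e.g. 'a1B2c3D4e'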
diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index 5c282b5c2605a..1b9317f16f345 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -3,7 +3,8 @@ import ast import json import re -from typing import Any, Sequence, Tuple, Union +from collections.abc import Sequence +from typing import Any, Union from transformers import PreTrainedTokenizerBase @@ -204,7 +205,7 @@ def _handle_single_tool(call: ast.Call) -> ToolCall: arguments=json.dumps(arguments))) -def _make_valid_python(text: str) -> Union[Tuple[str, str], None]: +def _make_valid_python(text: str) -> Union[tuple[str, str], None]: bracket_stack = [] for index, char in enumerate(text): if char in {"[", "(", "{"}: diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py index 945cbd6835028..7997629d461a4 100644 --- a/vllm/entrypoints/openai/tool_parsers/utils.py +++ b/vllm/entrypoints/openai/tool_parsers/utils.py @@ -2,7 +2,7 @@ import json from json import JSONDecodeError, JSONDecoder -from typing import Any, List, Tuple +from typing import Any import partial_json_parser from partial_json_parser.core.options import Allow @@ -82,7 +82,7 @@ def extract_intermediate_diff(curr: str, old: str) -> str: return diff -def find_all_indices(string: str, substring: str) -> List[int]: +def find_all_indices(string: str, substring: str) -> list[int]: """ Find all (starting) indices of a substring in a given string. Useful for tool call extraction @@ -99,7 +99,7 @@ def find_all_indices(string: str, substring: str) -> List[int]: # partial_json_parser doesn't support extra data and # JSONDecorder.raw_decode doesn't support partial JSON -def partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]: +def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]: try: return (partial_json_parser.loads(input_str, flags), len(input_str)) except JSONDecodeError as e: diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 6ec0b5fb024a1..53411a27b41ee 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Union +from typing import Union from torch.nn import CosineSimilarity @@ -10,12 +10,12 @@ def _cosine_similarity( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - embed_1: List[PoolingRequestOutput], - embed_2: List[PoolingRequestOutput], -) -> List[PoolingRequestOutput]: + embed_1: list[PoolingRequestOutput], + embed_2: list[PoolingRequestOutput], +) -> list[PoolingRequestOutput]: scorer = CosineSimilarity(0) - scores: Union[List[PoolingRequestOutput]] = [] + scores: Union[list[PoolingRequestOutput]] = [] for emb_1, emb_2 in zip(embed_1, embed_2): pair_score = scorer(emb_1.outputs.data, emb_2.outputs.data) @@ -38,8 +38,8 @@ def _cosine_similarity( def _validate_score_input_lens( - texts_1: Union[List[str], List[dict]], - texts_2: Union[List[str], List[dict]], + texts_1: Union[list[str], list[dict]], + texts_2: Union[list[str], list[dict]], ): if len(texts_1) > 1 and len(texts_1) != len(texts_2): raise ValueError("Input lengths must be either 1:1, 1:N or N:N") diff --git a/vllm/envs.py b/vllm/envs.py index 048d63bfec0f7..bf64cd70674da 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -2,7 +2,7 @@ import os import tempfile -from typing import TYPE_CHECKING, Any, 
Callable, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Callable, Optional if TYPE_CHECKING: VLLM_HOST_IP: str = "" @@ -67,12 +67,12 @@ VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False VLLM_TEST_FORCE_FP8_MARLIN: bool = False VLLM_RPC_TIMEOUT: int = 10000 # ms - VLLM_PLUGINS: Optional[List[str]] = None + VLLM_PLUGINS: Optional[list[str]] = None VLLM_TORCH_PROFILER_DIR: Optional[str] = None VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False - VLLM_DISABLED_KERNELS: List[str] = [] + VLLM_DISABLED_KERNELS: list[str] = [] VLLM_USE_V1: bool = False VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True @@ -123,7 +123,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # begin-env-vars-definition -environment_variables: Dict[str, Callable[[], Any]] = { +environment_variables: dict[str, Callable[[], Any]] = { # ================== Installation Time Env Vars ================== diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 6f5adb4f64728..138696370839e 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -3,8 +3,8 @@ import asyncio import time from abc import ABC, abstractmethod -from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, - Union) +from collections.abc import Awaitable +from typing import Any, Callable, Optional, Union import torch.nn as nn from typing_extensions import TypeVar @@ -60,8 +60,8 @@ def _init_executor(self) -> None: def collective_rpc(self, method: Union[str, Callable[..., _R]], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: """ Execute an RPC call on all workers. @@ -86,7 +86,7 @@ def collective_rpc(self, """ raise NotImplementedError - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Determine the number of available blocks for the GPU KV cache and swappable CPU KV cache. @@ -94,7 +94,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: ExecutorBase may require modification of the result, e.g. to ensure the selected cache sizes are compatible with all workers. - Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks + Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks are blocks that are "active" on the device and can be appended to. num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be appended to. @@ -135,7 +135,7 @@ def rpc_func(worker: WorkerBase) -> _R: def execute_model( self, execute_model_req: ExecuteModelRequest - ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: + ) -> Optional[list[Union[SamplerOutput, PoolerOutput]]]: output = self.collective_rpc("execute_model", args=(execute_model_req, )) return output[0] @@ -156,7 +156,7 @@ def pin_lora(self, lora_id: int) -> bool: assert lora_id > 0, "lora_id must be greater than 0." return all(self.collective_rpc("pin_lora", args=(lora_id, ))) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: sets = self.collective_rpc("list_loras") for s in sets: assert s == sets[0], "All workers should have the same LORAs." 
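A brief sketch of driving the collective_rpc API with the builtin-generic signature introduced above (the executor argument is a placeholder for any ExecutorBase instance; the helper is illustrative, not part of this patch):

from typing import Any, Optional


def call_all_workers(executor,
                     method: str,
                     args: tuple = (),
                     kwargs: Optional[dict[str, Any]] = None) -> list[Any]:
    # Fans the call out to every worker; one result per worker comes back,
    # matching the list[_R] return type in the hunk above.
    return executor.collective_rpc(method, args=args, kwargs=kwargs)


# e.g. each worker reports a (num_gpu_blocks, num_cpu_blocks) pair:
# blocks: list[tuple[int, int]] = call_all_workers(
#     executor, "determine_num_available_blocks")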
@@ -184,7 +184,7 @@ def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: self.collective_rpc("pin_prompt_adapter", args=(prompt_adapter_id, ))) - def list_prompt_adapters(self) -> Set[int]: + def list_prompt_adapters(self) -> set[int]: sets = self.collective_rpc("list_prompt_adapters") for s in sets: assert (s == sets[0] @@ -245,7 +245,7 @@ def __del__(self): async def execute_model_async( self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + execute_model_req: ExecuteModelRequest) -> list[SamplerOutput]: """Executes one model step on the given sequences.""" output = await make_async(self.execute_model)(execute_model_req) return output @@ -273,7 +273,7 @@ def __init__(self, *args, **kwargs): def execute_model( self, execute_model_req: ExecuteModelRequest, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: # TODO: unify into collective_rpc if self.parallel_worker_tasks is None: self.parallel_worker_tasks = self._run_workers( @@ -299,7 +299,7 @@ def stop_remote_worker_execution_loop(self) -> None: @abstractmethod def _driver_execute_model( self, execute_model_req: Optional[ExecuteModelRequest] - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: """Run execute_model in the driver worker. Passing None will cause the driver to stop the model execution loop @@ -311,8 +311,8 @@ def _driver_execute_model( def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + args: tuple = (), + kwargs: Optional[dict] = None) -> list[Any]: return self._run_workers(method, *args, **(kwargs or {})) @abstractmethod @@ -344,7 +344,7 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: async def execute_model_async( self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + execute_model_req: ExecuteModelRequest) -> list[SamplerOutput]: if self.parallel_worker_tasks is None: # Start model execution loop running in the parallel workers self.parallel_worker_tasks = asyncio.create_task( @@ -368,7 +368,7 @@ async def stop_remote_worker_execution_loop_async(self) -> None: async def _driver_execute_model_async( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: """Execute the model asynchronously in the driver worker. Passing None will cause the driver to stop the model execution diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py index d1f8c36fbbec7..e175568923c08 100644 --- a/vllm/executor/mp_distributed_executor.py +++ b/vllm/executor/mp_distributed_executor.py @@ -2,7 +2,7 @@ import asyncio import os -from typing import Any, Callable, List, Optional, Union +from typing import Any, Callable, Optional, Union import cloudpickle @@ -72,15 +72,15 @@ def _init_executor(self) -> None: distributed_init_method = get_distributed_init_method( "127.0.0.1", get_open_port()) - self.workers: List[ProcessWorkerWrapper] = [] + self.workers: list[ProcessWorkerWrapper] = [] # This is the list of workers that are rank 0 of each TP group EXCEPT # global rank 0. These are the workers that will broadcast to the # rest of the workers. - self.tp_driver_workers: List[ProcessWorkerWrapper] = [] + self.tp_driver_workers: list[ProcessWorkerWrapper] = [] # This is the list of workers that are not drivers and not the first # worker in a TP group. These are the workers that will be # broadcasted to. 
- self.non_driver_workers: List[ProcessWorkerWrapper] = [] + self.non_driver_workers: list[ProcessWorkerWrapper] = [] if world_size == 1: self.worker_monitor = None @@ -126,7 +126,7 @@ def _init_executor(self) -> None: max_concurrent_workers=self.parallel_config. max_parallel_loading_workers) self.driver_exec_model = make_async(self.driver_worker.execute_model) - self.pp_locks: Optional[List[asyncio.Lock]] = None + self.pp_locks: Optional[list[asyncio.Lock]] = None def shutdown(self): if (worker_monitor := getattr(self, "worker_monitor", @@ -135,7 +135,7 @@ def shutdown(self): def _driver_execute_model( self, execute_model_req: Optional[ExecuteModelRequest] - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: """Run execute_model in the driver worker. Passing None will cause the driver to stop the model execution @@ -150,7 +150,7 @@ def _run_workers( async_run_tensor_parallel_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, **kwargs, - ) -> List[Any]: + ) -> list[Any]: """Runs the given method on all workers. Args: @@ -204,7 +204,7 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: async def _driver_execute_model_async( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: if not self.tp_driver_workers: return await self.driver_exec_model(execute_model_req) diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index e680d53cbd10e..0a28952a9cdc1 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from array import array -from typing import Any, Type +from typing import Any from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE @@ -18,7 +18,7 @@ def encode_hook(obj: Any) -> Any: return obj.tobytes() -def decode_hook(type: Type, obj: Any) -> Any: +def decode_hook(type: type, obj: Any) -> Any: """Custom msgspec dec hook that supports array types. 
See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index 68a83bb610a49..25e4eb8296525 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -9,8 +9,7 @@ from multiprocessing import Queue from multiprocessing.connection import wait from multiprocessing.process import BaseProcess -from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO, - TypeVar, Union) +from typing import Any, Callable, Generic, Optional, TextIO, TypeVar, Union import torch @@ -82,7 +81,7 @@ class ResultHandler(threading.Thread): def __init__(self) -> None: super().__init__(daemon=True) self.result_queue = get_mp_context().Queue() - self.tasks: Dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {} + self.tasks: dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {} def run(self): for result in iter(self.result_queue.get, _TERMINATE): @@ -102,7 +101,7 @@ def close(self): class WorkerMonitor(threading.Thread): """Monitor worker status (in background thread)""" - def __init__(self, workers: List['ProcessWorkerWrapper'], + def __init__(self, workers: list['ProcessWorkerWrapper'], result_handler: ResultHandler): super().__init__(daemon=True) self.workers = workers diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index c3b41d1c11340..d4c95840665f3 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -4,7 +4,7 @@ import os from collections import defaultdict from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Optional, Union import cloudpickle import msgspec @@ -91,10 +91,10 @@ def _init_executor(self) -> None: self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) self.output_decoder = msgspec.msgpack.Decoder( - Optional[List[SamplerOutput]]) + Optional[list[SamplerOutput]]) self.use_v1 = envs.VLLM_USE_V1 - self.pp_locks: Optional[List[asyncio.Lock]] = None + self.pp_locks: Optional[list[asyncio.Lock]] = None if not self.use_ray_compiled_dag: self.driver_exec_method = make_async( self.driver_worker.execute_method) @@ -112,7 +112,7 @@ def shutdown(self) -> None: self.forward_dag = None def _configure_ray_workers_use_nsight(self, - ray_remote_kwargs) -> Dict[str, Any]: + ray_remote_kwargs) -> dict[str, Any]: # If nsight profiling is enabled, we need to set the profiling # configuration for the ray workers as runtime env. runtime_env = ray_remote_kwargs.setdefault("runtime_env", {}) @@ -138,12 +138,12 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # It holds the resource for the driver worker. self.driver_dummy_worker: Optional[RayWorkerWrapper] = None # The remaining workers are the actual ray actors. - self.workers: List[RayWorkerWrapper] = [] + self.workers: list[RayWorkerWrapper] = [] # Used in ray compiled DAG: indexed first by PP rank, # and then TP rank. In other words, the inner list is # the TP group of workers for a PP rank. 
- self.pp_tp_workers: List[List[RayWorkerWrapper]] = [] + self.pp_tp_workers: list[list[RayWorkerWrapper]] = [] if self.parallel_config.ray_workers_use_nsight: ray_remote_kwargs = self._configure_ray_workers_use_nsight( @@ -152,7 +152,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) # Create the workers. - bundle_indices: List[int] + bundle_indices: list[int] if envs.VLLM_RAY_BUNDLE_INDICES: # Use the bundle indices specified by the user. bundle_indices = list( @@ -172,7 +172,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", bundle_indices.append(bundle_id) bundle_indices = bundle_indices[:self.parallel_config.world_size] - worker_metadata: List[RayWorkerMetaData] = [] + worker_metadata: list[RayWorkerMetaData] = [] driver_ip = get_ip() for rank, bundle_id in enumerate(bundle_indices): scheduling_strategy = PlacementGroupSchedulingStrategy( @@ -233,7 +233,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", "Consider adjusting the Ray placement group or running " "the driver on a GPU node.") - ip_counts: Dict[str, int] = {} + ip_counts: dict[str, int] = {} for ip in worker_ips: ip_counts[ip] = ip_counts.get(ip, 0) + 1 @@ -377,11 +377,11 @@ def sort_by_driver_then_worker_ip(item: RayWorkerMetaData): # This is the list of workers that are rank 0 of each TP group EXCEPT # global rank 0. These are the workers that will broadcast to the # rest of the workers. - self.tp_driver_workers: List[RayWorkerWrapper] = [] + self.tp_driver_workers: list[RayWorkerWrapper] = [] # This is the list of workers that are not drivers and not the first # worker in a TP group. These are the workers that will be # broadcasted to. - self.non_driver_workers: List[RayWorkerWrapper] = [] + self.non_driver_workers: list[RayWorkerWrapper] = [] # Enforce rank order for correct rank to return final output. for index, worker in enumerate(self.workers): @@ -394,7 +394,7 @@ def sort_by_driver_then_worker_ip(item: RayWorkerMetaData): def _driver_execute_model( self, execute_model_req: Optional[ExecuteModelRequest] - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: """Run execute_model in the driver worker. 
Passing None will cause the driver to stop the model execution @@ -407,7 +407,7 @@ def _driver_execute_model( def execute_model( self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + execute_model_req: ExecuteModelRequest) -> list[SamplerOutput]: if not self.use_ray_spmd_worker: return super().execute_model(execute_model_req) @@ -586,7 +586,7 @@ def __del__(self): async def execute_model_async( self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + execute_model_req: ExecuteModelRequest) -> list[SamplerOutput]: if not self.use_ray_spmd_worker: return await super().execute_model_async(execute_model_req) @@ -601,7 +601,7 @@ async def execute_model_async( async def _driver_execute_model_async( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: assert not self.use_ray_spmd_worker, ( "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1") if not self.tp_driver_workers: diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 6067f9a3c13b8..0fd2efdceb142 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,7 @@ import os import time from collections import defaultdict -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Union import msgspec @@ -52,7 +52,7 @@ def __init__(self, *args, **kwargs) -> None: def get_node_ip(self) -> str: return get_ip() - def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: + def get_node_and_gpu_ids(self) -> tuple[str, list[int]]: node_id = ray.get_runtime_context().get_node_id() device_key = vllm.platforms.current_platform.ray_device_key if not device_key: @@ -64,7 +64,7 @@ def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: def execute_model_spmd( self, req_or_tuple: Union[bytes, - Tuple[bytes, + tuple[bytes, Optional[IntermediateTensors]]] ) -> bytes: """Execute model in SPMD fashion: used only when SPMD worker and @@ -115,9 +115,9 @@ def setup_device_if_necessary(self): def execute_model_ray( self, scheduler_output: Union["SchedulerOutput", - Tuple["SchedulerOutput", + tuple["SchedulerOutput", "IntermediateTensors"]], - ) -> Union["ModelRunnerOutput", Tuple["SchedulerOutput", + ) -> Union["ModelRunnerOutput", tuple["SchedulerOutput", "IntermediateTensors"]]: # This method is used by Ray Compiled Graph to execute the model, # and it needs a special logic of self.setup_device_if_necessary() @@ -133,7 +133,7 @@ def execute_model_ray( output = scheduler_output, output return output - def override_env_vars(self, vars: Dict[str, str]): + def override_env_vars(self, vars: dict[str, str]): os.environ.update(vars) ray_import_err = None @@ -171,8 +171,8 @@ def _verify_bundles(placement_group: "PlacementGroup", bundle_to_node_ids = pg_data["bundles_to_node_id"] # bundle_idx -> bundle (e.g., {"GPU": 1}) bundles = pg_data["bundles"] - # node_id -> List of bundle (e.g., {"GPU": 1}) - node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list) + # node_id -> list of bundle (e.g., {"GPU": 1}) + node_id_to_bundle: dict[str, list[dict[str, float]]] = defaultdict(list) for bundle_idx, node_id in bundle_to_node_ids.items(): node_id_to_bundle[node_id].append(bundles[bundle_idx]) @@ -334,7 +334,7 @@ def initialize_ray_cluster( "number of available %ss in the placement group.", device_str, device_str) # Create a new placement group - placement_group_specs: List[Dict[str, float]] = ([{ + placement_group_specs: list[dict[str, float]] = ([{ 
device_str: 1.0 } for _ in range(parallel_config.world_size)]) diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index e041215de6602..87dc3eed2a0e8 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union import torch import torch.distributed as dist @@ -49,8 +49,8 @@ def _init_executor(self) -> None: def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + args: tuple = (), + kwargs: Optional[dict] = None) -> list[Any]: if kwargs is None: kwargs = {} answer = run_method(self.driver_worker, method, args, kwargs) @@ -120,7 +120,7 @@ def _init_executor(self) -> None: self.collective_rpc("init_device") self.collective_rpc("load_model") - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """ Determine the number of available KV blocks. Add an additional all_reduce to get the min across all ranks. diff --git a/vllm/forward_context.py b/vllm/forward_context.py index b91816af1b6d5..c3d20cff426c7 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -4,7 +4,7 @@ from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Optional import torch import torch.distributed as dist @@ -28,13 +28,13 @@ @dataclass class ForwardContext: # copy from vllm_config.compilation_config.static_forward_context - attn_layers: Dict[str, Any] + attn_layers: dict[str, Any] # TODO: extend to support per-layer dynamic forward context attn_metadata: "AttentionMetadata" # set dynamically for each forward pass # TODO: remove after making all virtual_engines share the same kv cache virtual_engine: int # set dynamically for each forward pass num_tokens_across_dp: Optional[ - List[int]] = None # set dynamically for each forward pass + list[int]] = None # set dynamically for each forward pass _forward_context: Optional[ForwardContext] = None diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 2ffebeee392a3..138a8f61107be 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Iterable from dataclasses import dataclass from functools import cached_property -from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal, - Optional, Tuple, Union, cast) +from typing import TYPE_CHECKING, Any, Generic, Literal, Optional, Union, cast import torch from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never @@ -26,7 +26,7 @@ class TextPrompt(TypedDict): if the model supports it. """ - mm_processor_kwargs: NotRequired[Dict[str, Any]] + mm_processor_kwargs: NotRequired[dict[str, Any]] """ Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. 
Note that if multiple modalities @@ -38,10 +38,10 @@ class TextPrompt(TypedDict): class TokensPrompt(TypedDict): """Schema for a tokenized prompt.""" - prompt_token_ids: List[int] + prompt_token_ids: list[int] """A list of token IDs to pass to the model.""" - token_type_ids: NotRequired[List[int]] + token_type_ids: NotRequired[list[int]] """A list of token type IDs to pass to the cross encoder model.""" multi_modal_data: NotRequired["MultiModalDataDict"] @@ -50,7 +50,7 @@ class TokensPrompt(TypedDict): if the model supports it. """ - mm_processor_kwargs: NotRequired[Dict[str, Any]] + mm_processor_kwargs: NotRequired[dict[str, Any]] """ Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities @@ -115,7 +115,7 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): decoder_prompt: Optional[_T2_co] - mm_processor_kwargs: NotRequired[Dict[str, Any]] + mm_processor_kwargs: NotRequired[dict[str, Any]] PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt] @@ -136,10 +136,10 @@ class TokenInputs(TypedDict): type: Literal["token"] """The type of inputs.""" - prompt_token_ids: List[int] + prompt_token_ids: list[int] """The token IDs of the prompt.""" - token_type_ids: NotRequired[List[int]] + token_type_ids: NotRequired[list[int]] """The token type IDs of the prompt.""" prompt: NotRequired[str] @@ -164,12 +164,12 @@ class TokenInputs(TypedDict): Placeholder ranges for the multi-modal data. """ - multi_modal_hashes: NotRequired[List[str]] + multi_modal_hashes: NotRequired[list[str]] """ The hashes of the multi-modal data. """ - mm_processor_kwargs: NotRequired[Dict[str, Any]] + mm_processor_kwargs: NotRequired[dict[str, Any]] """ Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. 
Note that if multiple modalities @@ -179,14 +179,14 @@ class TokenInputs(TypedDict): def token_inputs( - prompt_token_ids: List[int], - token_type_ids: Optional[List[int]] = None, + prompt_token_ids: list[int], + token_type_ids: Optional[list[int]] = None, prompt: Optional[str] = None, multi_modal_data: Optional["MultiModalDataDict"] = None, multi_modal_inputs: Optional["MultiModalKwargs"] = None, - multi_modal_hashes: Optional[List[str]] = None, + multi_modal_hashes: Optional[list[str]] = None, multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ) -> TokenInputs: """Construct :class:`TokenInputs` from optional values.""" inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids) @@ -255,7 +255,7 @@ def prompt(self) -> Optional[str]: assert_never(inputs) # type: ignore[arg-type] @cached_property - def prompt_token_ids(self) -> List[int]: + def prompt_token_ids(self) -> list[int]: inputs = self.inputs if inputs["type"] == "token" or inputs["type"] == "multimodal": @@ -264,7 +264,7 @@ def prompt_token_ids(self) -> List[int]: assert_never(inputs) # type: ignore[arg-type] @cached_property - def token_type_ids(self) -> List[int]: + def token_type_ids(self) -> list[int]: inputs = self.inputs if inputs["type"] == "token" or inputs["type"] == "multimodal": @@ -294,7 +294,7 @@ def multi_modal_data(self) -> "MultiModalDataDict": assert_never(inputs) # type: ignore[arg-type] @cached_property - def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]: + def multi_modal_inputs(self) -> Union[dict, "MultiModalKwargs"]: inputs = self.inputs if inputs["type"] == "token": @@ -306,7 +306,7 @@ def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]: assert_never(inputs) # type: ignore[arg-type] @cached_property - def multi_modal_hashes(self) -> List[str]: + def multi_modal_hashes(self) -> list[str]: inputs = self.inputs if inputs["type"] == "token": @@ -331,7 +331,7 @@ def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": assert_never(inputs) # type: ignore[arg-type] @cached_property - def mm_processor_kwargs(self) -> Dict[str, Any]: + def mm_processor_kwargs(self) -> dict[str, Any]: inputs = self.inputs if inputs["type"] == "token": @@ -355,7 +355,7 @@ def mm_processor_kwargs(self) -> Dict[str, Any]: def build_explicit_enc_dec_prompt( encoder_prompt: _T1, decoder_prompt: Optional[_T2], - mm_processor_kwargs: Optional[Dict[str, Any]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ) -> ExplicitEncoderDecoderPrompt[_T1, _T2]: if mm_processor_kwargs is None: mm_processor_kwargs = {} @@ -368,9 +368,9 @@ def build_explicit_enc_dec_prompt( def zip_enc_dec_prompts( enc_prompts: Iterable[_T1], dec_prompts: Iterable[Optional[_T2]], - mm_processor_kwargs: Optional[Union[Iterable[Dict[str, Any]], - Dict[str, Any]]] = None, -) -> List[ExplicitEncoderDecoderPrompt[_T1, _T2]]: + mm_processor_kwargs: Optional[Union[Iterable[dict[str, Any]], + dict[str, Any]]] = None, +) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]: """ Zip encoder and decoder prompts together into a list of :class:`ExplicitEncoderDecoderPrompt` instances. @@ -380,12 +380,12 @@ def zip_enc_dec_prompts( provided, it will be zipped with the encoder/decoder prompts. 
""" if mm_processor_kwargs is None: - mm_processor_kwargs = cast(Dict[str, Any], {}) + mm_processor_kwargs = cast(dict[str, Any], {}) if isinstance(mm_processor_kwargs, dict): return [ build_explicit_enc_dec_prompt( encoder_prompt, decoder_prompt, - cast(Dict[str, Any], mm_processor_kwargs)) + cast(dict[str, Any], mm_processor_kwargs)) for (encoder_prompt, decoder_prompt) in zip(enc_prompts, dec_prompts) ] @@ -399,7 +399,7 @@ def zip_enc_dec_prompts( def to_enc_dec_tuple_list( enc_dec_prompts: Iterable[ExplicitEncoderDecoderPrompt[_T1, _T2]], -) -> List[Tuple[_T1, Optional[_T2]]]: +) -> list[tuple[_T1, Optional[_T2]]]: return [(enc_dec_prompt["encoder_prompt"], enc_dec_prompt["decoder_prompt"]) for enc_dec_prompt in enc_dec_prompts] diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index 454d9d8303b77..ed1056948d807 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Literal, Sequence, TypedDict, Union, cast, overload +from collections.abc import Sequence +from typing import Literal, TypedDict, Union, cast, overload from typing_extensions import TypeIs @@ -17,24 +18,24 @@ class ParsedText(TypedDict): class ParsedTokens(TypedDict): - content: List[int] + content: list[int] is_tokens: Literal[True] @overload def parse_and_batch_prompt( - prompt: Union[str, List[str]]) -> Sequence[ParsedText]: + prompt: Union[str, list[str]]) -> Sequence[ParsedText]: ... @overload def parse_and_batch_prompt( - prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]: + prompt: Union[list[int], list[list[int]]]) -> Sequence[ParsedTokens]: ... def parse_and_batch_prompt( - prompt: Union[str, List[str], List[int], List[List[int]]], + prompt: Union[str, list[str], list[int], list[list[int]]], ) -> Union[Sequence[ParsedText], Sequence[ParsedTokens]]: if isinstance(prompt, str): # case 1: a string @@ -46,16 +47,16 @@ def parse_and_batch_prompt( if is_list_of(prompt, str): # case 2: array of strings - prompt = cast(List[str], prompt) + prompt = cast(list[str], prompt) return [ ParsedText(content=elem, is_tokens=False) for elem in prompt ] if is_list_of(prompt, int): # case 3: array of tokens - prompt = cast(List[int], prompt) + prompt = cast(list[int], prompt) return [ParsedTokens(content=prompt, is_tokens=True)] if is_list_of(prompt, list): - prompt = cast(List[List[int]], prompt) + prompt = cast(list[list[int]], prompt) if len(prompt[0]) == 0: raise ValueError("please provide at least one prompt") diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index bc5856990da6f..742733d3644a3 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio -from typing import List, Mapping, Optional, Tuple, Union, cast +from collections.abc import Mapping +from typing import Optional, Union, cast from typing_extensions import assert_never @@ -92,7 +93,7 @@ def get_decoder_start_token_id(self) -> Optional[int]: return dec_start_token_id - def _get_default_enc_dec_decoder_prompt(self) -> List[int]: + def _get_default_enc_dec_decoder_prompt(self) -> list[int]: ''' Specifically for encoder/decoder models: generate a default decoder prompt for when @@ -130,8 +131,8 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]: def _prepare_decoder_input_ids_for_generation( self, - decoder_input_ids: Optional[List[int]], - ) -> List[int]: + decoder_input_ids: Optional[list[int]], + ) -> list[int]: """ Prepares `decoder_input_ids` for 
generation with encoder-decoder models. @@ -168,9 +169,9 @@ def _prepare_decoder_input_ids_for_generation( def _apply_prompt_adapter( self, - prompt_token_ids: List[int], + prompt_token_ids: list[int], prompt_adapter_request: Optional[PromptAdapterRequest], - ) -> List[int]: + ) -> list[int]: if prompt_adapter_request: prompt_token_ids = ( [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens @@ -183,7 +184,7 @@ def _tokenize_prompt( prompt: str, request_id: str, lora_request: Optional[LoRARequest], - ) -> List[int]: + ) -> list[int]: """ Apply the model's tokenizer to a text prompt, returning the corresponding token IDs. @@ -211,7 +212,7 @@ async def _tokenize_prompt_async( prompt: str, request_id: str, lora_request: Optional[LoRARequest], - ) -> List[int]: + ) -> list[int]: """Async version of :meth:`_tokenize_prompt`.""" tokenizer = self.get_tokenizer_group() add_special_tokens = None @@ -246,7 +247,7 @@ def _can_process_multimodal(self) -> bool: def _process_multimodal( self, - prompt: Union[str, List[int]], + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], lora_request: Optional[LoRARequest], @@ -274,7 +275,7 @@ def _process_multimodal( async def _process_multimodal_async( self, - prompt: Union[str, List[int]], + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], lora_request: Optional[LoRARequest], @@ -500,7 +501,7 @@ def _separate_enc_dec_inputs_from_mm_processor_outputs( self, inputs: SingletonInputs, decoder_inputs_to_override: Optional[SingletonInputs] = None, - ) -> Tuple[SingletonInputs, SingletonInputs]: + ) -> tuple[SingletonInputs, SingletonInputs]: """ For encoder/decoder models only: Separate Encoder/Decoder inputs from a MultiModalEncDecInputs diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 691fcd7dc53f2..4ec9304345399 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -2,9 +2,10 @@ import functools from collections import UserDict +from collections.abc import Mapping from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Mapping, NamedTuple, - Optional, Protocol, Union) +from typing import (TYPE_CHECKING, Any, Callable, NamedTuple, Optional, + Protocol, Union) from torch import nn from transformers import BatchFeature, PretrainedConfig, ProcessorMixin diff --git a/vllm/logger.py b/vllm/logger.py index 0ee47de173add..2b0b9da2d6f7f 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -109,7 +109,7 @@ def _configure_vllm_root_logger() -> None: custom_config = json.loads(file.read()) if not isinstance(custom_config, dict): - raise ValueError("Invalid logging config. Expected Dict, got %s.", + raise ValueError("Invalid logging config. 
Expected dict, got %s.", type(custom_config).__name__) logging_config = custom_config diff --git a/vllm/logits_process.py b/vllm/logits_process.py index a810be7bc7a85..e3faf20029ec9 100644 --- a/vllm/logits_process.py +++ b/vllm/logits_process.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Tuple, Union +from typing import Callable, Union import torch from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -LogitsProcessor = Union[Callable[[List[int], torch.Tensor], torch.Tensor], - Callable[[List[int], List[int], torch.Tensor], +LogitsProcessor = Union[Callable[[list[int], torch.Tensor], torch.Tensor], + Callable[[list[int], list[int], torch.Tensor], torch.Tensor]] """LogitsProcessor is a function that takes a list of previously generated tokens, the logits tensor @@ -17,9 +17,9 @@ def get_bad_words_logits_processors( - bad_words: List[str], - tokenizer: AnyTokenizer) -> List[LogitsProcessor]: - bad_words_ids: List[List[int]] = list() + bad_words: list[str], + tokenizer: AnyTokenizer) -> list[LogitsProcessor]: + bad_words_ids: list[list[int]] = list() for bad_word in bad_words: # To prohibit words both at the beginning @@ -51,13 +51,13 @@ class NoBadWordsLogitsProcessor: _SMALLEST_LOGIT = float("-inf") _NEUTRAL_LOGIT = 0.0 - def __init__(self, bad_words_ids: List[List[int]]): + def __init__(self, bad_words_ids: list[list[int]]): self.bad_words_ids = bad_words_ids self.word_bias: torch.FloatTensor = None def __call__( self, - past_tokens_ids: Union[List[int], Tuple[int]], + past_tokens_ids: Union[list[int], tuple[int]], logits: torch.FloatTensor, ) -> torch.Tensor: if self.word_bias is None: diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 41e1ec94145db..40a366a876d7e 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # pylint: disable=unused-argument -from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast +from typing import TYPE_CHECKING, Optional, Union, cast import torch import torch.nn as nn @@ -107,7 +107,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: # specifying kwargs so they can be easily accessed in decorator @@ -130,8 +130,8 @@ class MergedColumnParallelLinearWithShardedLoRA( """ def slice_lora_a( - self, lora_a: List[Union[torch.Tensor, None]] - ) -> List[Union[torch.Tensor, None]]: + self, lora_a: list[Union[torch.Tensor, None]] + ) -> list[Union[torch.Tensor, None]]: #NOTE: lora_a contains 2 subloras, and each sublora could be None. 
output_shard_size = self.lora_a_stacked[0].shape[2] output_start_idx = self.tp_rank * output_shard_size @@ -154,7 +154,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: # specifying kwargs so they can be easily accessed in decorator @@ -190,7 +190,7 @@ def apply(self, @classmethod @_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, + lora_config: LoRAConfig, packed_modules_list: list, model_config: Optional[PretrainedConfig]) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( @@ -211,8 +211,8 @@ class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA): """ def slice_lora_a( - self, lora_a: List[Union[torch.Tensor, None]] - ) -> List[Union[torch.Tensor, None]]: + self, lora_a: list[Union[torch.Tensor, None]] + ) -> list[Union[torch.Tensor, None]]: # NOTE: lora_a contains 3 subloras, and each sublora could be None. shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)] start_idx = [self.tp_rank * shard_size[i] for i in range(3)] @@ -237,7 +237,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: # specifying kwargs so they can be easily accessed in decorator @@ -270,7 +270,7 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: if bias is None: return bias - self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], self.lora_bias_stacked) shard_size = self.lora_bias_stacked[0].shape[2] start_idx = self.tp_rank * shard_size @@ -322,7 +322,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: # specifying kwargs so they can be easily accessed in decorator diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 6c48173c201b3..d4cbb3e207e2b 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -3,7 +3,7 @@ # pylint: disable=unused-argument import math from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union, cast +from typing import TYPE_CHECKING, Optional, Union, cast import torch import torch.nn as nn @@ -82,14 +82,14 @@ class LoRAMapping(AdapterMapping): class BaseLayerWithLoRA(nn.Module): def slice_lora_a( - self, lora_a: Union[torch.Tensor, List[Union[torch.Tensor, None]]] - ) -> Union[torch.Tensor, List[Union[torch.Tensor, None]]]: + self, lora_a: Union[torch.Tensor, list[Union[torch.Tensor, None]]] + ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]: """Slice lora a if splitting for tensor parallelism.""" ... def slice_lora_b( - self, lora_b: Union[torch.Tensor, List[Union[torch.Tensor, None]]] - ) -> Union[torch.Tensor, List[Union[torch.Tensor, None]]]: + self, lora_b: Union[torch.Tensor, list[Union[torch.Tensor, None]]] + ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]: """Slice lora b if splitting with tensor parallelism.""" ... 
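The slice_lora_a / slice_lora_b hunks above annotate per-slice inputs as list[Union[torch.Tensor, None]]. The following is a hedged sketch of that per-slice, possibly-None pattern written against plain lists so it runs without torch; the function name, shard sizes, and rank handling are illustrative only.

# Toy version (no torch) of slicing a list whose entries may be None,
# annotated with the built-in generics used in the hunks above.
from typing import Optional, Union


def slice_subloras(parts: list[Union[list[float], None]],
                   shard_sizes: list[int],
                   rank: int) -> list[Optional[list[float]]]:
    out: list[Optional[list[float]]] = []
    for part, size in zip(parts, shard_sizes):
        if part is None:
            # No LoRA weights for this sub-module; keep the placeholder.
            out.append(None)
        else:
            start = rank * size
            out.append(part[start:start + size])
    return out


print(slice_subloras([[1.0, 2.0, 3.0, 4.0], None], [2, 2], rank=1))
# -> [[3.0, 4.0], None]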
@@ -128,7 +128,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" @@ -140,7 +140,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): def __init__(self, base_layer: VocabParallelEmbedding) -> None: super().__init__() self.base_layer = base_layer - self.embeddings_slice: Optional[Tuple[int, int]] + self.embeddings_slice: Optional[tuple[int, int]] self.embeddings_weights: Optional[torch.Tensor] def create_lora_weights( @@ -269,7 +269,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: return type(source_layer) is VocabParallelEmbedding @@ -282,9 +282,9 @@ def __init__(self, base_layer: LinearBase): self.base_layer = base_layer self.input_size = self.base_layer.input_size self.device = _get_lora_device(self.base_layer) - self.lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None + self.lora_bias_stacked: Optional[tuple[torch.Tensor, ...]] = None - self.output_slices: Tuple[int, ...] + self.output_slices: tuple[int, ...] self.tp_size: int self.output_size: int self.n_slices: int @@ -351,7 +351,7 @@ def reset_lora(self, index: int): self.lora_b_stacked[s_index][index] = 0 if self.lora_config.bias_enabled: # Make mypy happy - self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], self.lora_bias_stacked) self.lora_bias_stacked[s_index][index] = 0 @@ -385,7 +385,7 @@ def set_lora( lora_b.T, non_blocking=True) if lora_bias is not None: - self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], self.lora_bias_stacked) assert len(self.lora_bias_stacked) self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( @@ -413,7 +413,7 @@ def __init__(self, base_layer: ReplicatedLinear) -> None: def forward( self, input_: torch.Tensor - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: """Forward of ReplicatedLinearWithLoRA Args: @@ -440,7 +440,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: return type(source_layer) is ReplicatedLinear @@ -506,7 +506,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: def forward( self, input_: torch.Tensor - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: """Forward of ColumnParallelLinear Args: @@ -536,7 +536,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: return type(source_layer) is ColumnParallelLinear or ( @@ -613,13 +613,13 @@ def create_lora_weights( ) for output_size in self.output_slices) def slice_lora_a( - self, lora_a: List[Union[torch.Tensor, None]] - ) -> List[Union[torch.Tensor, None]]: + self, lora_a: list[Union[torch.Tensor, None]] + ) -> list[Union[torch.Tensor, None]]: return lora_a def slice_lora_b( - self, lora_b: List[Union[torch.Tensor, None]] - ) -> List[Union[torch.Tensor, None]]: + self, lora_b: list[Union[torch.Tensor, None]] 
+ ) -> list[Union[torch.Tensor, None]]: for i, (shard_id, shard_size) in enumerate( zip(self.output_ids, self.output_slices)): if (lora_b_i := lora_b[i]) is not None: @@ -628,8 +628,8 @@ def slice_lora_b( return lora_b def slice_bias( - self, bias: List[Union[torch.Tensor, - None]]) -> List[Union[torch.Tensor, None]]: + self, bias: list[Union[torch.Tensor, + None]]) -> list[Union[torch.Tensor, None]]: for i, (shard_id, shard_size) in enumerate( zip(self.output_ids, self.output_slices)): if (bias_i := bias[i]) is not None: @@ -664,7 +664,7 @@ def set_lora( lora_b_i.T, non_blocking=True) if lora_bias is not None: - self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], self.lora_bias_stacked) for i in range(self.n_slices): if (lora_bias_i := lora_bias[i]) is not None: @@ -679,7 +679,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: return (type(source_layer) is MergedColumnParallelLinear @@ -748,7 +748,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: @classmethod @_not_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, + lora_config: LoRAConfig, packed_modules_list: list, model_config: Optional[PretrainedConfig]) -> bool: return type(source_layer) is QKVParallelLinear and len( packed_modules_list) == 1 @@ -808,7 +808,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: return (type(source_layer) is QKVParallelLinear @@ -845,7 +845,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: def forward( self, input_: torch.Tensor - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: """Forward of RowParallelLinear Args: @@ -893,7 +893,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: return type(source_layer) is RowParallelLinear @@ -916,7 +916,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): def __init__(self, base_layer: LogitsProcessor, hidden_size: int, dtype: torch.dtype, device: torch.device, - sharded_to_full_mapping: Optional[List[int]]) -> None: + sharded_to_full_mapping: Optional[list[int]]) -> None: super().__init__() self.base_layer = base_layer self.hidden_size = hidden_size @@ -1113,7 +1113,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: # Special handling for the LogitsProcessor. 
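Several hunks above rewrite typing.cast targets such as cast(Tuple[torch.Tensor, ...], ...) to cast(tuple[torch.Tensor, ...], ...). A quick illustration (toy values, not vLLM code) that cast accepts the built-in generic forms and is a runtime no-op, so the swap changes nothing about behavior:

# cast() only informs the type checker; the built-in generic forms used in
# the hunks above are accepted and the value is returned unchanged.
from typing import cast

bias_stacked = cast(tuple[list[float], ...], ([0.0, 1.0], [2.0]))
scaling_factor_to_offset = cast(dict[float, int], {8.0: 4096})

assert bias_stacked[1] == [2.0]
assert scaling_factor_to_offset[8.0] == 4096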
@@ -1180,7 +1180,7 @@ def forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: return self.base_layer( positions, query, @@ -1189,7 +1189,7 @@ def forward( ) @property - def scaling_factor_to_offset(self) -> Dict[float, int]: + def scaling_factor_to_offset(self) -> dict[float, int]: return self.base_layer.scaling_factor_to_offset @classmethod @@ -1197,7 +1197,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig], ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index 00299bf6c2a81..294b49e0a8997 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional -from typing import Sequence as GenericSequence +from collections.abc import Sequence as GenericSequence +from typing import Optional import torch import torch.types @@ -125,11 +125,11 @@ def __init__( self, module_name: str, rank: int, - lora_alphas: List[Optional[int]], - lora_a: List[Optional[torch.Tensor]], - lora_b: List[Optional[torch.Tensor]], - bias: Optional[List[Optional[torch.Tensor]]] = None, - scaling: Optional[List[float]] = None, + lora_alphas: list[Optional[int]], + lora_a: list[Optional[torch.Tensor]], + lora_b: list[Optional[torch.Tensor]], + bias: Optional[list[Optional[torch.Tensor]]] = None, + scaling: Optional[list[float]] = None, ) -> None: super().__init__( module_name=module_name, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e1294884ac2af..cbd303deb58c3 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,9 +4,9 @@ import math import os import re +from collections.abc import Sequence from dataclasses import dataclass, field -from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Type, - Union) +from typing import Any, Callable, Optional, Union import safetensors.torch import torch @@ -43,12 +43,12 @@ class LongContextLoRAContext: """Context for lora adapters that support long context.""" # The scaling factors to support long context lora fine tuned models. - scaling_factors: List[float] + scaling_factors: list[float] # dimension to apply rotary embedding. rot_dim: int # offsets to the sin_cos_cache for each lora_id loaded. # This value is dynamically modified. - offsets_by_lora_id: Dict[int, int] = field(default_factory=dict) + offsets_by_lora_id: dict[int, int] = field(default_factory=dict) def get_lora_id(): @@ -64,7 +64,7 @@ def __init__( self, lora_model_id: int, rank: int, - loras: Dict[str, LoRALayerWeights], + loras: dict[str, LoRALayerWeights], scaling_factor: Optional[float] = None, ) -> None: """ @@ -83,7 +83,7 @@ def __init__( lora_model_id > 0), f"a valid lora id should be greater than 0, got {self.id}" self.rank = rank - self.loras: Dict[str, LoRALayerWeights] = loras + self.loras: dict[str, LoRALayerWeights] = loras def clone(self, lora_model_id: int) -> "LoRAModel": """Return a copy of the object with different ids. 
@@ -109,19 +109,19 @@ def get_lora(self, module_name: str) -> Optional[LoRALayerWeights]: def from_lora_tensors( cls, lora_model_id: int, - tensors: Dict[str, torch.Tensor], + tensors: dict[str, torch.Tensor], peft_helper: PEFTHelper, device: str = "cuda", dtype: Optional[torch.dtype] = None, - embeddings: Optional[Dict[str, torch.Tensor]] = None, + embeddings: Optional[dict[str, torch.Tensor]] = None, target_embedding_padding: Optional[int] = None, - embedding_modules: Optional[Dict[str, str]] = None, - embedding_padding_modules: Optional[List[str]] = None, + embedding_modules: Optional[dict[str, str]] = None, + embedding_padding_modules: Optional[list[str]] = None, weights_mapper: Optional[WeightsMapper] = None, ) -> "LoRAModel": """Create a LoRAModel from a dictionary of tensors.""" pin_memory = str(device) == "cpu" and is_pin_memory_available() - loras: Dict[str, LoRALayerWeights] = {} + loras: dict[str, LoRALayerWeights] = {} for tensor_name, tensor in tensors.items(): module_name, is_lora_a, is_bias = parse_fine_tuned_lora_name( tensor_name, weights_mapper) @@ -183,15 +183,15 @@ def from_lora_tensors( def from_local_checkpoint( cls, lora_dir: str, - expected_lora_modules: List[str], + expected_lora_modules: list[str], peft_helper: PEFTHelper, *, lora_model_id: Optional[int] = None, device: str = "cuda", dtype: Optional[torch.dtype] = None, target_embedding_padding: Optional[int] = None, - embedding_modules: Optional[Dict[str, str]] = None, - embedding_padding_modules: Optional[List[str]] = None, + embedding_modules: Optional[dict[str, str]] = None, + embedding_padding_modules: Optional[list[str]] = None, weights_mapper: Optional[WeightsMapper] = None, ) -> "LoRAModel": """Create a LoRAModel from a local checkpoint. @@ -216,9 +216,9 @@ def from_local_checkpoint( new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") - unexpected_modules: List[Union[list[str], str]] + unexpected_modules: list[Union[list[str], str]] if os.path.isfile(lora_tensor_path): - tensors: Dict[str, torch.Tensor] = {} + tensors: dict[str, torch.Tensor] = {} # Find unexpected modules. # Use safetensor key as a source of truth to find expected modules. # in peft if you have target_modules A, B, C and C does not exist @@ -323,7 +323,7 @@ def __init__( self.max_num_seqs = max_num_seqs assert self.capacity >= self.lora_slots self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 - self.lora_index_to_id: List[Optional[int]] = [None] * self.lora_slots + self.lora_index_to_id: list[Optional[int]] = [None] * self.lora_slots self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None self.punica_wrapper = get_punica_wrapper(max_num_batched_tokens, @@ -331,7 +331,7 @@ def __init__( device=self.device) # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. - self.scaling_factor_to_offset: Dict[float, int] = {} + self.scaling_factor_to_offset: dict[float, int] = {} super().__init__(model) self.supported_lora_modules = get_supported_lora_modules(self.model) assert self.supported_lora_modules, "No supported LoRA modules found in" @@ -348,9 +348,9 @@ def __init__( # In case the model only supports LoRA for # text modules (e.g. ChatGLM) and hasattr(self.model, "get_mm_mapping")) - self.packed_modules: Dict[str, List[str]] = {} - self.modules: Dict[str, BaseLayerWithLoRA] = {} - # Dict instead of a Set for compatibility with LRUCache. 
+ self.packed_modules: dict[str, list[str]] = {} + self.modules: dict[str, BaseLayerWithLoRA] = {} + # dict instead of a Set for compatibility with LRUCache. self._last_mapping: Optional[LoRAMapping] = None self._create_lora_modules() self.model.lora_manager = self @@ -520,7 +520,7 @@ def create_dummy_lora( lora_id: int, rank: int, scaling_factor: Optional[float], - embedding_modules: Optional[Dict[str, str]] = None) -> LoRAModel: + embedding_modules: Optional[dict[str, str]] = None) -> LoRAModel: """Create zero-initialized LoRAModel for warmup.""" model = LoRAModel(lora_id, rank, {}, scaling_factor) for module_name, module in self.model.named_modules(): @@ -568,7 +568,7 @@ def create_dummy_lora( else: parts = module_name.split(".") replacements = self.packed_modules_mapping[parts[-1]] - subloras: List[Optional[LoRALayerWeights]] = [] + subloras: list[Optional[LoRALayerWeights]] = [] for i, r in enumerate(replacements): lora = LoRALayerWeights.create_dummy_lora_weights( module_name + "." + r, @@ -620,8 +620,8 @@ def _register_packed_modules(self, module_full_name: str) -> None: def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: for module_name, new_module_names in self.packed_modules.items(): - replacement_loras: List[Optional[LoRALayerWeights]] = [] - replaced_module: Set[str] = set() + replacement_loras: list[Optional[LoRALayerWeights]] = [] + replaced_module: set[str] = set() has_replacement = False for r in new_module_names: lora = lora_model.get_lora(r) @@ -662,7 +662,7 @@ def remove_adapter(self, adapter_id: int) -> bool: return remove_adapter(adapter_id, self._registered_adapters, self.deactivate_adapter) - def list_adapters(self) -> Dict[int, Any]: + def list_adapters(self) -> dict[int, Any]: return list_adapters(self._registered_adapters) def get_adapter(self, adapter_id: int) -> Optional[Any]: @@ -689,7 +689,7 @@ def __init__(self, model: nn.Module, max_num_seqs: int, self._active_adapters: LoRALRUCache = LoRALRUCache( self.lora_slots, self._deactivate_adapter) - def list_adapters(self) -> Dict[int, LoRAModel]: + def list_adapters(self) -> dict[int, LoRAModel]: """List all registered LoRAModels.""" return dict(self._registered_adapters.cache) @@ -754,7 +754,7 @@ def create_lora_manager( vocab_size: int, lora_config: LoRAConfig, device: torch.device, - lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager, + lora_manager_cls: type[LoRAModelManager] = LoRAModelManager, **kwargs) -> LoRAModelManager: """Create a LoRA adapter for a given model.""" if not hasattr(model, "packed_modules_mapping"): diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py index 6aa3eafaba4c0..ae4afa759bcea 100644 --- a/vllm/lora/ops/triton_ops/sgmv_expand.py +++ b/vllm/lora/ops/triton_ops/sgmv_expand.py @@ -6,8 +6,6 @@ https://arxiv.org/abs/2310.18547 """ -from typing import List - import torch import triton import triton.language as tl @@ -119,7 +117,7 @@ def _sgmv_expand_kernel( @torch.inference_mode() def _sgmv_expand( inputs: torch.Tensor, - lora_b_weights: List[torch.Tensor], + lora_b_weights: list[torch.Tensor], output_tensor: torch.Tensor, b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, @@ -133,7 +131,7 @@ def _sgmv_expand( """ Args: inputs (torch.Tensor): input tensor - lora_b_weights (List[torch.Tensor]): lora'b weight + lora_b_weights (list[torch.Tensor]): lora'b weight output_tensor (torch.Tensor): output tensor b_seq_start_loc (torch.Tensor): (batch_size,). 
The cumulative sequence lengths of the sequences in the batch, used to index @@ -222,7 +220,7 @@ def _sgmv_expand( def _sgmv_expand_fake( inputs: torch.Tensor, - lora_b_weights: List[torch.Tensor], + lora_b_weights: list[torch.Tensor], output_tensor: torch.Tensor, b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py index b8ed0b020f9ac..04f5beffc7810 100644 --- a/vllm/lora/ops/triton_ops/sgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py @@ -6,8 +6,6 @@ https://arxiv.org/abs/2310.18547 """ -from typing import List - import torch import triton import triton.language as tl @@ -112,7 +110,7 @@ def _sgmv_shrink_kernel( @torch.inference_mode() def _sgmv_shrink( inputs: torch.Tensor, - lora_a_weights: List[torch.Tensor], + lora_a_weights: list[torch.Tensor], output_tensor: torch.Tensor, b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, @@ -125,7 +123,7 @@ def _sgmv_shrink( """ Args: inputs (torch.Tensor): input tensor - lora_a_weights (List[torch.Tensor]): lora'a weight + lora_a_weights (list[torch.Tensor]): lora'a weight output_tensor (torch.Tensor): output tensor b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative sequence lengths of the sequences in the batch, used to index @@ -198,7 +196,7 @@ def _sgmv_shrink( def sgmv_shrink_fake( inputs: torch.Tensor, - lora_a_weights: List[torch.Tensor], + lora_a_weights: list[torch.Tensor], output_tensor: torch.Tensor, b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py index 78409b91a14e8..1f52f1bc8dd5f 100644 --- a/vllm/lora/ops/triton_ops/utils.py +++ b/vllm/lora/ops/triton_ops/utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import functools -from typing import Dict, List, Tuple import torch @@ -36,7 +35,7 @@ def _get_default_config(op_type: str, batch: int, hidden_size: int): def get_lora_op_configs(op_type: str, batch: int, - hidden_size: int) -> Dict[str, int]: + hidden_size: int) -> dict[str, int]: """Inspired by `fused_moe_kernel` The return value will be a dictionary mapping an irregular grid of batch sizes and hidden_size to configurations of the bgmv-related kernel. @@ -50,11 +49,11 @@ def get_lora_op_configs(op_type: str, batch: int, return config -_LORA_A_PTR_DICT: Dict[Tuple[int, ...], Tuple[torch.tensor, ...]] = {} -_LORA_B_PTR_DICT: Dict[Tuple[int, ...], Tuple[torch.tensor, ...]] = {} +_LORA_A_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {} +_LORA_B_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {} -def _get_lora_a_ptr(lora_a_weights: List[torch.Tensor], device: str): +def _get_lora_a_ptr(lora_a_weights: list[torch.Tensor], device: str): """ `_LORA_A_PTR_DICT` collects the required information during `profile_run`, After this, it remains constant and subsequent usage is through LUT. 
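The triton_ops/utils.py hunks above keep module-level lookup tables such as _LORA_A_PTR_DICT, now annotated as dict[tuple[int, ...], ...]. A hedged sketch (toy cache and placeholder heuristic, not the real kernel LUT) of that compute-once, look-up-later pattern with the new annotation style:

# Module-level cache keyed by a tuple of ints, annotated with built-ins.
_CONFIG_CACHE: dict[tuple[int, ...], dict[str, int]] = {}


def get_toy_config(batch: int, hidden_size: int) -> dict[str, int]:
    key = (batch, hidden_size)
    if key not in _CONFIG_CACHE:
        # Placeholder heuristic; real kernel configs come from tuning.
        _CONFIG_CACHE[key] = {"block_m": min(batch, 32), "block_k": 256}
    return _CONFIG_CACHE[key]


# Subsequent calls with the same key hit the cached entry.
assert get_toy_config(4, 4096) is get_toy_config(4, 4096)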
@@ -99,7 +98,7 @@ def _get_lora_a_ptr(lora_a_weights: List[torch.Tensor], device: str): return _LORA_A_PTR_DICT.get(key) -def _get_lora_b_ptr(lora_weights: List[torch.Tensor], offset_start: int, +def _get_lora_b_ptr(lora_weights: list[torch.Tensor], offset_start: int, device: str): """ `_LORA_B_PTR_DICT` collects the required information during `profile_run`, diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index f6944368b36ee..d5de63f5baade 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -6,7 +6,7 @@ import math import os from dataclasses import MISSING, dataclass, field, fields -from typing import List, Literal, Optional, Union +from typing import Literal, Optional, Union from vllm.config import LoRAConfig from vllm.logger import init_logger @@ -40,7 +40,7 @@ class PEFTHelper: vllm_max_position_embeddings: Optional[int] = field(default=False) vllm_long_context_scaling_factor: Optional[float] = field(default=None) - def _validate_features(self) -> List[str]: + def _validate_features(self) -> list[str]: """ Check if there are any unsupported LoRA features. """ diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index 94fa3f27ab604..38d1ce6584d2b 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -7,7 +7,7 @@ """ from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Union import torch @@ -28,7 +28,7 @@ class PunicaWrapperABC(ABC): def update_metadata( self, mapping: "LoRAMapping", - lora_index_to_id: List[Optional[int]], + lora_index_to_id: list[Optional[int]], max_loras: int, vocab_size: int, extra_vocab_size: int, @@ -43,9 +43,9 @@ def update_metadata( @abstractmethod def add_shrink( self, - y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + y: Union[tuple[torch.Tensor, ...], torch.Tensor], x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], + lora_a_stacked: tuple[torch.Tensor, ...], scale: float, **kwargs, ) -> None: @@ -59,10 +59,10 @@ def add_shrink( def add_expand( self, y: torch.Tensor, - x: Union[Tuple[torch.Tensor, ...], torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], - output_slices: Tuple[int, ...], + x: Union[tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], + output_slices: tuple[int, ...], offset_start: int = 0, add_inputs=True, **kwargs, @@ -91,13 +91,13 @@ def add_lora_embedding( def add_lora_linear(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - lora_b_stacked: Tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], scale: float, - output_slices: Tuple[int, ...], + output_slices: tuple[int, ...], *, - buffer: Optional[Tuple[torch.Tensor, ...]] = None, + buffer: Optional[tuple[torch.Tensor, ...]] = None, **kwargs) -> None: """ Applicable to linear-related lora. @@ -150,7 +150,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, # 5 is the number of indices tensors. 
# base_indices, sampler_indices, sampler_indices_padded, # embeddings_indices,long_lora_indices - self.indices_len: List[Optional[int]] = [None] * 5 + self.indices_len: list[Optional[int]] = [None] * 5 # these attributes are the information required for sgmv kernel self._seq_start_locs = torch.empty(max_batches, dtype=torch.long, @@ -171,7 +171,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, def _update_base_metadata( self, mapping: "LoRAMapping", - lora_index_to_id: List[Optional[int]], + lora_index_to_id: list[Optional[int]], max_loras: int, vocab_size: int, extra_vocab_size: int, @@ -227,8 +227,8 @@ def _apply_bias( self, indices: torch.Tensor, output: torch.Tensor, - output_slices: Tuple[int, ...], - lora_bias_stacked: Tuple[Optional[torch.Tensor], ...], + output_slices: tuple[int, ...], + lora_bias_stacked: tuple[Optional[torch.Tensor], ...], ): """Applies bias to output @@ -258,7 +258,7 @@ def _apply_bias( @property def prefill_metadata( self - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int]: """ This property provides a convenient way to access the necessary metadata for prefill-related kernel computations. @@ -322,7 +322,7 @@ def long_lora_indices(self) -> torch.Tensor: def update_metadata( self, mapping: "LoRAMapping", - lora_index_to_id: List[Optional[int]], + lora_index_to_id: list[Optional[int]], max_loras: int, vocab_size: int, extra_vocab_size: int, @@ -340,8 +340,8 @@ def update_metadata( self.is_prefill = False @abstractmethod - def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], - x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], + def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, lora_a_stacked: tuple[torch.Tensor, ...], scale: float, **kwargs) -> None: """ Performs GEMM for multiple slices of lora_a. @@ -351,9 +351,9 @@ def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], y[i] += (x @ lora_a_stacked[i]) * scale Args: - y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors x (torch.Tensor): Input tensor - lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights + lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights scale (float): Scaling factor for the operation """ @@ -363,10 +363,10 @@ def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], @abstractmethod def add_expand(self, y: torch.Tensor, - x: Union[Tuple[torch.Tensor, ...], torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], - output_slices: Tuple[int, ...], + x: Union[tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], + output_slices: tuple[int, ...], offset_start: int = 0, add_inputs=True, **kwargs) -> None: @@ -383,11 +383,11 @@ def add_expand(self, Args: y (torch.Tensor): Output tensor. 
- x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors - lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight - lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): + x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight + lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): bias's weight - output_slices (Tuple[int, ...]): Every slice's size + output_slices (tuple[int, ...]): Every slice's size offset_start (int): The starting position of y, defaults to 0 add_inputs (bool): Defaults to True. @@ -421,13 +421,13 @@ def add_lora_embedding(self, def add_lora_linear(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - lora_b_stacked: Tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], scale: float, - output_slices: Tuple[int, ...], + output_slices: tuple[int, ...], *, - buffer: Optional[Tuple[torch.Tensor, ...]] = None, + buffer: Optional[tuple[torch.Tensor, ...]] = None, **kwargs) -> None: """ Applicable to linear-related lora. @@ -444,12 +444,12 @@ def add_lora_linear(self, Args: y (torch.Tensor): Output tensor. Will be changed in-place. x (torch.Tensor): Input tensor - lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. - lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. - lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. + lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight. + lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): lora's bias. scale (float): Scaling factor. - output_slices (Tuple[int, ...]): Every slice's size. - buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. + output_slices (tuple[int, ...]): Every slice's size. + buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None. """ # TODO: implement it based on torch ops raise NotImplementedError diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py index 29428f4cfff31..8118a72d696a2 100644 --- a/vllm/lora/punica_wrapper/punica_cpu.py +++ b/vllm/lora/punica_wrapper/punica_cpu.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Optional, Union import torch @@ -150,8 +150,8 @@ def _apply_shrink(self, y: torch.Tensor, x: torch.Tensor, shrink_fun(y, x, w_t_all, scale) y = y.view_as(y_org) - def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], - x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], + def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, lora_a_stacked: tuple[torch.Tensor, ...], scale: float, **kwargs): """ Performs GEMM for multiple slices of lora_a. 
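Reviewer sketch, not part of the patch: a plain-PyTorch rendering of the add_shrink semantics quoted in the docstrings above, y[i] += (x @ lora_a_stacked[i]) * scale. The shapes and names below are invented for illustration; the real wrappers dispatch to the sgmv/bgmv kernels instead of looping in Python.

import torch

def reference_add_shrink(y: tuple[torch.Tensor, ...], x: torch.Tensor,
                         lora_a_stacked: tuple[torch.Tensor, ...],
                         scale: float) -> None:
    # Naive per-slice loop; add_ with alpha is the in-place form of
    # y[i] += (x @ lora_a_stacked[i]) * scale.
    for i in range(len(lora_a_stacked)):
        y[i].add_(x @ lora_a_stacked[i], alpha=scale)

x = torch.randn(4, 16)                                     # [num_tokens, hidden_size]
lora_a_stacked = (torch.randn(16, 8), torch.randn(16, 8))  # two slices, rank 8
y = (torch.zeros(4, 8), torch.zeros(4, 8))
reference_add_shrink(y, x, lora_a_stacked, scale=1.0)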
@@ -165,9 +165,9 @@ def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], y[i] += (x @ lora_a_stacked[i]) * scale Args: - y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors x (torch.Tensor): Input tensor - lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights + lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights scale (float): Scaling factor for the operation """ @@ -179,10 +179,10 @@ def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], def add_expand(self, y: torch.Tensor, - x: Union[Tuple[torch.Tensor, ...], torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], - output_slices: Tuple[int, ...], + x: Union[tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], + output_slices: tuple[int, ...], offset_start: int = 0, add_inputs=True, **kwargs) -> None: @@ -198,11 +198,11 @@ def add_expand(self, Args: y (torch.Tensor): Output tensor. - x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors - lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight - lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): + x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight + lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): bias's weight - output_slices (Tuple[int, ...]): Every slice's size + output_slices (tuple[int, ...]): Every slice's size add_inputs (bool): Defaults to True. """ y_org = y @@ -250,13 +250,13 @@ def add_lora_embedding(self, def add_lora_linear(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - lora_b_stacked: Tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], scale: float, - output_slices: Tuple[int, ...], + output_slices: tuple[int, ...], *, - buffer: Optional[Tuple[torch.Tensor, ...]] = None, + buffer: Optional[tuple[torch.Tensor, ...]] = None, **kwargs) -> None: """ Applicable to linear-related lora. @@ -273,12 +273,12 @@ def add_lora_linear(self, Args: y (torch.Tensor): Output tensor. Will be changed in-place. x (torch.Tensor): Input tensor - lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. - lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. - lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. + lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight. + lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): lora's bias. scale (float): Scaling factor. - output_slices (Tuple[int, ...]): Every slice's size. - buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. + output_slices (tuple[int, ...]): Every slice's size. + buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None. 
""" assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 9ccd9c36a073e..9a54243a070ea 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -6,7 +6,7 @@ https://arxiv.org/abs/2310.18547 """ -from typing import Optional, Tuple, Union, final +from typing import Optional, Union, final import torch @@ -39,7 +39,7 @@ def _apply_shrink_prefill( self, y: torch.Tensor, x: torch.Tensor, - w_t_all: Tuple[torch.Tensor, ...], + w_t_all: tuple[torch.Tensor, ...], scale: float, ): #No LoRA request, so return directly @@ -95,8 +95,8 @@ def _apply_expand_decode( bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_inputs) - def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], - x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], + def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, lora_a_stacked: tuple[torch.Tensor, ...], scale: float, **kwargs): """ Performs GEMM for multiple slices of lora_a. @@ -110,9 +110,9 @@ def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], y[i] += (x @ lora_a_stacked[i]) * scale Args: - y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors x (torch.Tensor): Input tensor - lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights + lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights scale (float): Scaling factor for the operation """ @@ -129,10 +129,10 @@ def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], def add_expand(self, y: torch.Tensor, - x: Union[Tuple[torch.Tensor, ...], torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], - output_slices: Tuple[int, ...], + x: Union[tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], + output_slices: tuple[int, ...], offset_start: int = 0, add_inputs=True, **kwargs) -> None: @@ -148,11 +148,11 @@ def add_expand(self, Args: y (torch.Tensor): Output tensor. - x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors - lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight - lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): + x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight + lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): bias's weight - output_slices (Tuple[int, ...]): Every slice's size + output_slices (tuple[int, ...]): Every slice's size add_inputs (bool): Defaults to True. """ y_org = y @@ -216,13 +216,13 @@ def add_lora_embedding(self, def add_lora_linear(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - lora_b_stacked: Tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], scale: float, - output_slices: Tuple[int, ...], + output_slices: tuple[int, ...], *, - buffer: Optional[Tuple[torch.Tensor, ...]] = None, + buffer: Optional[tuple[torch.Tensor, ...]] = None, **kwargs) -> None: """ Applicable to linear-related lora. 
@@ -239,12 +239,12 @@ def add_lora_linear(self, Args: y (torch.Tensor): Output tensor. Will be changed in-place. x (torch.Tensor): Input tensor - lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. - lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. - lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. + lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight. + lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): lora's bias. scale (float): Scaling factor. - output_slices (Tuple[int, ...]): Every slice's size. - buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. + output_slices (tuple[int, ...]): Every slice's size. + buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None. """ assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py index 3661a7214648a..416c23e73bf85 100644 --- a/vllm/lora/punica_wrapper/punica_hpu.py +++ b/vllm/lora/punica_wrapper/punica_hpu.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import TYPE_CHECKING, List, Optional, Tuple, Union, final +from typing import TYPE_CHECKING, Optional, Union, final import torch from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, @@ -28,7 +28,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, def _update_base_metadata( self, mapping: "LoRAMapping", - lora_index_to_id: List[Optional[int]], + lora_index_to_id: list[Optional[int]], max_loras: int, vocab_size: int, extra_vocab_size: int, @@ -48,9 +48,9 @@ def _update_base_metadata( # graph accumulation. Hence HPU appends `lora_offset` to a list and # converts it to a tensor only after it is ready. 
if long_lora_context: - index_mapping_indices: List[int] = list( + index_mapping_indices: list[int] = list( mapping.index_mapping).copy() - long_lora_offsets: List[int] = [] + long_lora_offsets: list[int] = [] for i in range(len(index_mapping_indices)): lora_offset: int = long_lora_context.offsets_by_lora_id.get( index_mapping_indices[i], 0) @@ -85,13 +85,13 @@ def add_lora_embedding(self, def add_lora_linear(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - lora_b_stacked: Tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], scale: float, - output_slices: Tuple[int, ...], + output_slices: tuple[int, ...], *, - buffer: Optional[Tuple[torch.Tensor, ...]] = None, + buffer: Optional[tuple[torch.Tensor, ...]] = None, **kwargs) -> None: y_org = y x = x.view(-1, x.shape[-1]) @@ -122,9 +122,9 @@ def add_lora_logits(self, def add_shrink( self, - y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + y: Union[tuple[torch.Tensor, ...], torch.Tensor], x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], + lora_a_stacked: tuple[torch.Tensor, ...], scale: float, **kwargs, ) -> None: @@ -133,10 +133,10 @@ def add_shrink( def add_expand( self, y: torch.Tensor, - x: Union[Tuple[torch.Tensor, ...], torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], - output_slices: Tuple[int, ...], + x: Union[tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], + output_slices: tuple[int, ...], offset_start: int = 0, add_inputs=True, **kwargs, diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py index dbc2d27c597f2..c37d2b2bddcb2 100644 --- a/vllm/lora/punica_wrapper/utils.py +++ b/vllm/lora/punica_wrapper/utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import TYPE_CHECKING, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Union import torch @@ -12,7 +12,7 @@ def compute_meta( token_lora_tensor: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, bool]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, bool]: """ Get the information required for the sgmv kernel. With the features: 1. If consecutive requests in the batch use the same LoRA, this function @@ -43,19 +43,19 @@ def compute_meta( # TODO see if this can be vectorized def convert_mapping( mapping: "LoRAMapping", - lora_index_to_id: List[Optional[int]], + lora_index_to_id: list[Optional[int]], max_loras: int, vocab_size: int, extra_vocab_size: int, device: torch.device, long_lora_context: Optional["LongContextLoRAContext"] = None, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor], List[int]]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, + Optional[torch.Tensor], list[int]]: """Converts LoRAMapping to index tensors. Args: mapping: LoRAMapping mapping rows in a batch to LoRA ids. - lora_index_to_id: List mapping LoRA ids to LoRA indices. + lora_index_to_id: list mapping LoRA ids to LoRA indices. max_loras: Maximum number of LoRAs. vocab_size: Model vocab size. extra_vocab_size: Extra vocab size each LoRA can have. 
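Reviewer sketch with toy data, not the vLLM objects: the prompt_mapping comprehension in the convert_mapping hunks below turns LoRA ids into adapter-slot indices via lora_index_to_id.index(...), with -1 meaning "no LoRA". A stand-alone version of that lookup, using the new builtin-generic annotations:

from typing import Optional

lora_index_to_id: list[Optional[int]] = [None, 7, 3]  # adapter slot -> active LoRA id
prompt_mapping: list[int] = [7, 0, 3]                 # per-prompt LoRA id, 0 = no LoRA

slot_indices: list[int] = [
    lora_index_to_id.index(x) if x > 0 else -1 for x in prompt_mapping
]
assert slot_indices == [1, -1, 2]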
@@ -80,11 +80,11 @@ def convert_mapping( long_lora_indices: Tensor of shape [batch_size] mapping requests to RoPE offsets and rot dims for long LoRAs. None if long context lora doesn't exist. - indices_len: List of lengths of the above tensors. It contains + indices_len: list of lengths of the above tensors. It contains (base_indices, sampler_indices, sampler_indices_padded, embeddings_indices, long_lora_indices). """ - index_mapping_indices: List[int] = list(mapping.index_mapping).copy() + index_mapping_indices: list[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None @@ -92,7 +92,7 @@ def convert_mapping( long_lora_offsets = torch.zeros(len(index_mapping_indices), device=device, dtype=torch.long) - prompt_mapping: List[int] = [ + prompt_mapping: list[int] = [ lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping ] @@ -109,7 +109,7 @@ def convert_mapping( index_mapping_indices[i], 0) long_lora_offsets[i] = lora_offset - indices_list: List[Union[List[int], torch.Tensor]] = [ + indices_list: list[Union[list[int], torch.Tensor]] = [ index_mapping_indices, lora_indices, embedding_indices, diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 63b465fdf7432..dee1c94f3527a 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -2,7 +2,7 @@ import os import re -from typing import List, Optional, Set, Tuple, Type, Union +from typing import Optional, Union import huggingface_hub from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, @@ -37,7 +37,7 @@ logger = init_logger(__name__) -_all_lora_classes: Set[Type[BaseLayerWithLoRA]] = { +_all_lora_classes: set[type[BaseLayerWithLoRA]] = { VocabParallelEmbeddingWithLoRA, ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, @@ -58,7 +58,7 @@ def from_layer(layer: nn.Module, max_loras: int, lora_config: LoRAConfig, - packed_modules_list: List, + packed_modules_list: list, model_config: Optional[PretrainedConfig] = None) -> nn.Module: for lora_cls in _all_lora_classes: # specifying kwargs so they can be easily accessed in decorator @@ -106,7 +106,7 @@ def replace_submodule(model: nn.Module, module_name: str, def parse_fine_tuned_lora_name( name: str, weights_mapper: Optional[WeightsMapper] = None -) -> Tuple[str, bool, bool]: +) -> tuple[str, bool, bool]: """Parse the name of lora weights. args: @@ -115,7 +115,7 @@ def parse_fine_tuned_lora_name( weights_mapper: maps the name of weight, e.g. `model.` -> `language_model.model.`, return: - Tuple(module_name, is_lora_a): + tuple(module_name, is_lora_a): module_name: the name of the module, e.g. model.dense1, is_lora_a whether the tensor is lora_a or lora_b. is_bias whether the tensor is lora bias. @@ -147,8 +147,8 @@ def parse_fine_tuned_lora_name( raise ValueError(f"{name} is unsupported LoRA weight") -def is_regex_target_modules(load_modules: Union[str, List[str]], - expected_lora_modules: List[str]) -> bool: +def is_regex_target_modules(load_modules: Union[str, list[str]], + expected_lora_modules: list[str]) -> bool: """ PEFT supports passing `target_modules` in the form of regular expressions, such as `model.*(q_proj|k_proj|v_proj)$`. This function is mainly used to @@ -179,11 +179,11 @@ def is_subset(sub_list, full_list): return False -def get_supported_lora_modules(model: nn.Module) -> List[str]: +def get_supported_lora_modules(model: nn.Module) -> list[str]: """ In vLLM, all linear layers support LoRA. 
""" - supported_lora_modules: Set[str] = set() + supported_lora_modules: set[str] = set() # step1: traverse the model to get all the linear subfixes. for name, module in model.named_modules(): if isinstance(module, (LinearBase, )): diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 108beb34b244a..8e5bc61066593 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from contextlib import contextmanager -from typing import Any, Dict, List, Literal, Optional, Set, Type, Union +from typing import Any, Literal, Optional, Union import torch @@ -27,7 +27,7 @@ class WorkerLoRAManager(AbstractWorkerManager): Every request, the requested LoRAs will be loaded (unless they are already loaded), and every other LoRA will be unloaded.""" - _manager_cls: Type[LoRAModelManager] = LoRAModelManager + _manager_cls: type[LoRAModelManager] = LoRAModelManager def __init__( self, @@ -36,9 +36,9 @@ def __init__( vocab_size: int, lora_config: LoRAConfig, device: torch.device, - embedding_modules: Dict[str, str], - embedding_padding_modules: List[str], - lora_model_cls: Type[LoRAModel] = LoRAModel, + embedding_modules: dict[str, str], + embedding_padding_modules: list[str], + lora_model_cls: type[LoRAModel] = LoRAModel, max_position_embeddings: Optional[int] = None, ): self._lora_model_cls = lora_model_cls @@ -88,7 +88,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: self._adapter_manager.supported_lora_modules) packed_modules_mapping = ( self._adapter_manager.packed_modules_mapping) - expected_lora_modules: List[str] = [] + expected_lora_modules: list[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: expected_lora_modules.extend( @@ -162,12 +162,12 @@ def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: def pin_adapter(self, adapter_id: int) -> bool: return self._adapter_manager.pin_adapter(adapter_id) - def set_active_adapters(self, requests: Set[Any], + def set_active_adapters(self, requests: set[Any], mapping: Optional[Any]) -> None: set_active_adapters_worker(requests, mapping, self._apply_adapters, self._adapter_manager.set_adapter_mapping) - def _apply_adapters(self, adapter_requests: Set[Any]) -> None: + def _apply_adapters(self, adapter_requests: set[Any]) -> None: apply_adapters_worker(adapter_requests, self.list_adapters, self._adapter_manager.adapter_slots, self.remove_adapter, self.add_adapter) @@ -184,7 +184,7 @@ def remove_adapter(self, adapter_id: int) -> bool: def remove_all_adapters(self): self._adapter_manager.remove_all_adapters() - def list_adapters(self) -> Set[int]: + def list_adapters(self) -> set[int]: return list_adapters_worker(self._adapter_manager.list_adapters) @@ -195,7 +195,7 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager): (unless they are already loaded) and least recently used LoRAs will be unloaded if the cache is above capacity.""" - _manager_cls: Type[LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager + _manager_cls: type[LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager def create_lora_manager( self, @@ -213,7 +213,7 @@ def create_lora_manager( self._adapter_manager = lora_manager return lora_manager.model - def _apply_adapters(self, lora_requests: Set[LoRARequest]) -> None: + def _apply_adapters(self, lora_requests: set[LoRARequest]) -> None: loras_map = { lora_request.lora_int_id: lora_request for lora_request in lora_requests if lora_request diff --git a/vllm/model_executor/custom_op.py 
b/vllm/model_executor/custom_op.py index dfd052f625211..d0c3c0280428b 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, Type - import torch.nn as nn from vllm.config import get_current_vllm_config @@ -138,7 +136,7 @@ def default_on() -> bool: # Examples: # - MyOp.enabled() # - op_registry["my_op"].enabled() - op_registry: Dict[str, Type['CustomOp']] = {} + op_registry: dict[str, type['CustomOp']] = {} # Decorator to register custom ops. @classmethod diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py index db4ce26806c1f..2b84cad0d1f5c 100644 --- a/vllm/model_executor/guided_decoding/guided_fields.py +++ b/vllm/model_executor/guided_decoding/guided_fields.py @@ -1,16 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List, Optional, TypedDict, Union +from typing import Optional, TypedDict, Union from pydantic import BaseModel # These classes are deprecated, see SamplingParams class LLMGuidedOptions(TypedDict, total=False): - guided_json: Union[Dict, BaseModel, str] + guided_json: Union[dict, BaseModel, str] guided_regex: str - guided_choice: List[str] + guided_choice: list[str] guided_grammar: str guided_decoding_backend: str guided_whitespace_pattern: str @@ -20,9 +20,9 @@ class LLMGuidedOptions(TypedDict, total=False): @dataclass class GuidedDecodingRequest: """One of the fields will be used to retrieve the logit processor.""" - guided_json: Optional[Union[Dict, BaseModel, str]] = None + guided_json: Optional[Union[dict, BaseModel, str]] = None guided_regex: Optional[str] = None - guided_choice: Optional[List[str]] = None + guided_choice: Optional[list[str]] = None guided_grammar: Optional[str] = None guided_decoding_backend: Optional[str] = None guided_whitespace_pattern: Optional[str] = None diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index ba9c982903682..a480fabf7d803 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -6,7 +6,7 @@ from enum import Enum from json import dumps as json_dumps from re import escape as regex_escape -from typing import Tuple, Union +from typing import Union from transformers import PreTrainedTokenizerBase @@ -105,7 +105,7 @@ def get_local_outlines_guided_decoding_logits_processor( def _get_guide_and_mode( guided_params: GuidedDecodingParams -) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]: +) -> Union[tuple[str, GuidedDecodingMode], tuple[None, None]]: if guided_params.json: if isinstance(guided_params.json, dict): # turn dict into hashable string diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index a05267d921d1a..e5d926e82fc67 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -19,7 +19,7 @@ import json from collections import defaultdict from functools import lru_cache -from typing import Callable, DefaultDict, Dict, List, Union +from typing import Callable, Union import numpy as np import torch @@ -40,10 +40,10 @@ class BaseLogitsProcessor: def __init__(self, guide: Guide): self._guide: Guide = guide # CFGState is used for the FSM state for 
CFGGuide - self._fsm_state: DefaultDict[int, Union[int, + self._fsm_state: defaultdict[int, Union[int, CFGState]] = defaultdict(int) - def __call__(self, input_ids: List[int], + def __call__(self, input_ids: list[int], scores: torch.Tensor) -> torch.Tensor: """Use the FSM to bias the logits before sampling the next token.""" seq_id = hash(tuple(input_ids)) @@ -130,7 +130,7 @@ def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase): class JSONLogitsProcessor(RegexLogitsProcessor): - def __init__(self, schema: Union[str, Dict, BaseModel], + def __init__(self, schema: Union[str, dict, BaseModel], tokenizer: PreTrainedTokenizerBase, whitespace_pattern: Union[str, None]): """Compile the FSM that drives the JSON-guided generation. @@ -150,7 +150,7 @@ def __init__(self, schema: Union[str, Dict, BaseModel], """ if isinstance(schema, type(BaseModel)): schema_str = json.dumps(schema.model_json_schema()) - elif isinstance(schema, Dict): + elif isinstance(schema, dict): schema_str = json.dumps(schema) elif isinstance(schema, str): schema_str = schema @@ -219,11 +219,11 @@ def convert_token_to_string(token: str) -> str: return string def change_decoder( - decoder: Callable[[List[int]], - str]) -> Callable[[List[int]], List[str]]: + decoder: Callable[[list[int]], + str]) -> Callable[[list[int]], list[str]]: """Sync vLLM's decoder with the outlines by returning list.""" - def new_decoder(inp_tokens: List[int]) -> List[str]: + def new_decoder(inp_tokens: list[int]) -> list[str]: if (isinstance(inp_tokens, list) and len(inp_tokens) == 1 and isinstance(inp_tokens[0], list)): inp_tokens = inp_tokens[0] diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index eb9d83acb2867..883d4e728abb0 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -6,7 +6,7 @@ import json import re from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, List +from typing import TYPE_CHECKING, Any import torch from transformers import PreTrainedTokenizerFast @@ -281,7 +281,7 @@ def escape_ebnf_string(s: str) -> str: return re.sub(r'(["\\])', r'\\\1', s) @staticmethod - def choice_as_grammar(choice: List[str] | None) -> str: + def choice_as_grammar(choice: list[str] | None) -> str: if choice is None: raise ValueError("Choice is not set") escaped_choices = (GrammarConfig.escape_ebnf_string(c) for c in choice) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 6f933c3fa3c9f..59adfe3d5c32f 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 from contextlib import contextmanager -from typing import Any, Dict, Optional +from typing import Any, Optional from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.triton_utils import HAS_TRITON -_config: Optional[Dict[str, Any]] = None +_config: Optional[dict[str, Any]] = None @contextmanager @@ -19,7 +19,7 @@ def override_config(config): _config = old_config -def get_config() -> Optional[Dict[str, Any]]: +def get_config() -> Optional[dict[str, Any]]: return _config diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 00260313e72eb..20fd415d2dd80 100644 --- 
a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -3,7 +3,7 @@ import functools import json import os -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Optional import torch import triton @@ -578,7 +578,7 @@ def moe_align_block_size( block_size: int, num_experts: int, expert_map: torch.Tensor = None -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Aligns the token distribution across experts to be compatible with block size for matrix multiplication. @@ -676,12 +676,12 @@ def invoke_fused_moe_kernel(A: torch.Tensor, num_tokens_post_padded: torch.Tensor, mul_routed_weight: bool, top_k: int, - config: Dict[str, Any], + config: dict[str, Any], compute_type: tl.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, use_int4_w4a16: bool, - block_shape: Optional[List[int]] = None) -> None: + block_shape: Optional[list[int]] = None) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 @@ -804,7 +804,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, def get_config_file_name(E: int, N: int, dtype: Optional[str], - block_shape: Optional[List[int]] = None) -> str: + block_shape: Optional[list[int]] = None) -> str: device_name = current_platform.get_device_name().replace(" ", "_") dtype_selector = "" if not dtype else f",dtype={dtype}" block_shape_selector = ("" if not block_shape or not all(block_shape) else @@ -820,7 +820,7 @@ def get_moe_configs( dtype: Optional[str], block_n: Optional[int] = None, block_k: Optional[int] = None, -) -> Optional[Dict[int, Any]]: +) -> Optional[dict[int, Any]]: """ Return optimized configurations for the fused MoE kernel. @@ -860,8 +860,8 @@ def get_default_config( topk: int, dtype: Optional[str], is_marlin: bool, - block_shape: Optional[List[int]] = None, -) -> Dict[str, int]: + block_shape: Optional[list[int]] = None, +) -> dict[str, int]: if dtype == "fp8_w8a8" and block_shape is not None: # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0] # BLOCK_SIZE_K must be divisible by block_shape[1] @@ -892,13 +892,13 @@ def get_default_config( def try_get_optimal_moe_config( - w1_shape: Tuple[int, ...], - w2_shape: Tuple[int, ...], + w1_shape: tuple[int, ...], + w2_shape: tuple[int, ...], top_k: int, dtype: Optional[str], M: int, is_marlin: bool = False, - block_shape: Optional[List[int]] = None, + block_shape: Optional[list[int]] = None, ): from vllm.model_executor.layers.fused_moe import get_config override_config = get_config() @@ -1052,7 +1052,7 @@ def inplace_fused_experts(hidden_states: torch.Tensor, w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None) -> None: + block_shape: Optional[list[int]] = None) -> None: fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, activation, use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts, expert_map, @@ -1078,7 +1078,7 @@ def inplace_fused_experts_fake( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None) -> None: + block_shape: Optional[list[int]] = None) -> None: pass @@ -1108,7 +1108,7 @@ def outplace_fused_experts( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: 
Optional[List[int]] = None) -> torch.Tensor: + block_shape: Optional[list[int]] = None) -> torch.Tensor: return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, False, activation, use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts, expert_map, @@ -1134,7 +1134,7 @@ def outplace_fused_experts_fake( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None) -> torch.Tensor: + block_shape: Optional[list[int]] = None) -> torch.Tensor: return torch.empty_like(hidden_states) @@ -1164,7 +1164,7 @@ def fused_experts(hidden_states: torch.Tensor, w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None) -> torch.Tensor: + block_shape: Optional[list[int]] = None) -> torch.Tensor: if inplace: torch.ops.vllm.inplace_fused_experts( @@ -1199,7 +1199,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None): + block_shape: Optional[list[int]] = None): # Check constraints. if use_int4_w4a16: assert hidden_states.shape[1] // 2 == w1.shape[ @@ -1370,7 +1370,7 @@ def fused_moe( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None, + block_shape: Optional[list[int]] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -1413,7 +1413,7 @@ def fused_moe( a1. - a2_scale (Optional[torch.Tensor]): Optional scale to be used for a2. - - block_shape: (Optional[List[int]]): Optional block size for block-wise + - block_shape: (Optional[list[int]]): Optional block size for block-wise quantization. 
Returns: diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 28a88571dab4b..99ce694327441 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -2,7 +2,7 @@ from abc import abstractmethod from enum import Enum -from typing import Callable, List, Optional, Tuple +from typing import Callable, Optional import torch from torch.nn.parameter import UninitializedParameter @@ -698,7 +698,7 @@ def forward(self, hidden_states: torch.Tensor, def make_expert_params_mapping( cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str, ckpt_up_proj_name: str, - num_experts: int) -> List[Tuple[str, str, int, str]]: + num_experts: int) -> list[tuple[str, str, int, str]]: return [ # (param_name, weight_name, expert_id, shard_id) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index b476fb0dbc7eb..9d17b1e3044eb 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Custom normalization layers.""" -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch import torch.nn as nn @@ -39,7 +39,7 @@ def forward_native( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" orig_dtype = x.dtype x = x.to(torch.float32) @@ -77,7 +77,7 @@ def forward_cuda( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if self.variance_size_override is not None: return self.forward_native(x, residual) @@ -104,7 +104,7 @@ def forward_hpu( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: from vllm_hpu_extension.ops import HPUFusedRMSNorm if HPUFusedRMSNorm is None: return self.forward_native(x, residual) @@ -123,7 +123,7 @@ def forward_xpu( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if self.variance_size_override is not None: return self.forward_native(x, residual) @@ -173,7 +173,7 @@ def forward_static( variance_epsilon: float, x: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" orig_dtype = x.dtype if residual is not None: @@ -193,7 +193,7 @@ def forward_native( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" return self.forward_static(self.weight.data, self.variance_epsilon, x, residual) @@ -202,7 +202,7 @@ def forward_cuda( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if torch.compiler.is_compiling(): 
return self.forward_native(x, residual) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index b53a540ed6624..c1c582681bb96 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -107,7 +107,7 @@ def forward_cuda( self, x: torch.Tensor, gate: torch.Tensor, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if self.tp_size > 1 or self.n_groups != 1: return self.forward_native(x, gate) @@ -139,7 +139,7 @@ def extra_groups_for_head_shards(ngroups: int, tp_size: int): def mamba_v2_sharded_weight_loader( - shard_spec: List[Tuple[int, int, float]], + shard_spec: list[tuple[int, int, float]], tp_size: int, tp_rank: int, ) -> LoaderFunction: diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 0012636ef9ffc..8785a5b4cb82d 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from enum import IntEnum -from typing import List, Optional, Union +from typing import Optional, Union import torch import torch.nn as nn @@ -46,7 +46,7 @@ def from_pooling_type( normalize: bool, softmax: bool, step_tag_id: Optional[int] = None, - returned_token_ids: Optional[List[int]] = None, + returned_token_ids: Optional[list[int]] = None, ) -> "SimplePooler": if pooling_type == PoolingType.LAST: assert step_tag_id is None and returned_token_ids is None @@ -174,7 +174,7 @@ def __init__( normalize: bool, softmax: bool, step_tag_id: Optional[int] = None, - returned_token_ids: Optional[List[int]] = None, + returned_token_ids: Optional[list[int]] = None, ): super().__init__(normalize=normalize, softmax=softmax) @@ -245,7 +245,7 @@ def from_config_with_defaults( normalize: bool, softmax: bool, step_tag_id: Optional[int] = None, - returned_token_ids: Optional[List[int]] = None, + returned_token_ids: Optional[list[int]] = None, ) -> SimplePooler: return SimplePooler.from_pooling_type( pooling_type=PoolingType[pooler_config.pooling_type] diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 6cd508d057a44..93fb964eeea9b 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,11 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Type - from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -QUANTIZATION_METHODS: List[str] = [ +QUANTIZATION_METHODS: list[str] = [ "aqlm", "awq", "deepspeedfp", @@ -73,7 +71,7 @@ def _wrapper(quant_config_cls): return _wrapper -def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: +def get_quantization_config(quantization: str) -> type[QuantizationConfig]: if quantization not in QUANTIZATION_METHODS: raise ValueError(f"Invalid quantization method: {quantization}") @@ -104,7 +102,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: from .qqq import QQQConfig from .tpu_int8 import Int8TpuConfig - method_to_config: Dict[str, Type[QuantizationConfig]] = { + method_to_config: dict[str, type[QuantizationConfig]] = { "aqlm": AQLMConfig, "awq": AWQConfig, 
"deepspeedfp": DeepSpeedFPConfig, diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 10f5241f9a717..a2c61e7d0862b 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -4,7 +4,7 @@ # and https://arxiv.org/pdf/2401.06118.pdf import math -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch import torch.nn.functional as F @@ -97,7 +97,7 @@ def generic_dequantize_gemm( codebooks: torch. Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: List[int], + output_partition_sizes: list[int], bias: Optional[torch.Tensor], ) -> torch.Tensor: output_shape = input.shape[:-1] + (scales.shape[0], ) @@ -135,7 +135,7 @@ def optimized_dequantize_gemm( codebooks: torch. Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: List[int], + output_partition_sizes: list[int], bias: Optional[torch.Tensor], ) -> torch.Tensor: weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) @@ -190,7 +190,7 @@ def get_name(cls) -> str: return "aqlm" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.half] @classmethod @@ -198,11 +198,11 @@ def get_min_capability(cls) -> int: return 60 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return [] # no extra configs. @classmethod - def from_config(cls, config: Dict[str, Any]) -> "AQLMConfig": + def from_config(cls, config: dict[str, Any]) -> "AQLMConfig": in_group_size = cls.get_from_keys(config, ["in_group_size"]) nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"]) num_code_books = cls.get_from_keys(config, ["num_codebooks"]) @@ -229,7 +229,7 @@ def __init__(self, quant_config: AQLMConfig): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): del output_size # Unused. 
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 227be1497d0ec..1f0f6f7074d43 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch @@ -24,7 +24,7 @@ def __init__( weight_bits: int, group_size: int, zero_point: bool, - modules_to_not_convert: Optional[List[str]] = None, + modules_to_not_convert: Optional[list[str]] = None, ) -> None: super().__init__() self.weight_bits = weight_bits @@ -47,7 +47,7 @@ def __repr__(self) -> str: def get_name(self) -> str: return "awq" - def get_supported_act_dtypes(self) -> List[torch.dtype]: + def get_supported_act_dtypes(self) -> list[torch.dtype]: return [torch.half] @classmethod @@ -56,7 +56,7 @@ def get_min_capability(cls) -> int: return 75 @staticmethod - def get_config_filenames() -> List[str]: + def get_config_filenames() -> list[str]: return [ "quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq @@ -64,7 +64,7 @@ def get_config_filenames() -> List[str]: ] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "AWQConfig": + def from_config(cls, config: dict[str, Any]) -> "AWQConfig": weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) group_size = cls.get_from_keys(config, ["q_group_size", "group_size"]) zero_point = cls.get_from_keys(config, ["zero_point"]) @@ -81,7 +81,7 @@ def get_quant_method(self, layer: torch.nn.Module, return None -def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]): +def is_layer_skipped_awq(prefix: str, modules_to_not_convert: list[str]): return any(module_name in prefix for module_name in modules_to_not_convert) @@ -97,7 +97,7 @@ def __init__(self, quant_config: AWQConfig): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): if input_size_per_partition % self.quant_config.group_size != 0: diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 473816fcc3ecd..2f543cd9a765d 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Optional import torch from torch.nn import Parameter @@ -45,8 +45,8 @@ class AWQMarlinConfig(QuantizationConfig): def __init__(self, weight_bits: int, group_size: int, zero_point: bool, lm_head_quantized: bool, - modules_to_not_convert: Optional[List[str]], - full_config: Dict[str, Any]) -> None: + modules_to_not_convert: Optional[list[str]], + full_config: dict[str, Any]) -> None: super().__init__() self.pack_factor = 32 // weight_bits # packed into int32 self.group_size = group_size @@ -78,7 +78,7 @@ def get_name(cls) -> str: return "awq_marlin" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.half, torch.bfloat16] @classmethod @@ -86,11 +86,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> List[str]: + 
def get_config_filenames(cls) -> list[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "AWQMarlinConfig": + def from_config(cls, config: dict[str, Any]) -> "AWQMarlinConfig": weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) zero_point = cls.get_from_keys(config, ["zero_point"]) @@ -145,7 +145,7 @@ def get_quant_method(self, layer: torch.nn.Module, return None @classmethod - def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]): + def is_awq_marlin_compatible(cls, quant_config: dict[str, Any]): # Extract data from quant config. quant_method = quant_config.get("quant_method", "").lower() num_bits = quant_config.get("bits") @@ -184,7 +184,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 5ef11546fd41b..5343e6ca0e773 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -2,7 +2,7 @@ import inspect from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Type +from typing import Any, Optional import torch from torch import nn @@ -43,7 +43,7 @@ def process_weights_after_loading(self, layer: nn.Module) -> None: def method_has_implemented_embedding( - method_class: Type[QuantizeMethodBase]) -> bool: + method_class: type[QuantizeMethodBase]) -> bool: """ Not all quant methods have embedding implemented, so we need to check that it exists for our given method. 
We check this by making sure the function @@ -63,7 +63,7 @@ class QuantizationConfig(ABC): def __init__(self): super().__init__() # mapping is updated by models as they initialize - self.packed_modules_mapping: Dict[str, List[str]] = dict() + self.packed_modules_mapping: dict[str, list[str]] = dict() @abstractmethod def get_name(self) -> str: @@ -71,7 +71,7 @@ def get_name(self) -> str: raise NotImplementedError @abstractmethod - def get_supported_act_dtypes(self) -> List[torch.dtype]: + def get_supported_act_dtypes(self) -> list[torch.dtype]: """List of supported activation dtypes.""" raise NotImplementedError @@ -88,13 +88,13 @@ def get_min_capability(cls) -> int: @staticmethod @abstractmethod - def get_config_filenames() -> List[str]: + def get_config_filenames() -> list[str]: """List of filenames to search for in the model directory.""" raise NotImplementedError @classmethod @abstractmethod - def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig": + def from_config(cls, config: dict[str, Any]) -> "QuantizationConfig": """Create a config class from the model's quantization config.""" raise NotImplementedError @@ -110,7 +110,7 @@ def override_quantization_method(cls, hf_quant_cfg, return None @staticmethod - def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any: + def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any: """Get a value from the model's quantization config.""" for key in keys: if key in config: @@ -119,7 +119,7 @@ def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any: "quantization config.") @staticmethod - def get_from_keys_or(config: Dict[str, Any], keys: List[str], + def get_from_keys_or(config: dict[str, Any], keys: list[str], default: Any) -> Any: """Get a optional value from the model's quantization config.""" try: diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 33c2ca93ffa17..b7d0f0564e660 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch @@ -27,7 +27,7 @@ def __init__( bnb_4bit_use_double_quant: bool = False, llm_int8_enable_fp32_cpu_offload: bool = False, llm_int8_has_fp16_weight: bool = False, - llm_int8_skip_modules: Optional[List[str]] = None, + llm_int8_skip_modules: Optional[list[str]] = None, llm_int8_threshold: float = 6.0, ) -> None: super().__init__() @@ -59,7 +59,7 @@ def get_name(self) -> str: return "bitsandbytes" @classmethod - def get_supported_act_dtypes(self) -> List[torch.dtype]: + def get_supported_act_dtypes(self) -> list[torch.dtype]: return [torch.float32, torch.float16, torch.bfloat16] @classmethod @@ -67,13 +67,13 @@ def get_min_capability(cls) -> int: return 70 @staticmethod - def get_config_filenames() -> List[str]: + def get_config_filenames() -> list[str]: return [ "adapter_config.json", ] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "BitsAndBytesConfig": + def from_config(cls, config: dict[str, Any]) -> "BitsAndBytesConfig": def get_safe_value(config, keys, default_value=None): try: @@ -128,7 +128,7 @@ def get_quant_method(self, layer: torch.nn.Module, return None -def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: List[str]): +def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: list[str]): # Split the prefix into its dot-separated components 
components = prefix.split('.') @@ -167,7 +167,7 @@ def __init__(self, quant_config: BitsAndBytesConfig): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): from bitsandbytes.nn import Int8Params diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index ce6c706fe3d27..d9f54839112da 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from contextlib import suppress -from typing import Any, Dict, List, Literal, Optional, Tuple, cast +from typing import Any, Literal, Optional, cast import torch from compressed_tensors.config import (CompressionFormat, @@ -36,20 +36,20 @@ __all__ = ["CompressedTensorsLinearMethod"] SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config" -QUANTIZATION_SCHEME_MAP_TYPE = Dict[str, Optional[Dict[str, QuantizationArgs]]] +QUANTIZATION_SCHEME_MAP_TYPE = dict[str, Optional[dict[str, QuantizationArgs]]] class CompressedTensorsConfig(QuantizationConfig): def __init__( self, - target_scheme_map: Dict[str, Any], - ignore: List[str], + target_scheme_map: dict[str, Any], + ignore: list[str], quant_format: str, - sparsity_scheme_map: Dict[str, SparsityCompressionConfig], - sparsity_ignore_list: List[str], - kv_cache_scheme: Optional[Dict[str, Any]] = None, - config: Optional[Dict[str, Any]] = None, + sparsity_scheme_map: dict[str, SparsityCompressionConfig], + sparsity_ignore_list: list[str], + kv_cache_scheme: Optional[dict[str, Any]] = None, + config: Optional[dict[str, Any]] = None, ): super().__init__() self.ignore = ignore @@ -64,7 +64,7 @@ def __init__( def get_linear_method(self) -> "CompressedTensorsLinearMethod": return CompressedTensorsLinearMethod(self) - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.float16, torch.bfloat16] @classmethod @@ -100,8 +100,8 @@ def get_quant_method( return None @classmethod - def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": - ignore: List[str] = cast(List[str], config.get("ignore", [])) + def from_config(cls, config: dict[str, Any]) -> "CompressedTensorsConfig": + ignore: list[str] = cast(list[str], config.get("ignore", [])) quant_format = cast(str, config.get("format")) target_scheme_map = cls._quantization_scheme_map_from_config( config=config) @@ -119,8 +119,8 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": @classmethod def _parse_sparsity_config( - cls, config: Dict[str, Any] - ) -> Tuple[Dict[str, SparsityCompressionConfig], List[str]]: + cls, config: dict[str, Any] + ) -> tuple[dict[str, SparsityCompressionConfig], list[str]]: """ :param config: The `quantization_config` dictionary from config.json :return: A tuple with two elements @@ -133,7 +133,7 @@ def _parse_sparsity_config( sparsity_config = SparsityCompressionConfig.model_validate( sparsity_config) - sparse_scheme_map: Dict[str, SparsityCompressionConfig] = { + sparse_scheme_map: dict[str, SparsityCompressionConfig] = { target: sparsity_config for target in sparsity_config.targets or list() } @@ -142,13 +142,13 @@ def 
_parse_sparsity_config( @classmethod def _quantization_scheme_map_from_config( - cls, config: Dict[str, Any]) -> QUANTIZATION_SCHEME_MAP_TYPE: + cls, config: dict[str, Any]) -> QUANTIZATION_SCHEME_MAP_TYPE: """ :param config: The `quantization_config` dictionary from config.json :return: A dictionary mapping target layer names to their corresponding quantization_args for weights and input activations """ - target_scheme_map: Dict[str, Any] = dict() + target_scheme_map: dict[str, Any] = dict() quant_format = cast(str, config.get("format")) # The quant_config has multiple config_groups, each containing @@ -186,7 +186,7 @@ def _quantization_scheme_map_from_config( return target_scheme_map @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return [] def _check_scheme_supported(self, @@ -531,7 +531,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): """ @@ -577,7 +577,7 @@ def __init__(self, quant_config: CompressedTensorsConfig): super().__init__(quant_config) @staticmethod - def validate_kv_cache_scheme(kv_cache_scheme: Optional[Dict[str, Any]]): + def validate_kv_cache_scheme(kv_cache_scheme: Optional[dict[str, Any]]): """ Validator for the kv cache scheme. Useful for controlling the kv cache quantization schemes, that are being supported in vLLM diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index c9aa0ec285baf..c9982b7a88e80 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -2,7 +2,7 @@ import enum from enum import Enum -from typing import Callable, List, Optional +from typing import Callable, Optional import torch from compressed_tensors import CompressionFormat @@ -417,10 +417,10 @@ def replace_tensor(name, new_t): del new_t def get_scale_perms(num_bits: int): - scale_perm: List[int] = [] + scale_perm: list[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single: List[int] = [] + scale_perm_single: list[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py index ec805c934e4ae..f010bc03418c3 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Optional import torch from compressed_tensors import CompressionFormat, ModelCompressor @@ -31,7 +31,7 @@ def __init__( quantized: bool = False, weight_quant: Optional[QuantizationArgs] = None, input_quant: Optional[QuantizationArgs] = None, - model_compression_config: Optional[Dict[str, Any]] = None, + model_compression_config: Optional[dict[str, Any]] = None, 
): self.quantized = quantized self.weight_quant = weight_quant @@ -53,7 +53,7 @@ def create_weights( self, layer: torch.nn.Module, input_size: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, @@ -327,9 +327,9 @@ def _process_split( ) return sparsity_compressor.decompress_weight(weight_data) - split_weights: List[torch.Tensor] = [] - split_bitmask: List[torch.Tensor] = [] - split_shape: List[Tuple[int, int]] = [] + split_weights: list[torch.Tensor] = [] + split_bitmask: list[torch.Tensor] = [] + split_shape: list[tuple[int, int]] = [] if isinstance(layer, (QKVParallelLinear, MergedColumnParallelLinear)): split_weights = torch.split(compressed, layer.logical_widths) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index 535ea6b32cfbf..6ea31e50caa72 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Optional +from typing import Callable, Optional import torch from torch.nn import Parameter @@ -58,7 +58,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.meta = Parameter(layer.meta.data, requires_grad=False) def create_weights(self, layer: torch.nn.Module, input_size: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py index 5c8261908735f..d5ff04ee3811b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Optional +from typing import Callable, Optional import torch from compressed_tensors.quantization import QuantizationStrategy @@ -58,7 +58,7 @@ def process_weights_after_loading(self, layer) -> None: prepare_fp8_layer_for_marlin(layer, strategy="channel") def create_weights(self, layer: torch.nn.Module, input_size: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 32072e9fa570f..37cb2a4e99e02 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Optional +from typing import Callable, Optional import torch from compressed_tensors.quantization import QuantizationStrategy @@ -89,7 +89,7 @@ 
def process_weights_after_loading(self, layer) -> None: layer.input_scale = None def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 08d86a4e5ddd2..7792ce86553c6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Optional, Set +from typing import Callable, Optional import torch from compressed_tensors.quantization import QuantizationStrategy @@ -19,7 +19,7 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme): - _kernel_backends_being_used: Set[str] = set() + _kernel_backends_being_used: set[str] = set() def __init__(self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool): @@ -33,7 +33,7 @@ def get_min_capability(cls) -> int: return 75 def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 38df09ff39373..e19ea17361201 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Optional, Set +from typing import Callable, Optional import torch from compressed_tensors.quantization import ActivationOrdering @@ -30,7 +30,7 @@ class CompressedTensorsWNA16(CompressedTensorsScheme): - _kernel_backends_being_used: Set[str] = set() + _kernel_backends_being_used: set[str] = set() def __init__(self, strategy: str, @@ -61,7 +61,7 @@ def get_min_capability(cls) -> int: return 80 def create_weights(self, layer: torch.nn.Module, output_size: int, - input_size: int, output_partition_sizes: List[int], + input_size: int, output_partition_sizes: list[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py index b69c5e7a02a72..664697a037009 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Type +from typing import Optional import torch import triton @@ -126,7 +126,7 @@ def triton_scaled_mm(input: torch.Tensor, weight: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], + out_dtype: type[torch.dtype], bias: Optional[torch.Tensor] = None, block_size_m: int = 32, block_size_n: int = 32, diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 85ae1d5cb7878..ccd54281ceb7e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import re +from collections.abc import Iterable, Mapping from types import MappingProxyType -from typing import Iterable, List, Mapping, Optional +from typing import Optional from compressed_tensors import CompressionFormat from torch.nn import Module @@ -20,7 +21,7 @@ def is_activation_quantization_format(format: str) -> bool: def should_ignore_layer( layer_name: Optional[str], ignore: Iterable[str] = tuple(), - fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) + fused_mapping: Mapping[str, list[str]] = MappingProxyType({}) ) -> bool: if layer_name is None: return False @@ -84,7 +85,7 @@ def find_matched_target( layer_name: Optional[str], module: Module, targets: Iterable[str], - fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) + fused_mapping: Mapping[str, list[str]] = MappingProxyType({}) ) -> str: """ Helper function to look up which "target" in the compressed-tensors @@ -171,7 +172,7 @@ def _is_equal_or_regex_match(value: str, def _match_fused_layer( layer_name: str, target_layers: Iterable[str], - fused_mapping: Mapping[str, List[str]]) -> Optional[str]: + fused_mapping: Mapping[str, list[str]]) -> Optional[str]: """ Match a fused layer name to its corresponding individual layer in target_layers. Returns first value in fused_mapping which matches targets @@ -201,7 +202,7 @@ def _match_fused_layer( ] # for each unfused component, find a match in targets - unfused_matches: List[Optional[str]] = [] + unfused_matches: list[Optional[str]] = [] for unfused in unfused_paths: for target in target_layers: if _is_equal_or_regex_match(unfused, target): diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py index 67934d37284e1..e44b25a91b0e4 100644 --- a/vllm/model_executor/layers/quantization/deepspeedfp.py +++ b/vllm/model_executor/layers/quantization/deepspeedfp.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch import torch.nn as nn @@ -45,7 +45,7 @@ def get_name(cls) -> str: return "DeepSpeedFP" @classmethod - def from_config(cls, config: Dict[str, Any]) -> "DeepSpeedFPConfig": + def from_config(cls, config: dict[str, Any]) -> "DeepSpeedFPConfig": weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) return cls(weight_bits=weight_bits, group_size=group_size) @@ -54,7 +54,7 @@ def get_linear_method(self) -> "DeepSpeedFPLinearMethod": return DeepSpeedFPLinearMethod(self) @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.half, torch.bfloat16] @classmethod @@ -63,7 +63,7 @@ def get_min_capability(cls) -> int: return 60 @staticmethod - def get_config_filenames() -> List[str]: + def get_config_filenames() -> list[str]: return [ "quant_config.json", "quantize_config.json", @@ -90,7 +90,7 @@ def __init__(self, quant_config: DeepSpeedFPConfig): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], + output_partition_sizes: 
list[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index d18ca55afebdb..cfa8ae7c00644 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Optional import torch @@ -24,7 +24,7 @@ def get_name(cls) -> str: return "experts_int8" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.bfloat16, torch.half] @classmethod @@ -32,11 +32,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return [] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "ExpertsInt8Config": + def from_config(cls, config: dict[str, Any]) -> "ExpertsInt8Config": return cls() def get_quant_method(self, layer: torch.nn.Module, diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 20f2c3da600d7..3fc3f6c677be9 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch from torch.nn import Module @@ -29,7 +29,7 @@ class FBGEMMFp8Config(QuantizationConfig): """Config class for FBGEMM Fp8.""" - def __init__(self, ignore_list: List[str], input_scale_ub: float): + def __init__(self, ignore_list: list[str], input_scale_ub: float): super().__init__() self.ignore_list = ignore_list if ignore_list else [] self.input_scale_ub = input_scale_ub @@ -43,7 +43,7 @@ def get_name(cls) -> str: return "fbgemm_fp8" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.bfloat16, torch.float16] @classmethod @@ -51,11 +51,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return [] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "FBGEMMFp8Config": + def from_config(cls, config: dict[str, Any]) -> "FBGEMMFp8Config": ignore_list = cls.get_from_keys(config, ["modules_to_not_convert"]) input_scale_ub = cls.get_from_keys(config, ["activation_scale_ub"]) return cls(ignore_list=ignore_list, input_scale_ub=input_scale_ub) @@ -79,7 +79,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a705f63be4acb..e3fca559c397e 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Optional import torch import torch.nn.functional as F @@ -45,8 +45,8 @@ def __init__( self, is_checkpoint_fp8_serialized: bool = False, activation_scheme: 
str = "dynamic", - ignored_layers: Optional[List[str]] = None, - weight_block_size: Optional[List[int]] = None, + ignored_layers: Optional[list[str]] = None, + weight_block_size: Optional[list[int]] = None, ) -> None: super().__init__() self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized @@ -78,7 +78,7 @@ def get_name(cls) -> str: return "fp8" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.bfloat16, torch.half] @classmethod @@ -86,11 +86,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return [] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": + def from_config(cls, config: dict[str, Any]) -> "Fp8Config": quant_method = cls.get_from_keys(config, ["quant_method"]) is_checkpoint_fp8_serialized = ("fp8" in quant_method) activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) @@ -157,7 +157,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index ba176e4a567cc..19f0fc9717a29 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Optional import gguf import torch @@ -31,7 +31,7 @@ def __repr__(self) -> str: def get_name(self) -> str: return "gguf" - def get_supported_act_dtypes(self) -> List[torch.dtype]: + def get_supported_act_dtypes(self) -> list[torch.dtype]: return [torch.half] @classmethod @@ -39,11 +39,11 @@ def get_min_capability(cls) -> int: return 60 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return [] # no extra configs. 
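        # GGUF checkpoints embed their quantization metadata in the .gguf file
        # itself, so there is no separate quantization config file to search for.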
@classmethod - def from_config(cls, config: Dict[str, Any]) -> "GGUFConfig": + def from_config(cls, config: dict[str, Any]) -> "GGUFConfig": return cls() def get_quant_method(self, layer: torch.nn.Module, @@ -131,7 +131,7 @@ def __init__(self, quant_config: GGUFConfig): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): output_size_per_partition = sum(output_partition_sizes) @@ -332,7 +332,7 @@ def embedding(self, layer: torch.nn.Module, class GGUFUninitializedParameter(UninitializedParameter): cls_to_become = Parameter - data_container: List[torch.Tensor] + data_container: list[torch.Tensor] def materialize_nested(self) -> Parameter: dtype = {data.dtype for data in self.data_container} diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 1c8d6cb1ea79a..c057a4bdabe7d 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -3,7 +3,7 @@ import enum from enum import Enum from fractions import Fraction -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union import torch from torch.nn.parameter import Parameter @@ -33,11 +33,11 @@ def __init__( group_size: int, desc_act: bool, lm_head_quantized: bool, - dynamic: Dict[str, Dict[str, Union[int, bool]]], + dynamic: dict[str, dict[str, Union[int, bool]]], ) -> None: # GPTQModel use `dynamic` config property to allow per module # quantization config so each module can be individually optimized. - # Format is Dict[str, Dict] where key is a regex string that can + # Format is dict[str, dict] where key is a regex string that can # perform both positive ("+:" prefixed) or negative ("-:" prefixed) # matching of a module. 
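        # Illustrative example (assumed values, not taken from this patch):
        #   dynamic = {
        #       "+:model\.layers\.0\..*": {"bits": 8, "group_size": 32},
        #       "-:.*\.lm_head": {},
        #   }
        # The first entry positively matches layer 0's submodules and overrides
        # bits/group_size for them; the second negatively matches lm_head.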
# Default to positive match, override base quant config mode, if no @@ -83,7 +83,7 @@ def get_name(cls) -> str: return "gptq" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.half] @classmethod @@ -92,11 +92,11 @@ def get_min_capability(cls) -> int: return 60 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig": + def from_config(cls, config: dict[str, Any]) -> "GPTQConfig": dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) dynamic = {} if dynamic is None else dynamic @@ -134,7 +134,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 21db8ccba059c..a826a7b5be4b3 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, List, Optional, Set, Union +from typing import Any, Callable, Optional, Union import torch @@ -44,8 +44,8 @@ class GPTQMarlinConfig(QuantizationConfig): def __init__(self, weight_bits: int, group_size: int, desc_act: bool, is_sym: bool, lm_head_quantized: bool, - dynamic: Dict[str, Dict[str, Union[int, bool]]], - full_config: Dict[str, Any]) -> None: + dynamic: dict[str, dict[str, Union[int, bool]]], + full_config: dict[str, Any]) -> None: super().__init__() if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False @@ -54,7 +54,7 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, # GPTQModel use `dynamic` config property to allow per module # quantization config so each module can be individually optimized. - # Format is Dict[str, Dict] where key is a regex string that can + # Format is dict[str, dict] where key is a regex string that can # perform both positive ("+:" prefixed) or negative ("-:" prefixed) # matching of a module. 
# Default to positive match, override base quant config mode, if no @@ -104,7 +104,7 @@ def get_name(cls) -> str: return "gptq_marlin" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.half, torch.bfloat16] @classmethod @@ -112,11 +112,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": + def from_config(cls, config: dict[str, Any]) -> "GPTQMarlinConfig": dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) dynamic = {} if dynamic is None else dynamic @@ -163,7 +163,7 @@ def get_quant_method(self, layer: torch.nn.Module, GPTQMarlinLinearMethod) @classmethod - def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): + def is_gptq_marlin_compatible(cls, quant_config: dict[str, Any]): quant_method = quant_config.get("quant_method", "").lower() num_bits = quant_config.get("bits") group_size = quant_config.get("group_size") @@ -195,7 +195,7 @@ class GPTQMarlinLinearMethod(LinearMethodBase): quant_config: The GPTQ Marlin quantization config. """ - _kernel_backends_being_used: Set[str] = set() + _kernel_backends_being_used: set[str] = set() def __init__(self, quant_config: GPTQMarlinConfig) -> None: self.quant_config = quant_config @@ -208,7 +208,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py index dd747e182e289..be97fb2f176fc 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch from torch.nn.parameter import Parameter @@ -89,7 +89,7 @@ def get_name(cls) -> str: return "gptq_marlin_24" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.half] @classmethod @@ -98,11 +98,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlin24Config": + def from_config(cls, config: dict[str, Any]) -> "GPTQMarlin24Config": weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) return cls(weight_bits, group_size) @@ -145,7 +145,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py index 4edc9aa848a19..f42212b90387f 100644 --- a/vllm/model_executor/layers/quantization/hqq_marlin.py +++ b/vllm/model_executor/layers/quantization/hqq_marlin.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional 
+from typing import Any, Optional import torch @@ -31,7 +31,7 @@ def __init__( self, weight_bits: int, group_size: int, - skip_modules: Optional[List[str]] = None, + skip_modules: Optional[list[str]] = None, ) -> None: super().__init__() assert group_size == 64, ("The only supported HQQ group size is " @@ -54,7 +54,7 @@ def get_name(cls) -> str: return "hqq" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.half, torch.bfloat16] @classmethod @@ -62,11 +62,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "HQQMarlinConfig": + def from_config(cls, config: dict[str, Any]) -> "HQQMarlinConfig": wq_params = (config["quant_config"]["weight_quant_params"]) weight_bits = cls.get_from_keys(wq_params, ["nbits"]) group_size = cls.get_from_keys(wq_params, ["group_size"]) @@ -191,7 +191,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index c09cc13cb276b..8a39c6edb0e8a 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch @@ -31,7 +31,7 @@ def __init__( method: str, weight_bits: int, group_size: int, - modules_to_not_convert: Optional[List[str]] = None, + modules_to_not_convert: Optional[list[str]] = None, desc_act: Optional[bool] = None, lm_head_quantized: Optional[bool] = None, ) -> None: @@ -62,7 +62,7 @@ def get_name(cls) -> str: return "ipex" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.bfloat16, torch.float16] @classmethod @@ -70,14 +70,14 @@ def get_min_capability(cls) -> int: return -1 @staticmethod - def get_config_filenames() -> List[str]: + def get_config_filenames() -> list[str]: return [ "quant_config.json", "quantize_config.json", ] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "IPEXConfig": + def from_config(cls, config: dict[str, Any]) -> "IPEXConfig": method = cls.get_from_keys(config, ["quant_method"]).lower() if method == "awq": weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index c06befaf3b5ad..55ad00b1cf461 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Callable, Optional, Tuple +from typing import Callable, Optional import torch @@ -12,8 +12,8 @@ @dataclass class MPLinearLayerConfig: - full_weight_shape: Tuple[int, int] # [in, out] - partition_weight_shape: Tuple[int, int] + full_weight_shape: tuple[int, int] # [in, out] + partition_weight_shape: 
tuple[int, int] weight_type: ScalarType act_type: torch.dtype group_size: int @@ -31,7 +31,7 @@ def get_min_capability(cls) -> int: @classmethod @abstractmethod def can_implement(cls, - c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]: + c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]: raise NotImplementedError def __init__(self, @@ -75,7 +75,7 @@ def _transform_param(self, layer: torch.nn.Module, name: Optional[str], torch.nn.Parameter(new_param.data, requires_grad=False)) def _get_weight_params( - self, layer: torch.nn.Module) -> Tuple[ + self, layer: torch.nn.Module) -> tuple[ torch.Tensor, # w_q torch.Tensor, # w_s Optional[torch.Tensor], # w_zp, diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py index bcfdb16777166..537553e7d3d70 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Type +from typing import Optional import vllm.envs as envs from vllm.model_executor.layers.quantization.kernels.mixed_precision.exllama import ( # noqa: E501 @@ -14,7 +14,7 @@ from vllm.platforms import current_platform # in priority/performance order (when available) -_POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [ +_POSSIBLE_KERNELS: list[type[MPLinearKernel]] = [ MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel, @@ -23,7 +23,7 @@ def choose_mp_linear_kernel( config: MPLinearLayerConfig, - compute_capability: Optional[int] = None) -> Type[MPLinearKernel]: + compute_capability: Optional[int] = None) -> type[MPLinearKernel]: """ Choose an MPLinearKernel that can implement the given config for the given compute capability. Attempts to choose the best kernel in terms of @@ -40,7 +40,7 @@ def choose_mp_linear_kernel( ValueError: If no kernel can implement the given config. Returns: - Type[MPLinearKernel]: Chosen kernel. + type[MPLinearKernel]: Chosen kernel. 
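    Example (illustrative only; `cfg` stands for a fully populated
    MPLinearLayerConfig):

        kernel_cls = choose_mp_linear_kernel(cfg, compute_capability=90)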
""" if compute_capability is None: if current_platform is None: diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py index 2706fbb539ab4..50d293cf415bf 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple +from typing import Optional import torch @@ -25,7 +25,7 @@ def get_min_capability(cls) -> int: @classmethod def can_implement(cls, - c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]: + c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]: if c.has_g_idx and\ c.partition_weight_shape[0] != c.full_weight_shape[0]: return False, "Act reordering currently not supported by Exllama, "\ diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py index 3f0586f6e30d6..2dec4ace92f09 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from functools import partial -from typing import Optional, Tuple +from typing import Optional import torch @@ -25,7 +25,7 @@ def get_min_capability(cls) -> int: @classmethod def can_implement(cls, - c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]: + c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]: if c.has_g_idx and\ c.partition_weight_shape[0] != c.full_weight_shape[0]: return False, "Act reordering currently not supported by Machete, "\ diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py index e21801cf6a785..ed8a31b318393 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple +from typing import Optional import torch @@ -24,7 +24,7 @@ def get_min_capability(cls) -> int: @classmethod def can_implement(cls, - c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]: + c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]: if c.zero_points: return False, "Zero points currently not supported by "\ " MarlinLinearKernel. 
Will be added when AWQMarlin "\ diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index 91e7654053f9d..2d92af74bbf9a 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Optional import torch @@ -24,7 +24,7 @@ def get_min_capability(cls) -> int: @classmethod @abstractmethod def can_implement( - cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: + cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: raise NotImplementedError def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str, @@ -50,7 +50,7 @@ def apply_weights(self, raise NotImplementedError def _get_weight_params( - self, layer: torch.nn.Module) -> Tuple[ + self, layer: torch.nn.Module) -> tuple[ torch.Tensor, # weight torch.Tensor, # weight_scale Optional[torch.Tensor], # input_scale, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index a5967995ac88d..3ac2fd4ed9b43 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Dict, List, Optional, Type +from typing import Optional from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import ( CutlassScaledMMLinearKernel) @@ -14,7 +14,7 @@ from vllm.platforms import PlatformEnum, current_platform # in priority/performance order (when available) -_POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = { +_POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = { PlatformEnum.CPU: [CutlassScaledMMLinearKernel], PlatformEnum.CUDA: [CutlassScaledMMLinearKernel], PlatformEnum.ROCM: [TritonScaledMMLinearKernel], @@ -25,7 +25,7 @@ def choose_scaled_mm_linear_kernel( config: ScaledMMLinearLayerConfig, compute_capability: Optional[int] = None -) -> Type[ScaledMMLinearKernel]: +) -> type[ScaledMMLinearKernel]: """ Choose an ScalledMMLinearKernel that can implement the given config for the given compute capability. Attempts to choose the best kernel in terms of @@ -42,7 +42,7 @@ def choose_scaled_mm_linear_kernel( ValueError: If no kernel can implement the given config. Returns: - Type[ScaledMMLinearKernel]: Chosen kernel. + type[ScaledMMLinearKernel]: Chosen kernel. 
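    Example (illustrative only; `cfg` stands for a fully populated
    ScaledMMLinearLayerConfig):

        kernel_cls = choose_scaled_mm_linear_kernel(cfg)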
""" if compute_capability is None: diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py index 2bf21a05c46d9..245f6635cf85a 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple +from typing import Optional import torch @@ -22,7 +22,7 @@ def get_min_capability(cls) -> int: @classmethod def can_implement( - cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: + cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: if (not current_platform.is_cuda() and not current_platform.is_cpu()): return False, "CutlassScaledMM requires running on CUDA or CPU." diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py index 5da5df8efaeb0..c09ca83d01cbb 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple +from typing import Optional import torch @@ -18,7 +18,7 @@ def get_min_capability(cls) -> int: @classmethod def can_implement( - cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: + cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: if current_platform.is_cpu(): return ( False, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py index 0bf090d7fab3c..ab27f49115c26 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import warnings -from typing import Optional, Tuple +from typing import Optional import torch from functorch.experimental.control_flow import cond # noqa: F401 @@ -25,7 +25,7 @@ def get_min_capability(cls) -> int: @classmethod def can_implement( - cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: + cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: if not current_platform.is_tpu(): return False, "ScaledMMXLA requires running on TPU." 
diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 4cf0c677c0794..284abeea912e6 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch from torch.nn.parameter import Parameter @@ -67,7 +67,7 @@ def get_name(cls) -> str: return "marlin" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.half] @classmethod @@ -76,11 +76,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "MarlinConfig": + def from_config(cls, config: dict[str, Any]) -> "MarlinConfig": group_size = cls.get_from_keys(config, ["group_size"]) lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) @@ -127,7 +127,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 36711a7a5098b..4223bf3cb6378 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch from torch.nn import Module @@ -39,7 +39,7 @@ def get_name(cls) -> str: return "modelopt" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.bfloat16, torch.half] @classmethod @@ -47,11 +47,11 @@ def get_min_capability(cls) -> int: return 89 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return ["hf_quant_config.json"] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "ModelOptFp8Config": + def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": quant_config = cls.get_from_keys(config, ["quantization"]) quant_method = quant_config["quant_algo"] is_checkpoint_fp8_serialized = ("FP8" in quant_method) @@ -101,7 +101,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index a3adac1bb129b..99c46007ca6e7 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Optional import torch @@ -22,8 +22,8 @@ class MoeWNA16Config(QuantizationConfig): def __init__(self, linear_quant_method: str, weight_bits: int, group_size: int, has_zp: bool, lm_head_quantized: bool, - modules_to_not_convert: Optional[List[str]], - full_config: Dict[str, Any]) -> None: + 
modules_to_not_convert: Optional[list[str]], + full_config: dict[str, Any]) -> None: super().__init__() self.weight_bits = weight_bits self.group_size = group_size @@ -68,7 +68,7 @@ def get_name(cls) -> str: return "moe_wna16" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.bfloat16, torch.half] @classmethod @@ -76,11 +76,11 @@ def get_min_capability(cls) -> int: return 70 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "MoeWNA16Config": + def from_config(cls, config: dict[str, Any]) -> "MoeWNA16Config": linear_quant_method = cls.get_from_keys(config, ["quant_method"]) weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) @@ -108,7 +108,7 @@ def override_quantization_method(cls, hf_quant_cfg, return None @classmethod - def is_moe_wna16_compatible(cls, quant_config: Dict[str, Any]): + def is_moe_wna16_compatible(cls, quant_config: dict[str, Any]): # Extract data from quant config. quant_method = quant_config.get("quant_method", "").lower() num_bits = quant_config.get("bits") @@ -162,7 +162,7 @@ def get_quant_method(self, layer: torch.nn.Module, return None -def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]): +def is_layer_skipped_quant(prefix: str, modules_to_not_convert: list[str]): return any(module_name in prefix for module_name in modules_to_not_convert) diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index f6f66803f8169..325ea71871f99 100644 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -2,7 +2,7 @@ import os from importlib.util import find_spec -from typing import Any, Dict, List, Optional +from typing import Any, Optional from torch.nn import Module @@ -33,7 +33,7 @@ def __init__( def get_name(self) -> str: return "neuron_quant" - def get_supported_act_dtypes(self) -> List[str]: + def get_supported_act_dtypes(self) -> list[str]: return SUPPORTED_QUANT_DTYPE_LIST @classmethod @@ -42,11 +42,11 @@ def get_min_capability(cls) -> int: "This function should not be called with Neuron Backend") @staticmethod - def get_config_filenames() -> List[str]: + def get_config_filenames() -> list[str]: return [] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "NeuronQuantConfig": + def from_config(cls, config: dict[str, Any]) -> "NeuronQuantConfig": quantize_method = cls.get_from_keys(config, ["quantize_method"]) dequant_dtype = cls.get_from_keys(config, ["dequant_dtype"]) return cls(dequant_dtype=dequant_dtype, diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index 1ded5389e5f45..95cfbca8d05c8 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch from torch.nn.parameter import Parameter @@ -31,7 +31,7 @@ class PTPCFp8Config(Fp8Config): def __init__( self, activation_scheme: str = "dynamic", - ignored_layers: Optional[List[str]] = None, + ignored_layers: Optional[list[str]] = None, ) -> None: if not current_platform.is_rocm(): raise 
ValueError( @@ -54,7 +54,7 @@ def get_name(cls) -> str: return "ptpc_fp8" @classmethod - def from_config(cls, config: Dict[str, Any]) -> "PTPCFp8Config": + def from_config(cls, config: dict[str, Any]) -> "PTPCFp8Config": activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) return cls(activation_scheme=activation_scheme, diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py index 1e05917a5187b..58d2adbed35c9 100644 --- a/vllm/model_executor/layers/quantization/qqq.py +++ b/vllm/model_executor/layers/quantization/qqq.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Optional import torch from torch.nn.parameter import Parameter @@ -88,7 +88,7 @@ def get_name(cls) -> str: return "qqq" @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.half] @classmethod @@ -96,7 +96,7 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: """List of filenames to search for in the model directory.""" return [ "quant_config.json", @@ -104,7 +104,7 @@ def get_config_filenames(cls) -> List[str]: ] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "QQQConfig": + def from_config(cls, config: dict[str, Any]) -> "QQQConfig": weight_bits = cls.get_from_keys(config, ["wbits"]) group_size = cls.get_from_keys(config, ["group_size"]) return cls(weight_bits, group_size) @@ -130,7 +130,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index ca71da8b736a5..c512393774c59 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -2,7 +2,7 @@ import fnmatch import re -from typing import Any, Dict, List, Optional, cast +from typing import Any, Optional, cast import torch @@ -26,9 +26,9 @@ class QuarkConfig(QuantizationConfig): def __init__(self, - quant_config: Dict[str, Any], - kv_cache_group: Optional[List[str]] = None, - kv_cache_config: Optional[Dict[str, Any]] = None, + quant_config: dict[str, Any], + kv_cache_group: Optional[list[str]] = None, + kv_cache_config: Optional[dict[str, Any]] = None, pack_method: str = "reorder"): super().__init__() if kv_cache_group is None: @@ -41,7 +41,7 @@ def __init__(self, def get_linear_method(self) -> "QuarkLinearMethod": return QuarkLinearMethod(self) - def get_supported_act_dtypes(cls) -> List[torch.dtype]: + def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.float16, torch.bfloat16] @classmethod @@ -56,7 +56,7 @@ def get_quant_method(self, layer: torch.nn.Module, from vllm.attention.layer import Attention # Avoid circular import # Check if the layer is skipped for quantization. 
- exclude_layers = cast(List[str], self.quant_config.get("exclude")) + exclude_layers = cast(list[str], self.quant_config.get("exclude")) if should_ignore_layer(prefix, ignore=exclude_layers, fused_mapping=self.packed_modules_mapping): @@ -74,12 +74,12 @@ def get_quant_method(self, layer: torch.nn.Module, return None @classmethod - def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig": + def from_config(cls, config: dict[str, Any]) -> "QuarkConfig": export_config = config.get("export") if export_config is None: raise ValueError("The export key should be included in " "the configurations of Quark quantized model") - kv_cache_group = cast(List[str], export_config.get("kv_cache_group")) + kv_cache_group = cast(list[str], export_config.get("kv_cache_group")) pack_method = cast(str, export_config.get("pack_method")) # In the export model of quark, the quantization configuration @@ -91,7 +91,7 @@ def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig": kv_cache_config = None else: kv_cache_set = set(kv_cache_group) - layer_quant_config = cast(Dict[str, Any], + layer_quant_config = cast(dict[str, Any], config.get("layer_quant_config")) layer_quant_names = list(layer_quant_config.keys()) layer_quant_set = set(layer_quant_names) @@ -104,7 +104,7 @@ def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig": "configuration.") q_configs = [ - cast(Dict[str, Any], layer_quant_config.get(name)) + cast(dict[str, Any], layer_quant_config.get(name)) for name in kv_cache_group ] if not all( @@ -131,7 +131,7 @@ def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig": pack_method=pack_method) @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return [] def _check_scheme_supported(self, @@ -151,8 +151,8 @@ def _check_scheme_supported(self, else: return False - def _is_fp8_w8a8(self, weight_quant: Optional[Dict[str, Any]], - input_quant: Optional[Dict[str, Any]]) -> bool: + def _is_fp8_w8a8(self, weight_quant: Optional[dict[str, Any]], + input_quant: Optional[dict[str, Any]]) -> bool: # Confirm weights and input quantized. if weight_quant is None or input_quant is None: return False @@ -176,8 +176,8 @@ def _is_fp8_w8a8(self, weight_quant: Optional[Dict[str, Any]], is_per_tensor_activation = (input_quant.get("qscheme") == "per_tensor") return is_per_tensor_activation - def _is_static_tensor_w8a8(self, weight_quant: Optional[Dict[str, Any]], - input_quant: Optional[Dict[str, Any]]) -> bool: + def _is_static_tensor_w8a8(self, weight_quant: Optional[dict[str, Any]], + input_quant: Optional[dict[str, Any]]) -> bool: # Confirm weights and input quantized. 
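        # Illustration (hypothetical key names, added for clarity): a static
        # per-tensor int8 scheme would look roughly like
        #   {"dtype": "int8", "qscheme": "per_tensor",
        #    "symmetric": True, "is_dynamic": False}
        # for both weight_quant and input_quant.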
if weight_quant is None or input_quant is None: return False @@ -199,7 +199,7 @@ def _is_static_tensor_w8a8(self, weight_quant: Optional[Dict[str, Any]], return is_int8_dtype and is_tensor and is_weight_symmetric and is_static def _find_matched_config(self, layer_name: str, - module: torch.nn.Module) -> Dict[str, Any]: + module: torch.nn.Module) -> dict[str, Any]: proj_name = layer_name.split(".")[-1] if proj_name in self.packed_modules_mapping: @@ -224,29 +224,29 @@ def _find_matched_config(self, layer_name: str, return shard_configs[0] else: layer_quant_config = cast( - Dict[str, Any], self.quant_config.get("layer_quant_config")) + dict[str, Any], self.quant_config.get("layer_quant_config")) for name_pattern in layer_quant_config: if fnmatch.fnmatch(layer_name, name_pattern): return layer_quant_config[name_pattern] layer_type = cast(str, type(module)) layer_type_quant_config = cast( - Dict[str, Any], + dict[str, Any], self.quant_config.get("layer_type_quant_config")) if layer_type in layer_type_quant_config: return layer_type_quant_config[layer_type] global_quant_config = cast( - Dict[str, Any], self.quant_config.get("global_quant_config")) + dict[str, Any], self.quant_config.get("global_quant_config")) return global_quant_config - def _get_scheme_from_config(self, config: Dict[str, Any]) -> "QuarkScheme": + def _get_scheme_from_config(self, config: dict[str, Any]) -> "QuarkScheme": if config.get("output_tensors") or config.get("bias"): raise NotImplementedError( "Currently, Quark models with output_tensors " "and bias quantized are not supported") - weight_config = cast(Dict[str, Any], config.get("weight")) - input_config = cast(Dict[str, Any], config.get("input_tensors")) + weight_config = cast(dict[str, Any], config.get("weight")) + input_config = cast(dict[str, Any], config.get("input_tensors")) if self._is_fp8_w8a8(weight_config, input_config): is_fp8_w8a8_supported = self._check_scheme_supported( @@ -323,7 +323,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): """ @@ -367,7 +367,7 @@ def __init__(self, quant_config: QuarkConfig): super().__init__(quant_config) @staticmethod - def validate_kv_cache_config(kv_cache_config: Optional[Dict[str, Any]]): + def validate_kv_cache_config(kv_cache_config: Optional[dict[str, Any]]): """ Validator for the kv cache configuration. 
Useful for controlling the kv cache quantization schemes, that are being supported in vLLM diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 32dce5aaf5e07..1ae3fc937a28d 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Optional import torch @@ -45,7 +45,7 @@ def get_moe_method( class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): - def __init__(self, weight_config: Dict[str, Any], input_config: Dict[str, + def __init__(self, weight_config: dict[str, Any], input_config: dict[str, Any]): self.weight_quant = weight_config self.input_quant = input_config diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py index c885e98a4d66e..221d3c93b5fb8 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Optional +from typing import Callable, Optional import torch from torch.nn import Parameter @@ -83,7 +83,7 @@ def process_weights_after_loading(self, layer) -> None: layer.input_scale = None def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py index 1bf34b098938c..f3dc4ab705764 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Optional, Set +from typing import Callable, Optional import torch @@ -17,7 +17,7 @@ class QuarkW8A8Int8(QuarkScheme): - _kernel_backends_being_used: Set[str] = set() + _kernel_backends_being_used: set[str] = set() def __init__(self, qscheme: str, is_static_input_scheme: Optional[bool], input_symmetric: Optional[bool]): @@ -31,7 +31,7 @@ def get_min_capability(cls) -> int: return 75 def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: List[int], + output_partition_sizes: list[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py index 17e0df021085a..d1d293b017914 100644 --- a/vllm/model_executor/layers/quantization/quark/utils.py +++ b/vllm/model_executor/layers/quantization/quark/utils.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import re +from collections.abc import Iterable, Mapping from types import MappingProxyType -from typing import Any, Iterable, List, Mapping, Optional +from typing import Any, Optional def deep_compare(dict1: Any, dict2: Any) -> bool: @@ -21,7 +22,7 @@ def deep_compare(dict1: Any, dict2: Any) -> bool: def should_ignore_layer( layer_name: Optional[str], ignore: Iterable[str], - fused_mapping: Mapping[str, List[str]] = 
MappingProxyType({}) + fused_mapping: Mapping[str, list[str]] = MappingProxyType({}) ) -> bool: if layer_name is None: return False diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py index 026881f2dbaac..c0be40c16affc 100644 --- a/vllm/model_executor/layers/quantization/schema.py +++ b/vllm/model_executor/layers/quantization/schema.py @@ -12,7 +12,7 @@ scaling factors. """ -from typing import Dict, Optional +from typing import Optional from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator @@ -23,7 +23,7 @@ class KVCacheQuantSchema(BaseModel): # layer indices to their per-tensor KV cache scaling factor. # TODO: Consider pulling this and its validation methods out into its # own schema class (tricky as its members are variable) - scaling_factor: Dict[int, Dict[int, float]] + scaling_factor: dict[int, dict[int, float]] @model_validator(mode="after") def check_is_fp8(self) -> "KVCacheQuantSchema": diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py index 14e5bcf6e5bbe..a7c2b623ddea1 100644 --- a/vllm/model_executor/layers/quantization/tpu_int8.py +++ b/vllm/model_executor/layers/quantization/tpu_int8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Optional import torch from torch.nn import Module @@ -30,7 +30,7 @@ def __init__( def get_name(self) -> str: return "tpu_int8" - def get_supported_act_dtypes(self) -> List[torch.dtype]: + def get_supported_act_dtypes(self) -> list[torch.dtype]: return [torch.float16, torch.bfloat16] @classmethod @@ -39,11 +39,11 @@ def get_min_capability(cls) -> int: "This function should not be called with TPU Backend") @staticmethod - def get_config_filenames() -> List[str]: + def get_config_filenames() -> list[str]: return [] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "Int8TpuConfig": + def from_config(cls, config: dict[str, Any]) -> "Int8TpuConfig": activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) return cls(activation_scheme=activation_scheme) @@ -61,7 +61,7 @@ def __init__(self, quant_config: Int8TpuConfig): self.quant_config = quant_config def create_weights(self, layer: Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): @@ -76,7 +76,7 @@ def create_weights(self, layer: Module, input_size_per_partition: int, layer.register_parameter("weight", weight) def _quantize_weight( - self, weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + self, weight: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: weight_dtype = weight.dtype weight = weight.cpu().to(torch.float32) n_bit = 8 diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 7d91d2cf1c6e8..2aaee820988a0 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -4,7 +4,7 @@ import functools import json import os -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Optional, Union import torch import triton @@ -35,7 +35,7 @@ def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool: def apply_w8a8_block_fp8_linear( input: torch.Tensor, weight: torch.Tensor, - block_size: 
List[int], + block_size: list[int], weight_scale: torch.Tensor, input_scale: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, @@ -85,7 +85,7 @@ def apply_w8a8_block_fp8_linear( def apply_w8a8_block_fp8_linear_fake( input: torch.Tensor, weight: torch.Tensor, - block_size: List[int], + block_size: list[int], weight_scale: torch.Tensor, input_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -108,8 +108,8 @@ def apply_fp8_linear_generic( input: torch.Tensor, weight: torch.Tensor, weight_scale: torch.Tensor, - input_group_shape: Tuple[int, int], - weight_group_shape: Tuple[int, int], + input_group_shape: tuple[int, int], + weight_group_shape: tuple[int, int], input_scale: Optional[torch.Tensor] = None, # static scale if one cutlass_fp8_supported: bool = CUTLASS_FP8_SUPPORTED, cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, @@ -146,7 +146,7 @@ def is_dim_blocked(dim, shape, group_shape): def input_to_float8( x: torch.Tensor, dtype: Optional[torch.dtype] = None -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: """This function quantizes input values to float8 values " "with tensor-wise quantization.""" if dtype is None: @@ -163,7 +163,7 @@ def input_to_float8( def block_quant_to_tensor_quant( x_q_block: torch.Tensor, x_s: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: """This function converts block-wise quantization to tensor-wise quantization. The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale and the block size. @@ -281,7 +281,7 @@ def per_token_group_quant_fp8( eps: float = 1e-10, dtype: Optional[torch.dtype] = None, column_major_scales: bool = False, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: """Function to perform per-token-group quantization on an input tensor `x`. It converts the tensor values into signed float8 values and returns the quantized tensor along with the scaling factor used for quantization. @@ -292,7 +292,7 @@ def per_token_group_quant_fp8( dtype: The dype of output tensor. Note that only `torch.float8_e4m3fn` is supported for now. Returns: - Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the + tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization. """ if dtype is None: @@ -448,7 +448,7 @@ def _w8a8_block_fp8_matmul( @functools.lru_cache def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int, - block_k: int) -> Optional[Dict[int, Any]]: + block_k: int) -> Optional[dict[int, Any]]: """ Return optimized configurations for the w8a8 block fp8 kernel. 
The return value will be a dictionary that maps an irregular grid of @@ -488,7 +488,7 @@ def w8a8_block_fp8_matmul( B: torch.Tensor, As: torch.Tensor, Bs: torch.Tensor, - block_size: List[int], + block_size: list[int], output_dtype: torch.dtype = torch.float16, ) -> torch.Tensor: """This function performs matrix multiplication with block-wise diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py index 5b0e6299f4739..ff7a8169e6fbc 100644 --- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import re from copy import deepcopy -from typing import Dict, Optional, Union +from typing import Optional, Union import torch @@ -52,7 +52,7 @@ def get_dynamic_override( layer_name: str, key: Optional[str] = None, default_value: Union[int, bool, - None] = None) -> Union[Dict, int, bool, None]: + None] = None) -> Union[dict, int, bool, None]: for pattern, pattern_dict in config.dynamic.items(): # Negative match: matched modules are excluded from quantized init if pattern.startswith("-:"): diff --git a/vllm/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py index cb7d49ed6f1ca..6d840b5686123 100644 --- a/vllm/model_executor/layers/quantization/utils/machete_utils.py +++ b/vllm/model_executor/layers/quantization/utils/machete_utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple +from typing import Optional import torch @@ -10,19 +10,19 @@ MACHETE_PREPACKED_BLOCK_SHAPE = [64, 128] -def query_machete_supported_quant_types(zero_points: bool) -> List[ScalarType]: +def query_machete_supported_quant_types(zero_points: bool) -> list[ScalarType]: if zero_points: return [scalar_types.uint4, scalar_types.uint8] else: return [scalar_types.uint4b8, scalar_types.uint8b128] -def query_machete_supported_act_types(zero_points: bool) -> List[ScalarType]: +def query_machete_supported_act_types(zero_points: bool) -> list[ScalarType]: return [torch.float16, torch.bfloat16] def check_machete_supports_shape(in_features: int, out_featrues: int) \ - -> Tuple[bool, Optional[str]]: + -> tuple[bool, Optional[str]]: if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0: return False, "Input features size must be divisible by "\ f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}" diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 80416c1bc6ebc..57f6137bf4763 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple +from typing import Optional import numpy import torch @@ -53,7 +53,7 @@ def _check_marlin_supported( quant_type: ScalarType, group_size: Optional[int], has_zp: bool, - device_capability: Optional[int] = None) -> Tuple[bool, Optional[str]]: + device_capability: Optional[int] = None) -> tuple[bool, Optional[str]]: if device_capability is None: capability_tuple = current_platform.get_device_capability() @@ -126,7 +126,7 @@ def verify_marlin_supports_shape(output_size_per_partition: int, def check_marlin_supports_shape(output_size_per_partition: int, input_size_per_partition: int, input_size: int, group_size: int) \ - -> Tuple[bool, 
Optional[str]]: + -> tuple[bool, Optional[str]]: try: verify_marlin_supports_shape(output_size_per_partition, input_size_per_partition, input_size, @@ -184,16 +184,16 @@ def marlin_make_empty_zp(device: torch.device) -> torch.Tensor: def marlin_sort_g_idx( - g_idx: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: g_idx_sort_indices = torch.argsort(g_idx).to(torch.int) return g_idx[g_idx_sort_indices], g_idx_sort_indices def get_scale_perms(): - scale_perm: List[int] = [] + scale_perm: list[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single: List[int] = [] + scale_perm_single: list[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py index fb557a31393ca..81112b27f53a8 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Utility functions used for tests and benchmarks""" -from typing import List, Optional +from typing import Optional import numpy as np import torch @@ -64,9 +64,9 @@ def marlin_weights(q_w, size_k, size_n, num_bits, perm): def get_weight_perm(num_bits: int): - perm_list: List[int] = [] + perm_list: list[int] = [] for i in range(32): - perm1: List[int] = [] + perm1: list[int] = [] col = i // 4 for block in [0, 1]: for row in [ diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py index 3654268e27af3..73feb4264a8bb 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py @@ -2,7 +2,6 @@ """Utility functions used for tests and benchmarks""" import random -from typing import List import numpy import torch @@ -373,19 +372,19 @@ def compress_quantized_24_weight(q_24, size_k, size_n, wtype: ScalarType): def get_scale_perms_24(): - scale_perm: List[int] = [] + scale_perm: list[int] = [] for i in range(8): scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]]) - scale_perm_single: List[int] = [] + scale_perm_single: list[int] = [] for i in range(8): scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]]) return scale_perm, scale_perm_single def get_weight_perm_24(num_bits: int): - perm_list: List[int] = [] + perm_list: list[int] = [] for i in range(32): - perm1: List[int] = [] + perm1: list[int] = [] col = i // 4 col_o = col // 2 for block in [0, 1]: diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py index 176b2947ab09e..0123540fc5ddd 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import numpy import torch @@ -34,10 +32,10 @@ def marlin_qqq_weights(q_w, size_k, size_n, num_bits, perm, group_size): def get_qqq_scale_perms(): - scale_perm: List[int] = [] + scale_perm: list[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single: List[int] = [] + scale_perm_single: 
list[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) @@ -46,9 +44,9 @@ def get_qqq_scale_perms(): # NOTE(HandH1998): QQQ employs different perms for per-group and per-channel weight quantization. # noqa: E501 def get_qqq_weight_perm(num_bits: int, quant_type: str): - perm_list: List[int] = [] + perm_list: list[int] = [] for i in range(32): - perm1: List[int] = [] + perm1: list[int] = [] col = i // 4 for block in [0, 1]: for row in [ diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index c7ce3a42c81f9..6ba327f3db7a4 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """This file is used for /tests and /benchmarks""" +from collections.abc import Mapping from types import MappingProxyType -from typing import List, Mapping, Optional, Tuple +from typing import Optional import numpy import torch @@ -15,7 +16,7 @@ # Normalize the group_shape to the full extent for any dims that are -1 -def _normalize_quant_group_shape(x: torch.Tensor, group_shape: Tuple[int, +def _normalize_quant_group_shape(x: torch.Tensor, group_shape: tuple[int, int]): # -1 means full extent return (group_shape[0] if group_shape[0] > 0 else x.shape[-2], @@ -56,9 +57,9 @@ def group_broadcast(t, shape): # (i.e. per-token-per-group) def scaled_quantize( x: torch.Tensor, - group_shape: Tuple[int, int], + group_shape: tuple[int, int], quant_dtype: torch.dtype, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: group_shape = _normalize_quant_group_shape(x, group_shape) assert quant_dtype.is_floating_point, \ "currently `scaled_quantize` only supports floating point dtypes " \ @@ -97,9 +98,9 @@ def scaled_quantize( def scaled_dequantize( x_q: torch.Tensor, x_s: torch.Tensor, - group_shape: Optional[Tuple[int, int]] = None, + group_shape: Optional[tuple[int, int]] = None, out_dtype: torch.dtype = torch.float32, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: if group_shape is not None: group_shape = _normalize_quant_group_shape(x_q, group_shape) @@ -173,8 +174,8 @@ def unpack_quantized_values_into_int32(w_q: torch.Tensor, def is_layer_skipped( prefix: str, - ignored_layers: List[str], - fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) + ignored_layers: list[str], + fused_mapping: Mapping[str, list[str]] = MappingProxyType({}) ) -> bool: # prefix: model.layers.0.self_attn.q_proj # proj_name: q_proj diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 0f93b7f6c45ba..4cbf64a5a06bf 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple, Union +from typing import Optional, Union import torch @@ -68,7 +68,7 @@ def all_close_1d(x: torch.Tensor) -> bool: def convert_to_channelwise( weight_scale: torch.Tensor, - logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: + logical_widths: list[int]) -> tuple[torch.Tensor, torch.Tensor]: # Create channelwise buffer weight_scale_channel = torch.empty((sum(logical_widths), 1), dtype=torch.float32, @@ -86,7 +86,7 @@ def convert_to_channelwise( def 
requantize_with_max_scale( weight: torch.Tensor, weight_scale: torch.Tensor, - logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: + logical_widths: list[int]) -> tuple[torch.Tensor, torch.Tensor]: # Max scale to be used for requanitzation. max_w_scale = weight_scale.max() @@ -250,7 +250,7 @@ def normalize_e4m3fn_to_e4m3fnuz( weight: torch.Tensor, weight_scale: torch.Tensor, input_scale: Optional[torch.Tensor] = None -) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: assert weight.dtype == torch.float8_e4m3fn # The bits pattern 10000000(-128) represents zero in e4m3fn # but NaN in e4m3fnuz. So here we set it to 0. diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 62e27b714866a..78aa82285af2b 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -2,7 +2,7 @@ from functools import cached_property from importlib.util import find_spec -from typing import Dict, Optional, Tuple +from typing import Optional import torch import torch.jit @@ -65,7 +65,7 @@ def forward( bonus_token_ids: torch.Tensor, draft_probs: torch.Tensor, draft_token_ids: torch.Tensor, - seeded_seqs: Optional[Dict[int, torch.Generator]] = None, + seeded_seqs: Optional[dict[int, torch.Generator]] = None, ) -> torch.Tensor: """Sample token ids using rejection sampling. This accepts or rejects tokens proposed by the draft model using the probability of each token @@ -95,7 +95,7 @@ def forward( probabilities. shape = [batch_size, num_speculative_tokens] - seeded_seqs: Dict of batch row index to torch generator, for + seeded_seqs: dict of batch row index to torch generator, for sequences using seeded generation. Returns: @@ -161,8 +161,8 @@ def _batch_modified_rejection_sampling( target_probs: torch.Tensor, # [batch_size, k, vocab_size] draft_probs: torch.Tensor, # [batch_size, k, vocab_size] draft_token_ids: torch.Tensor, # [batch_size, k] - seeded_seqs: Optional[Dict[int, torch.Generator]], - ) -> Tuple[torch.Tensor, torch.Tensor]: + seeded_seqs: Optional[dict[int, torch.Generator]], + ) -> tuple[torch.Tensor, torch.Tensor]: """Perform modified rejection sampling on each sequence. Returns: @@ -194,7 +194,7 @@ def _batch_modified_rejection_sampling( return accepted, recovered_token_ids def _create_uniform_samples(self, - seeded_seqs: Optional[Dict[int, + seeded_seqs: Optional[dict[int, torch.Generator]], batch_size: int, k: int, device: torch.device) -> torch.Tensor: @@ -210,7 +210,7 @@ def _create_uniform_samples(self, a seed. Args: - seeded_seqs : Optional[Dict[int, torch.Generator]] + seeded_seqs : Optional[dict[int, torch.Generator]] A dictionary mapping indices in the batch to `torch.Generator` objects. If `None`, all samples are generated without a seed. @@ -255,7 +255,7 @@ def _get_accepted( target_probs: torch.Tensor, # [batch_size, k, vocab_size] draft_probs: torch.Tensor, # [batch_size, k, vocab_size] draft_token_ids: torch.Tensor, # [batch_size, k] - seeded_seqs: Optional[Dict[int, torch.Generator]], + seeded_seqs: Optional[dict[int, torch.Generator]], ) -> torch.Tensor: r"""Create bool matrix over the proposed draft tokens. 
If True, then a token can be accepted, else it should be @@ -376,7 +376,7 @@ def _multinomial( probs: torch.Tensor, num_samples: int, k: int, - seeded_seqs: Dict[int, torch.Generator], + seeded_seqs: dict[int, torch.Generator], ) -> torch.Tensor: if num_samples > 1: diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index 4c9860006c328..839688e313aae 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -33,7 +33,7 @@ """ import math from functools import partial -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Optional, Union import numpy as np import torch @@ -69,7 +69,7 @@ def get_abs_pos(abs_pos: torch.Tensor, tgt_size: Union[torch.Tensor, # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 def get_1d_sincos_pos_embed_from_grid( embed_dim: int, pos: np.ndarray, - version: Tuple[int, int] = (2, 0)) -> torch.Tensor: + version: tuple[int, int] = (2, 0)) -> torch.Tensor: """ embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) / (H, W) @@ -96,7 +96,7 @@ def get_1d_sincos_pos_embed_from_grid( def get_2d_sincos_pos_embed_from_grid( embed_dim: int, grid: np.ndarray, - version: Tuple[int, int] = (2, 0)) -> torch.Tensor: + version: tuple[int, int] = (2, 0)) -> torch.Tensor: assert embed_dim % 2 == 0 # use half of dimensions to encode grid_h @@ -114,9 +114,9 @@ def get_2d_sincos_pos_embed_from_grid( def get_2d_sincos_pos_embed( embed_dim: int, - grid_size: Union[int, Tuple[int, int]], + grid_size: Union[int, tuple[int, int]], cls_token: bool = False, - version: Tuple[int, int] = (2, 0), + version: tuple[int, int] = (2, 0), ) -> torch.Tensor: """ grid_size: int of the grid height and width diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 64c2dac524f2b..384d09f55e321 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -23,7 +23,7 @@ # limitations under the License. 
"""Rotary Positional Embeddings.""" import math -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Optional, Union import torch import torch.nn as nn @@ -128,7 +128,7 @@ def forward_native( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: """A PyTorch-native implementation of forward().""" if offsets is not None: positions = positions + offsets @@ -158,7 +158,7 @@ def forward_cuda( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: from vllm import _custom_ops as ops self.cos_sin_cache = self.cos_sin_cache.to(query.device, @@ -181,7 +181,7 @@ def forward_xpu( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: from vllm._ipex_ops import ipex_ops as ops self.cos_sin_cache = self.cos_sin_cache.to(positions.device, @@ -204,7 +204,7 @@ def forward_hpu( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: from habana_frameworks.torch.hpex.kernels import ( RotaryPosEmbeddingMode, apply_rotary_pos_emb) if offsets is not None: @@ -260,7 +260,7 @@ def forward_neuron( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: def _apply_rotary_emb_neuron( x: torch.Tensor, @@ -373,23 +373,23 @@ def __init__( max_position_embeddings: int, base: int, is_neox_style: bool, - scaling_factors: Union[List[float], float], + scaling_factors: Union[list[float], float], dtype: torch.dtype, ) -> None: if isinstance(scaling_factors, float): scaling_factors = [scaling_factors] - self.scaling_factors: List[float] = scaling_factors # noqa + self.scaling_factors: list[float] = scaling_factors # noqa super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype) # Lazy initialized. - self._scaling_factor_to_offset: Dict[float, int] + self._scaling_factor_to_offset: dict[float, int] def _compute_cos_sin_cache(self) -> torch.Tensor: inv_freq = self._compute_inv_freq(self.base) - cache_list: List[torch.Tensor] = [] + cache_list: list[torch.Tensor] = [] # offsets to the next cache in a tensor. # Each offset corresponds to the same index in scaling_factors. - offsets: List[int] = [] + offsets: list[int] = [] for scaling_factor in self.scaling_factors: # NOTE(woosuk): self.max_position_embeddings is the original # maximum length before applying the rope scaling. 
@@ -419,7 +419,7 @@ def _compute_cos_sin_cache(self) -> torch.Tensor: return torch.cat(cache_list, dim=0) @property - def scaling_factor_to_offset(self) -> Dict[float, int]: + def scaling_factor_to_offset(self) -> dict[float, int]: return self._scaling_factor_to_offset @@ -479,7 +479,7 @@ def _yarn_find_correction_range( high_rot: int, dim: int, base: float = 10000, - max_position_embeddings: int = 2048) -> Tuple[int, int]: + max_position_embeddings: int = 2048) -> tuple[int, int]: low = math.floor( _yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)) high = math.ceil( @@ -580,8 +580,8 @@ def __init__( base: int, is_neox_style: bool, dtype: torch.dtype, - short_factor: List[float], - long_factor: List[float], + short_factor: list[float], + long_factor: list[float], short_mscale: Optional[float] = None, long_mscale: Optional[float] = None, ): @@ -629,7 +629,7 @@ def __init__( long_short_cache, persistent=False) - def _compute_inv_freq(self, rescale_factors: List[float]) -> torch.Tensor: + def _compute_inv_freq(self, rescale_factors: list[float]) -> torch.Tensor: rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32) inv_freq = 1.0 / (rescale_factors * (self.base**(torch.arange( 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))) @@ -638,7 +638,7 @@ def _compute_inv_freq(self, rescale_factors: List[float]) -> torch.Tensor: def _compute_cos_sin_cache( self, max_position_embeddings: int, - rescale_factors: List[float], + rescale_factors: list[float], mscale: float, ) -> torch.Tensor: inv_freq = self._compute_inv_freq(rescale_factors) @@ -655,7 +655,7 @@ def forward( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: query = query.view(*query.shape[:-1], -1, self.head_size) key = key.view(*key.shape[:-1], -1, self.head_size) @@ -765,7 +765,7 @@ def forward( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: """PyTorch-native implementation equivalent to forward().""" query_rot = query[..., :self.rotary_dim] key_rot = key[..., :self.rotary_dim] @@ -857,7 +857,7 @@ def __init__( base: int, is_neox_style: bool, dtype: torch.dtype, - mrope_section: Optional[List[int]] = None, + mrope_section: Optional[list[int]] = None, ) -> None: # In Qwen2.5-VL, the maximum index value is related to the duration of # the input video. We enlarge max_position_embeddings to 4 times to get @@ -875,7 +875,7 @@ def forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: """PyTorch-native implementation equivalent to forward(). 
Args: @@ -921,14 +921,14 @@ def forward( @staticmethod def get_input_positions( - input_tokens: List[int], + input_tokens: list[int], hf_config: PretrainedConfig, - image_grid_thw: Union[List[List[int]], torch.Tensor], - video_grid_thw: Union[List[List[int]], torch.Tensor], - second_per_grid_ts: Optional[List[float]] = None, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], torch.Tensor], + second_per_grid_ts: Optional[list[float]] = None, context_len: int = 0, seq_len: Optional[int] = None, - ) -> Tuple[List[List[int]], int]: + ) -> tuple[list[list[int]], int]: """Get mrope input positions and delta value.""" llm_positions, mrope_position_delta = \ @@ -946,14 +946,14 @@ def get_input_positions( @staticmethod def get_input_positions_tensor( - input_tokens: List[int], + input_tokens: list[int], hf_config: PretrainedConfig, - image_grid_thw: Union[List[List[int]], torch.Tensor], - video_grid_thw: Union[List[List[int]], torch.Tensor], - second_per_grid_ts: Optional[List[float]] = None, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], torch.Tensor], + second_per_grid_ts: Optional[list[float]] = None, context_len: int = 0, seq_len: Optional[int] = None, - ) -> Tuple[torch.Tensor, int]: + ) -> tuple[torch.Tensor, int]: """Get mrope input positions and delta value.""" image_token_id = hf_config.image_token_id @@ -1052,7 +1052,7 @@ def get_next_input_positions( mrope_position_delta: int, context_len: int, seq_len: int, - ) -> List[List[int]]: + ) -> list[list[int]]: return [ list( range(context_len + mrope_position_delta, @@ -1071,7 +1071,7 @@ def get_next_input_positions_tensor( ).expand(3, -1) -_ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {} +_ROPE_DICT: dict[tuple, RotaryEmbedding] = {} def get_rope( @@ -1080,7 +1080,7 @@ def get_rope( max_position: int, base: int, is_neox_style: bool = True, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, dtype: Optional[torch.dtype] = None, partial_rotary_factor: float = 1.0, ) -> RotaryEmbedding: diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 07ee75593f7b7..b0669c117d416 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -2,10 +2,11 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools import warnings +from collections.abc import Iterator from dataclasses import dataclass from importlib.util import find_spec from math import inf -from typing import Dict, Iterator, List, Optional, Tuple, Union +from typing import Optional, Union import msgspec import torch @@ -42,14 +43,14 @@ def get_sampler() -> torch.nn.Module: # (num_token_ids, num_parent_ids) per sequence group. 
-SampleResultType = List[Tuple[List[int], List[int]]] +SampleResultType = list[tuple[list[int], list[int]]] # Types of temporary data structures used for # computing sample_result -SampleMetadataType = Dict[SamplingType, Tuple[List[int], - List[SequenceGroupToSample]]] -MultinomialSamplesType = Dict[SamplingType, torch.Tensor] -SampleResultsDictType = Dict[int, Tuple[List[int], List[int]]] +SampleMetadataType = dict[SamplingType, tuple[list[int], + list[SequenceGroupToSample]]] +MultinomialSamplesType = dict[SamplingType, torch.Tensor] +SampleResultsDictType = dict[int, tuple[list[int], list[int]]] # Encapsulates temporary data structures for computing @@ -76,7 +77,7 @@ class SampleResultArgsType: MaybeDeferredSampleResultType = Union[SampleResultType, SampleResultArgsType] # Abbreviation of the _sample() return type -SampleReturnType = Tuple[MaybeDeferredSampleResultType, Optional[torch.Tensor]] +SampleReturnType = tuple[MaybeDeferredSampleResultType, Optional[torch.Tensor]] class SamplerOutput( @@ -90,7 +91,7 @@ class SamplerOutput( also has optional fields for device tensors. """ - outputs: List[CompletionSequenceGroupOutput] + outputs: list[CompletionSequenceGroupOutput] # On-device tensor containing probabilities of each token. sampled_token_probs: Optional[torch.Tensor] = None @@ -344,8 +345,8 @@ def _apply_min_tokens_penalty( """Apply min_tokens penalty which sets stop tokens to -inf if min_tokens have not been generated yet """ - # list of indices in logits that will be set to -inf - logits_to_penalize: List[Tuple[int, int]] = [] + # List of indices in logits that will be set to -inf + logits_to_penalize: list[tuple[int, int]] = [] logits_applied = 0 for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids @@ -361,7 +362,7 @@ def _apply_min_tokens_penalty( min_tokens = sampling_params.min_tokens token_ids_to_penalize = sampling_params.all_stop_token_ids if min_tokens > 0 and token_ids_to_penalize: - seqs_to_penalize: List[int] = [] + seqs_to_penalize: list[int] = [] for j, seq_id in enumerate(seq_ids): seq_data = seq_group.seq_data[seq_id] if len(seq_data.output_token_ids_array) < min_tokens: @@ -431,7 +432,7 @@ def _apply_min_p( def _greedy_sample( - selected_seq_groups: List[SequenceGroupToSample], + selected_seq_groups: list[SequenceGroupToSample], samples: torch.Tensor, ) -> SampleResultType: """Run greedy sampling on a given samples. @@ -442,7 +443,7 @@ def _greedy_sample( samples could be smaller than selected_seq_groups if seq_group.do_sample is False. Returns: - Tuple of (next_token_ids, parent_ids). The length of returned list is + tuple of (next_token_ids, parent_ids). The length of returned list is same as the length of selected_seq_groups. If the corresponding seq_group has do_sample=False, tuple contains ([], []) """ @@ -466,7 +467,7 @@ def _greedy_sample( def _random_sample( - selected_seq_groups: List[SequenceGroupToSample], + selected_seq_groups: list[SequenceGroupToSample], random_samples: torch.Tensor, ) -> SampleResultType: """Run random sampling on a given samples. @@ -477,7 +478,7 @@ def _random_sample( length of samples could be smaller than selected_seq_groups if seq_group.do_sample is False. Returns: - Tuple of (next_token_ids, parent_ids). The length of returned list is + tuple of (next_token_ids, parent_ids). The length of returned list is same as the length of selected_seq_groups. 
If the corresponding seq_group has do_sample=False, tuple contains ([], []) """ @@ -517,7 +518,7 @@ def _random_sample( def _multinomial( probs: torch.Tensor, num_samples: int, - seq_groups: Optional[List[SequenceGroupToSample]] = None, + seq_groups: Optional[list[SequenceGroupToSample]] = None, ) -> torch.Tensor: if num_samples > 1: probs = probs.repeat_interleave(num_samples, dim=0) @@ -538,7 +539,7 @@ def _multinomial( def _top_k_top_p_multinomial_with_flashinfer( probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor, - num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]]): + num_samples: int, seq_groups: Optional[list[SequenceGroupToSample]]): max_top_k_round = 32 if num_samples > 1: probs = probs.repeat_interleave(num_samples, dim=0) @@ -643,7 +644,7 @@ def _sample_with_torch( tensors required for Pythonization ''' - categorized_seq_group_ids: Dict[SamplingType, List[int]] = { + categorized_seq_group_ids: dict[SamplingType, list[int]] = { t: [] for t in SamplingType } @@ -807,7 +808,7 @@ def get_logprobs( logprobs: torch.Tensor, sampling_metadata: SamplingMetadata, sample_results: SampleResultType, -) -> Tuple[List[Optional[PromptLogprobs]], List[SampleLogprobs]]: +) -> tuple[list[Optional[PromptLogprobs]], list[SampleLogprobs]]: """Return sample logprobs and prompt logprobs. The logic consists of 3 parts. @@ -836,9 +837,9 @@ def get_logprobs( """ # The index of query token to calculate logprobs. It includes both # prompt and sample logprob indices. - query_indices: List[int] = [] + query_indices: list[int] = [] # The next token ids to get the logprob value from. - next_token_ids: List[int] = [] + next_token_ids: list[int] = [] # The largest requested number of logprobs. We find logprobs as many as the # largest num logprobs in this API. If every logprobs is None, it will be # set to -1. @@ -920,8 +921,8 @@ def get_logprobs( ranks = ranks.to('cpu') # Find prompt/sample logprobs. - prompt_logprobs_per_seq_group: List[Optional[PromptLogprobs]] = [] - sample_logprobs_per_seq_group: List[SampleLogprobs] = [] + prompt_logprobs_per_seq_group: list[Optional[PromptLogprobs]] = [] + sample_logprobs_per_seq_group: list[SampleLogprobs] = [] top_logprob_idx = 0 selected_logprobs_idx = 0 @@ -972,7 +973,7 @@ def _get_prompt_logprob_if_needed( for idx, token_id in enumerate(next_prompt_tokens): # Calculate the prompt logprob of the real prompt tokens. 
# {token_id: (logprob, rank_from_vocab)} - prompt_logprobs_dict: Dict[int, Tuple[float, int]] = { + prompt_logprobs_dict: dict[int, tuple[float, int]] = { token_id: (selected_logprob_items[idx], rank_items[idx]) } @@ -1004,7 +1005,7 @@ def _get_prompt_logprob_if_needed( def _get_sampled_logprob_if_needed( seq_group: SequenceGroupToSample, - sample_result: Tuple[List[int], List[int]], + sample_result: tuple[list[int], list[int]], selected_logprobs: torch.Tensor, ranks: torch.Tensor, top_token_ids: torch.Tensor, @@ -1125,21 +1126,21 @@ def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor, def _build_sampler_output( maybe_deferred_sample_results: MaybeDeferredSampleResultType, sampling_metadata: SamplingMetadata, - prompt_logprobs: Optional[List[Optional[PromptLogprobs]]], - sample_logprobs: Optional[List[SampleLogprobs]], - on_device_tensors: Optional[Tuple[torch.Tensor, torch.Tensor, + prompt_logprobs: Optional[list[Optional[PromptLogprobs]]], + sample_logprobs: Optional[list[SampleLogprobs]], + on_device_tensors: Optional[tuple[torch.Tensor, torch.Tensor, torch.Tensor]], skip_sampler_cpu_output: bool = False, ) -> SamplerOutput: """Construct Python objects with the output of sampling. Args: - on_device_tensors: Tuple containing on-device tensors with the + on_device_tensors: tuple containing on-device tensors with the probabilities used in sampling and the sampled token ids. This allows post-processing without copies to CPU/serialization, e.g. in speculative decoding rejection sampling. """ - sampler_output: List[CompletionSequenceGroupOutput] = [] + sampler_output: list[CompletionSequenceGroupOutput] = [] if skip_sampler_cpu_output: assert isinstance(maybe_deferred_sample_results, SampleResultArgsType) @@ -1161,7 +1162,7 @@ def _build_sampler_output( prompt_logprobs, sample_logprobs): seq_ids = seq_group.seq_ids next_token_ids, parent_ids = sample_result - seq_outputs: List[SequenceOutput] = [] + seq_outputs: list[SequenceOutput] = [] for parent_id, next_token_id, logprobs in zip( parent_ids, next_token_ids, group_sample_logprobs): seq_outputs.append( @@ -1187,7 +1188,7 @@ def _build_sampler_output( deferred_sample_results_args=deferred_sample_results_args) -def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> List[int]: +def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> list[int]: """Get a list of next prompt tokens to compute logprob from a given sequence group. 
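The hunks above and below all apply the same mechanical rewrite: the deprecated typing aliases (List, Dict, Tuple, Set, Type) become the builtin generics that have been subscriptable since Python 3.9 (PEP 585), abstract container types (Iterable, Mapping, Generator, Sequence) are imported from collections.abc, and Optional, Union, Any and Callable remain in typing. A minimal before/after sketch of the pattern, using a hypothetical helper that is not part of this patch:

# Before (Python 3.8-compatible annotations):
#   from typing import Dict, List, Optional, Tuple
#   def split_scales(scales: List[float],
#                    overrides: Optional[Dict[str, float]] = None
#                    ) -> Tuple[List[float], Dict[str, float]]: ...

# After (Python 3.9+ builtin generics, matching these hunks):
from typing import Optional


def split_scales(scales: list[float],
                 overrides: Optional[dict[str, float]] = None
                 ) -> tuple[list[float], dict[str, float]]:
    """Hypothetical helper, shown only to illustrate the annotation change."""
    resolved: dict[str, float] = dict(overrides or {})
    return list(scales), resolved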
diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py index 54fd43fc6592c..969cd59b57ccc 100644 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import abstractmethod -from typing import Dict, Optional, Union +from typing import Optional, Union import torch import torch.jit @@ -253,6 +253,6 @@ def forward( bonus_token_ids: torch.Tensor, draft_probs: torch.Tensor, draft_token_ids: torch.Tensor, - seeded_seqs: Optional[Dict[int, torch.Generator]] = None, + seeded_seqs: Optional[dict[int, torch.Generator]] = None, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index a9ef973917e19..917fb1d7a0f63 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 """Utility methods for model layers.""" -from typing import Tuple import torch @@ -9,7 +8,7 @@ def get_token_bin_counts_and_mask( tokens: torch.Tensor, vocab_size: int, num_seqs: int, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: # Compute the bin counts for the tokens. # vocab_size + 1 for padding. bin_counts = torch.zeros((num_seqs, vocab_size + 1), diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index f65dfc3cb3294..d270f2c9d82dd 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Sequence from dataclasses import dataclass -from typing import List, Optional, Sequence, Tuple +from typing import Optional import torch import torch.nn.functional as F @@ -24,7 +25,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): """Create weights for embedding layer.""" @@ -140,7 +141,7 @@ def get_masked_input_and_mask( input_: torch.Tensor, org_vocab_start_index: int, org_vocab_end_index: int, num_org_vocab_padding: int, added_vocab_start_index: int, - added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: + added_vocab_end_index: int) -> tuple[torch.Tensor, torch.Tensor]: # torch.compile will fuse all of the pointwise ops below # into a single kernel, making it very fast org_vocab_mask = (input_ >= org_vocab_start_index) & ( @@ -297,7 +298,7 @@ def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int, org_vocab_start_index, org_vocab_end_index, added_vocab_start_index, added_vocab_end_index) - def get_sharded_to_full_mapping(self) -> Optional[List[int]]: + def get_sharded_to_full_mapping(self) -> Optional[list[int]]: """Get a mapping that can be used to reindex the gathered logits for sampling. 
@@ -311,9 +312,9 @@ def get_sharded_to_full_mapping(self) -> Optional[List[int]]: if self.tp_size < 2: return None - base_embeddings: List[int] = [] - added_embeddings: List[int] = [] - padding: List[int] = [] + base_embeddings: list[int] = [] + added_embeddings: list[int] = [] + padding: list[int] = [] for tp_rank in range(self.tp_size): shard_indices = self._get_indices(self.num_embeddings_padded, self.org_vocab_size_padded, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 46247eaf2a60c..a63e893ae31d8 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -12,9 +12,9 @@ import os import warnings from abc import ABC, abstractmethod +from collections.abc import Generator, Iterable from contextlib import contextmanager -from typing import (Any, Callable, Dict, Generator, Iterable, List, Optional, - Tuple, cast) +from typing import Any, Callable, Optional, cast import gguf import huggingface_hub @@ -67,7 +67,7 @@ def device_loading_context(module: torch.nn.Module, yield module return - original_device_states: Dict[str, torch.device] = {} + original_device_states: dict[str, torch.device] = {} # Store original device states and move parameters to GPU if they're on CPU for name, p in module.named_parameters(): @@ -253,7 +253,7 @@ def _prepare_weights( revision: Optional[str], fall_back_to_pt: bool, allow_patterns_overrides: Optional[list[str]], - ) -> Tuple[str, List[str], bool]: + ) -> tuple[str, list[str], bool]: """Prepare weights for the model. If the model is not local, it will be downloaded.""" @@ -298,7 +298,7 @@ def _prepare_weights( else: hf_folder = model_name_or_path - hf_weights_files: List[str] = [] + hf_weights_files: list[str] = [] for pattern in allow_patterns: hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) if len(hf_weights_files) > 0: @@ -333,7 +333,7 @@ def _prepare_weights( def _get_weights_iterator( self, source: "Source" - ) -> Generator[Tuple[str, torch.Tensor], None, None]: + ) -> Generator[tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_folder, hf_weights_files, use_safetensors = self._prepare_weights( source.model_or_path, source.revision, source.fall_back_to_pt, @@ -372,7 +372,7 @@ def _get_all_weights( self, model_config: ModelConfig, model: nn.Module, - ) -> Generator[Tuple[str, torch.Tensor], None, None]: + ) -> Generator[tuple[str, torch.Tensor], None, None]: primary_weights = DefaultModelLoader.Source( model_config.model, model_config.revision, @@ -466,7 +466,7 @@ def _verify_config(self, model_config: ModelConfig, self.tensorizer_config.verify_with_parallel_config(parallel_config) def _get_weights_iterator( - self, ) -> Generator[Tuple[str, torch.Tensor], None, None]: + self, ) -> Generator[tuple[str, torch.Tensor], None, None]: tensorizer_args = self.tensorizer_config._construct_tensorizer_args() return tensorizer_weights_iterator(tensorizer_args) @@ -572,12 +572,12 @@ def __init__(self, load_config: LoadConfig): @staticmethod def _filter_subtensors( - tensors: Dict[str, torch.Tensor], ) -> Dict[str, torch.Tensor]: + tensors: dict[str, torch.Tensor], ) -> dict[str, torch.Tensor]: """ Filter out all tensors that share the same memory or a subset of the memory of another tensor. 
""" - same_storage_groups: Dict[Any, List[Tuple[str, torch.Tensor]]] = ( + same_storage_groups: dict[Any, list[tuple[str, torch.Tensor]]] = ( collections.defaultdict(list)) for key, tensor in tensors.items(): if tensor.numel(): @@ -587,7 +587,7 @@ def _filter_subtensors( def get_end_ptr(tensor: torch.Tensor) -> int: return tensor.view(-1)[-1].data_ptr() + tensor.element_size() - result: Dict[str, torch.Tensor] = {} + result: dict[str, torch.Tensor] = {} for group in same_storage_groups.values(): for k, t in group: a, b = t.data_ptr(), get_end_ptr(t) @@ -695,7 +695,7 @@ def save_model( part_idx = 0 total_size = 0 state_dict = ShardedStateLoader._filter_subtensors(model.state_dict()) - state_dict_part: Dict[str, torch.Tensor] = {} + state_dict_part: dict[str, torch.Tensor] = {} for key, tensor in state_dict.items(): param_size = tensor.nelement() * tensor.element_size() if max_size is not None and total_size + param_size > max_size: @@ -726,21 +726,21 @@ def __init__(self, load_config: LoadConfig): super().__init__(load_config) # Save the module names without sharding. - self.unsharded_weights_modules: List[str] = [] + self.unsharded_weights_modules: list[str] = [] # Save the module names that are sharded by column. - self.column_sharded_weights_modules: List[str] = [] + self.column_sharded_weights_modules: list[str] = [] # Store all module names (from transformers) that support # BNB quantization. - self.target_modules: List[str] = [] + self.target_modules: list[str] = [] # mapping weight names from transformers to vllm. self.weight_mapper: Callable = lambda name: name def _get_weight_files( self, model_name_or_path: str, - allowed_patterns: List[str], + allowed_patterns: list[str], revision: Optional[str] = None, - ) -> Tuple[List[str], str]: + ) -> tuple[list[str], str]: """Retrieve weight files. Download the files if necessary. Return the weight files and the file pattern.""" @@ -771,7 +771,7 @@ def _get_weight_files( f"No model weights found in: `{model_name_or_path}`") def _prepare_weights(self, model_name_or_path: str, - revision: Optional[str]) -> Tuple[List[str], bool]: + revision: Optional[str]) -> tuple[list[str], bool]: """Prepare weight files for the model.""" allowed_patterns = ["*.safetensors", "*.bin", "*.pt"] @@ -806,7 +806,7 @@ def _get_quantized_weights_iterator( revision: Optional[str], pre_quant: bool, load_8bit: bool, - ) -> Tuple[Generator[Tuple[str, torch.Tensor], None, None], Dict[str, + ) -> tuple[Generator[tuple[str, torch.Tensor], None, None], dict[str, Any]]: """Get an iterator to the model weights with bitsandbytes quantization, as well as the quantization state dictionary.""" @@ -826,7 +826,7 @@ def _get_quantized_weights_iterator( hf_weights_files, use_safetensors = self._prepare_weights( model_name_or_path, revision) - quant_state_dict: Dict[str, Any] = {} + quant_state_dict: dict[str, Any] = {} if pre_quant: if load_8bit: @@ -908,7 +908,7 @@ def _quantized_4bit_generator(self, hf_weights_files, use_safetensors, # Closure to parse quant_state for each prequant weight def _parse_quant_state(param_name: str, - temp_state_dict: Dict) -> QuantState: + temp_state_dict: dict) -> QuantState: quant_state = {} for k in temp_state_dict: if param_name + "." 
in k: @@ -1066,7 +1066,7 @@ def _load_weights(self, model_config: ModelConfig, # Modules whose weights might have fused on disk # we need their output_sizes to make shard in flight correctly with TP - self.maybe_fused_weights_modules: Dict[str, List[int]] = {} + self.maybe_fused_weights_modules: dict[str, list[int]] = {} self._get_bnb_target_modules(model) for name, module in model.named_modules(): # Some modules like `ReplicatedLinear` should not have their weights @@ -1131,7 +1131,7 @@ def _load_weights(self, model_config: ModelConfig, torch.cuda.empty_cache() param_dict = dict(model.named_parameters()) - stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {} + stacked_quant_state_dict: dict[str, dict[int, Any]] = {} # TODO: Change this lazy import to normal import # after the checks are updated to run on a new version from vllm.model_executor.models.utils import is_pp_missing_parameter @@ -1284,8 +1284,8 @@ def _get_gguf_weights_map(self, model_config: ModelConfig): return gguf_to_hf_name_map def _get_weights_iterator( - self, model_name_or_path: str, gguf_to_hf_name_map: Dict[str, str] - ) -> Generator[Tuple[str, torch.Tensor], None, None]: + self, model_name_or_path: str, gguf_to_hf_name_map: dict[str, str] + ) -> Generator[tuple[str, torch.Tensor], None, None]: return gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map) @@ -1339,7 +1339,7 @@ def __init__(self, load_config: LoadConfig): os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url def _prepare_weights(self, model_name_or_path: str, - revision: Optional[str]) -> List[str]: + revision: Optional[str]) -> list[str]: """Prepare weights for the model. If the model is not local, it will be downloaded.""" @@ -1378,7 +1378,7 @@ def _prepare_weights(self, model_name_or_path: str, def _get_weights_iterator( self, model_or_path: str, - revision: str) -> Generator[Tuple[str, torch.Tensor], None, None]: + revision: str) -> Generator[tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_weights_files = self._prepare_weights(model_or_path, revision) return runai_safetensors_weights_iterator(hf_weights_files) diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index d900fb3a7d397..3d1109f3dfba4 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -3,7 +3,7 @@ import copy import importlib import os -from typing import Dict, List, Optional, Tuple +from typing import Optional import torch import torch.nn as nn @@ -30,7 +30,7 @@ } # Models supported by Neuron. 
-_NEURON_SUPPORTED_MODELS: Dict[str, Tuple[str, str, str]] = { +_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str, str]] = { "LlamaForCausalLM": ("transformers_neuronx.llama.model", "LlamaForSampling", "LlamaForCausalLM"), "MistralForCausalLM": ("transformers_neuronx.mistral.model", @@ -124,7 +124,7 @@ def _get_model_architecture(config: PretrainedConfig) -> str: f"{list(_NEURON_SUPPORTED_MODELS.keys())}") -def _get_buckets(env: str, default_value: List[int]) -> List[int]: +def _get_buckets(env: str, default_value: list[int]) -> list[int]: env_value = os.getenv(env) if env_value is None: return default_value diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index 805f0cfc585e3..c54090c16a51b 100644 --- a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -2,7 +2,7 @@ # ruff: noqa: SIM117 from pathlib import Path -from typing import List, Optional, Tuple +from typing import Optional import openvino as ov import torch @@ -147,7 +147,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[Tuple[ov.Tensor, ov.Tensor]], + kv_caches: list[tuple[ov.Tensor, ov.Tensor]], attn_metadata: OpenVINOAttentionMetadata, ) -> torch.Tensor: flatten_kv_cache = _flattenize_inputs(kv_caches) diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 117251ccf05f1..0ff35b3a6dca1 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -6,9 +6,10 @@ import os import re import time +from collections.abc import Generator from dataclasses import dataclass from functools import partial -from typing import BinaryIO, Generator, Optional, Tuple, Type, Union +from typing import BinaryIO, Optional, Union import torch from torch import nn @@ -67,7 +68,7 @@ class TensorizerConfig: s3_access_key_id: Optional[str] = None s3_secret_access_key: Optional[str] = None s3_endpoint: Optional[str] = None - model_class: Optional[Type[torch.nn.Module]] = None + model_class: Optional[type[torch.nn.Module]] = None hf_config: Optional[PretrainedConfig] = None dtype: Optional[Union[str, torch.dtype]] = None _is_sharded: bool = False @@ -365,7 +366,7 @@ def deserialize(self): def tensorizer_weights_iterator( tensorizer_args: "TensorizerArgs" -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: logger.warning("Deserializing HuggingFace models is not optimized for " "loading on vLLM, as tensorizer is forced to load to CPU. " "Consider deserializing a vLLM model instead for faster " diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 9686231fb4bd1..d76304106eb48 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -2,7 +2,7 @@ """Utilities for selecting and loading models.""" import contextlib from dataclasses import dataclass, field -from typing import Dict, List, Optional, Tuple, Type +from typing import Optional import torch import transformers @@ -84,7 +84,7 @@ def resolve_transformers_fallback(model_config: ModelConfig, def get_model_architecture( - model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: + model_config: ModelConfig) -> tuple[type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) # Special handling for quantized Mixtral. 
@@ -128,8 +128,8 @@ class ParamMapping: It creates a bidirectional mapping between packed parameters and their constituent parts. """ - packed_mapping: Dict[str, List[str]] - inverse_packed_mapping: Dict[str, Tuple[str, + packed_mapping: dict[str, list[str]] + inverse_packed_mapping: dict[str, tuple[str, int]] = field(default_factory=dict) def __post_init__(self): @@ -144,7 +144,7 @@ def __post_init__(self): ) def get_sub_modules(self, - module_name: str) -> Optional[Tuple[str, List[str]]]: + module_name: str) -> Optional[tuple[str, list[str]]]: for key, value in self.packed_mapping.items(): if module_name.endswith(key): return key, value @@ -152,7 +152,7 @@ def get_sub_modules(self, def configure_quant_config(quant_config: QuantizationConfig, - model_class: Type[nn.Module]): + model_class: type[nn.Module]): """ Pass packed_modules_mapping by reference to quant_config so that quant_config can properly match fused modules diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 245c199f75b18..77df3884f27d5 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -8,7 +8,8 @@ import tempfile import time from collections import defaultdict -from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union +from collections.abc import Generator +from typing import Any, Callable, Optional, Union import filelock import gguf @@ -217,9 +218,9 @@ def get_quant_config(model_config: ModelConfig, def download_weights_from_hf( model_name_or_path: str, cache_dir: Optional[str], - allow_patterns: List[str], + allow_patterns: list[str], revision: Optional[str] = None, - ignore_patterns: Optional[Union[str, List[str]]] = None, + ignore_patterns: Optional[Union[str, list[str]]] = None, ) -> str: """Download model weights from Hugging Face Hub. @@ -227,11 +228,11 @@ def download_weights_from_hf( model_name_or_path (str): The model name or path. cache_dir (Optional[str]): The cache directory to store the model weights. If None, will use HF defaults. - allow_patterns (List[str]): The allowed patterns for the + allow_patterns (list[str]): The allowed patterns for the weight files. Files matched by any of the patterns will be downloaded. revision (Optional[str]): The revision of the model. - ignore_patterns (Optional[Union[str, List[str]]]): The patterns to + ignore_patterns (Optional[Union[str, list[str]]]): The patterns to filter out the weight files. Files matched by any of the patterns will be ignored. @@ -311,9 +312,9 @@ def download_safetensors_index_file_from_hf( # Passing both of these to the weight loader functionality breaks. # So, we use the index_file to # look up which safetensors files should be used. -def filter_duplicate_safetensors_files(hf_weights_files: List[str], +def filter_duplicate_safetensors_files(hf_weights_files: list[str], hf_folder: str, - index_file: str) -> List[str]: + index_file: str) -> list[str]: # model.safetensors.index.json is a mapping from keys in the # torch state_dict to safetensors file holding that weight. index_file_name = os.path.join(hf_folder, index_file) @@ -336,7 +337,7 @@ def filter_duplicate_safetensors_files(hf_weights_files: List[str], def filter_files_not_needed_for_inference( - hf_weights_files: List[str]) -> List[str]: + hf_weights_files: list[str]) -> list[str]: """ Exclude files that are not needed for inference. 
@@ -365,8 +366,8 @@ def filter_files_not_needed_for_inference( def np_cache_weights_iterator( model_name_or_path: str, cache_dir: Optional[str], hf_folder: str, - hf_weights_files: List[str] -) -> Generator[Tuple[str, torch.Tensor], None, None]: + hf_weights_files: list[str] +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model np files. Will dump the model weights to numpy files if they are not already dumped. @@ -382,7 +383,7 @@ def np_cache_weights_iterator( # dumping the same model weights to numpy at the same time. with get_lock(model_name_or_path, cache_dir): if not os.path.exists(weight_names_file): - weight_names: List[str] = [] + weight_names: list[str] = [] for bin_file in tqdm( hf_weights_files, desc="Loading np_cache checkpoint shards", @@ -411,8 +412,8 @@ def np_cache_weights_iterator( def safetensors_weights_iterator( - hf_weights_files: List[str] -) -> Generator[Tuple[str, torch.Tensor], None, None]: + hf_weights_files: list[str] +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" enable_tqdm = not torch.distributed.is_initialized( ) or torch.distributed.get_rank() == 0 @@ -429,8 +430,8 @@ def safetensors_weights_iterator( def runai_safetensors_weights_iterator( - hf_weights_files: List[str] -) -> Generator[Tuple[str, torch.Tensor], None, None]: + hf_weights_files: list[str] +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" enable_tqdm = not torch.distributed.is_initialized( ) or torch.distributed.get_rank() == 0 @@ -446,8 +447,8 @@ def runai_safetensors_weights_iterator( def pt_weights_iterator( - hf_weights_files: List[str] -) -> Generator[Tuple[str, torch.Tensor], None, None]: + hf_weights_files: list[str] +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model bin/pt files.""" enable_tqdm = not torch.distributed.is_initialized( ) or torch.distributed.get_rank() == 0 @@ -463,7 +464,7 @@ def pt_weights_iterator( def get_gguf_extra_tensor_names( - gguf_file: str, gguf_to_hf_name_map: Dict[str, str]) -> List[str]: + gguf_file: str, gguf_to_hf_name_map: dict[str, str]) -> list[str]: reader = gguf.GGUFReader(gguf_file) expected_gguf_keys = set(gguf_to_hf_name_map.keys()) exact_gguf_keys = set([tensor.name for tensor in reader.tensors]) @@ -472,8 +473,8 @@ def get_gguf_extra_tensor_names( def gguf_quant_weights_iterator( - gguf_file: str, gguf_to_hf_name_map: Dict[str, str] -) -> Generator[Tuple[str, torch.Tensor], None, None]: + gguf_file: str, gguf_to_hf_name_map: dict[str, str] +) -> Generator[tuple[str, torch.Tensor], None, None]: """ Iterate over the quant weights in the model gguf files and convert them to torch tensors diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index e2d4a8de605b9..ed244541aefb4 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Snowflake Arctic model.""" -from typing import Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -471,8 +472,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> 
set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -480,8 +481,8 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] - mlp_params_mapping: List[Tuple[str, str, int]] = [] - expert_params_mapping: List[Tuple[str, str, int]] = [] + mlp_params_mapping: list[tuple[str, str, int]] = [] + expert_params_mapping: list[tuple[str, str, int]] = [] num_layers = self.config.num_hidden_layers for layer in range(num_layers): @@ -510,7 +511,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("ws", f"experts.{expert_id}.w3.weight", expert_id)) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() logger.info( "It will take ~10 minutes loading from the 16-bit weights. " diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 656e9b037d969..ffa931e054723 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, - Union) +from collections.abc import Iterable, Mapping +from typing import Optional, TypedDict, Union import torch import torch.nn as nn @@ -67,8 +67,8 @@ def __init__( # Identity layer self.post_layernorm = nn.Identity() - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -76,7 +76,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # NOTE: post_layernorm is not used in Aria @@ -323,8 +323,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Adapted from LlamaModel.load_weights with the modification of adding # the expert weights mapping to `stacked_params_mapping` - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -336,7 +336,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("experts.w2_weight", "experts.fc2.weight", 'w2'), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -533,7 +533,7 @@ def __init__( self.sampler = get_sampler() def _validate_image_sizes( - self, images: List[torch.Tensor]) -> List[torch.Tensor]: + self, images: list[torch.Tensor]) -> list[torch.Tensor]: if not all(img.shape == images[0].shape for img in images): raise ValueError("All images must be the same size") return images @@ -583,7 +583,7 @@ def _create_patch_attention_mask( def _process_image_input( self, image_input: AriaImagePixelInputs - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: assert self.vision_tower is not None pixel_values = image_input['pixel_values'] @@ -660,6 +660,6 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, 
weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 4fb68e7b48da9..8ec42c5c62da1 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -20,7 +20,8 @@ # limitations under the License. """Inference-only BaiChuan model compatible with HuggingFace weights.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -231,7 +232,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -392,15 +393,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 69da05884ded8..bf192823b8a20 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Bamba model.""" # Added by the IBM Team, 2024 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -37,7 +38,7 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = tuple[torch.Tensor, torch.Tensor] class BambaMLP(nn.Module): @@ -453,7 +454,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size @@ -501,8 +502,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -513,7 +514,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 93452696dca55..780eb18726613 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -19,7 +19,8 @@ # limitations under the License. 
"""PyTorch BART model.""" import math -from typing import Iterable, Optional, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -881,14 +882,14 @@ def _rename_key(self, key: str): def _rename_stacked_param( self, name: str, - ) -> Tuple[str, Optional[str]]: + ) -> tuple[str, Optional[str]]: for key, mapping in self.stacked_params_mapping.items(): if key in name: name = name.replace(key, mapping["param_name"]) return name, mapping["shard_id"] return name, None - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): model_params_dict = dict(self.model.named_parameters()) top_params_dict = dict(self.named_parameters()) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 4ff69527653d8..ced2ad9e966ff 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -348,8 +349,8 @@ def forward( token_type_ids=token_type_ids) return self.encoder(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "query", "q"), @@ -358,7 +359,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if self.pooler is None and "pooler" in name: continue @@ -423,7 +424,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) @@ -470,7 +471,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._pooler = CrossEncodingPooler(config, self.classifier, self.bert.pooler) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): self_weights = [] diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index bedbdceb7721d..ec836732379d3 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -293,8 +294,8 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return self.post_layernorm(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -302,7 +303,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", 
"v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 23bb3cd07f1d4..f755bcd59c43d 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Iterable, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -184,7 +184,7 @@ def forward( self, hidden_states: torch.Tensor, encoder_hidden_states: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.Tensor]: + ) -> tuple[torch.Tensor]: self_output = self.attention( hidden_states, encoder_hidden_states=encoder_hidden_states, @@ -725,7 +725,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 84b79613abc47..9d5bb7f77d1aa 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -18,7 +18,8 @@ # limitations under the License. """Inference-only BLOOM model compatible with HuggingFace weights.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -332,10 +333,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name == "lm_head.weight": continue diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index e91399b2674df..ac0a1d102a106 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Any, Dict, Iterable, Literal, Mapping, Optional, Set, - Tuple, TypedDict, Union) +from typing import Any, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -233,7 +233,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 4096, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -296,7 +296,7 @@ def __init__( prefix=f"{prefix}.attn") def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: # reshape for layernorm q = q.reshape(-1, self.num_heads, self.head_dim) k = k.reshape(-1, self.num_kv_heads, 
self.head_dim) @@ -371,7 +371,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: if residual is None: residual = hidden_states @@ -442,7 +442,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: residual = hidden_states hidden_states = self.self_attn( @@ -777,7 +777,7 @@ def __init__(self, config: ChameleonVQVAEConfig): def encode( self, pixel_values: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: hidden_states = self.encoder(pixel_values) hidden_states = self.quant_conv(hidden_states) quant, emb_loss, indices = self.quantize(hidden_states) @@ -790,7 +790,7 @@ class ChameleonImageVocabularyMapping: A class for mapping discrete image tokens from VQGAN to BPE tokens. """ - def __init__(self, vocab_map: Dict[str, int]): + def __init__(self, vocab_map: dict[str, int]): self.vocab_map = vocab_map self.image_token_id = vocab_map.get("") @@ -1059,8 +1059,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1070,7 +1070,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 6eca25212ee66..e135dcda91ccc 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -2,7 +2,8 @@ # Adapted from # https://github.com/THUDM/ChatGLM2-6B """Inference-only ChatGLM model compatible with THUDM weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -352,15 +353,15 @@ def forward( return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("linear_proj.merged_proj", "linear_proj.gate_proj", 0), ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -444,7 +445,7 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index dc3aa9cbe86b7..c1226d12ec2a9 100644 --- a/vllm/model_executor/models/clip.py +++ 
b/vllm/model_executor/models/clip.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -371,8 +372,8 @@ def device(self): # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -380,7 +381,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index b0cb4a62333a4..3eb16762bc8fd 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -21,7 +21,8 @@ # This file is based on the LLama model definition file in transformers """PyTorch Cohere model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.utils.checkpoint @@ -254,7 +255,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states, residual = self.input_layernorm(hidden_states, residual) @@ -408,8 +409,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -419,7 +420,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 7830dd4ce2ec3..492768b2bac3c 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -421,14 +422,14 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: expert_params_mapping = [( "w13" if weight_name in ["w1", "v1"] else "w2", f"mlp.{weight_name}", ) for weight_name in ["w1", "v1", "w2"]] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() 
+ loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index b239b642f752b..b008905cd5c91 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -24,7 +24,7 @@ # limitations under the License. """Inference-only DeciLM model compatible with HuggingFace weights.""" -from typing import Iterable, Set, Tuple +from collections.abc import Iterable import torch @@ -59,8 +59,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): delattr(config, "num_key_value_heads_per_layer") super().__init__(vllm_config=vllm_config) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -70,7 +70,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index c04e7a02bae23..2eaa8e39916fc 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Deepseek model.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -184,7 +185,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -438,8 +439,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -450,7 +451,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index cac1b2b3b11cc..74cedd0031973 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -183,8 +184,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ 
("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), @@ -197,7 +198,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 6ff3ef129a74b..3c23a5f9f84ab 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only DeepseekV2/DeepseekV3 model.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -187,7 +188,7 @@ def __init__( q_lora_rank: int, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -339,7 +340,7 @@ def __init__( q_lora_rank: Optional[int], kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -692,8 +693,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), @@ -709,7 +710,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index ea217e2444040..8550ad82640bb 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -3,9 +3,9 @@ # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" import math +from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -50,7 +50,7 @@ class DeepseekVL2ImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, num_channels, height, width)` """ @@ -62,7 +62,7 @@ class DeepseekVL2ImagePixelInputs(TypedDict): class DeepseekVL2VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] 
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. @@ -416,8 +416,8 @@ def sampler(self): return get_sampler() def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.vision_config.image_size expected_dims = (3, h, w) @@ -437,8 +437,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_images_spatial_crop( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: expected_dims = 2 def _validate_shape(d: torch.Tensor): @@ -665,8 +665,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) autoloaded_weights = loader.load_weights(weights, diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index f2a2935e6c694..31821ba36d403 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -165,7 +166,7 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B # due to missing lm_head weights and its config being that of a # Llama model. Here's a compatible version with the same weights: diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 79939f6f40e4e..41b6208116e2c 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -24,7 +24,8 @@ # limitations under the License. 
"""Inference-only Exaone model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -103,7 +104,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -197,7 +198,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -283,7 +284,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -480,8 +481,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -491,7 +492,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".c_fc_1", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py index 310aca999bc2d..00dbbebb120e8 100644 --- a/vllm/model_executor/models/fairseq2_llama.py +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -16,7 +16,7 @@ # limitations under the License. 
"""Llama model for fairseq2 weights.""" -from typing import Iterable, Set, Tuple +from collections.abc import Iterable import torch from torch.nn import Parameter @@ -44,8 +44,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): f"model.{self.tp_rank}.pt", ] - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: # fairseq2's serialization adds a wrapper to usual .pt state_dict's: # { "model_key": my_model_name, "my_model_name": state_dict } # which we first need to unpack @@ -102,7 +102,7 @@ def reshape_fairseq2_weights( name: str, loaded_weight: torch.Tensor, params: dict[str, Parameter], - ) -> Tuple[str, torch.Tensor]: + ) -> tuple[str, torch.Tensor]: """Reshape fairseq2's weights.""" def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor: diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 7154ac2e6a5af..21a1038adc83c 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -20,7 +20,8 @@ """PyTorch Falcon model.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -460,8 +461,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: total_num_heads = self.config.num_attention_heads if self.config.new_decoder_architecture: total_num_kv_heads = self.config.num_kv_heads @@ -471,7 +472,7 @@ def load_weights(self, weights: Iterable[Tuple[str, total_num_kv_heads = total_num_heads num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name == "lm_head.weight" and self.tie_word_embeddings: # Falcon uses tied embeddings except Falcon-11b. 
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index b71d0de8d707d..92fb39c74ffe4 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import math +from collections import OrderedDict +from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, OrderedDict, - Set, Tuple, TypedDict, Union) +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -719,8 +720,8 @@ def sample(self, logits: torch.Tensor, next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -729,7 +730,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -934,8 +935,8 @@ def sampler(self): return get_sampler() def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: size = self.processor_config["size"] h, w = size["height"], size["width"] @@ -956,12 +957,12 @@ def _validate_shape(d: torch.Tensor): return data def _parse_and_validate_image_input(self, **kwargs: object): - pixel_values: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + pixel_values: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "pixel_values", None) - image_embeds: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + image_embeds: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "image_embeds", None) @@ -1111,7 +1112,7 @@ def sample( ) -> SamplerOutput: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 7e4cc6bac5e61..f68c73b624579 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -17,8 +17,8 @@ # limitations under the License. """ PyTorch Fuyu model.""" import math -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict) +from collections.abc import Iterable, Mapping +from typing import Literal, Optional, TypedDict import torch import torch.nn as nn @@ -58,7 +58,7 @@ class FuyuImagePatchInputs(TypedDict): `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)` """ - patches_per_image: List[int] + patches_per_image: list[int] """ List of number of total patches for each image in the batch. This is used to restore the first two dimensions of `flat_data`. 
@@ -390,7 +390,7 @@ def sample( next_tokens = self.language_model.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index da17646c540fd..4af0a7e4c13f7 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -15,8 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Gemma model compatible with HuggingFace weights.""" +from collections.abc import Iterable from functools import cache -from typing import Iterable, Optional, Set, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -232,7 +233,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -383,8 +384,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -394,7 +395,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index cf744fc2b9d12..4c359dddfee80 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -15,7 +15,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -219,7 +220,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -306,8 +307,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -317,7 +318,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -428,8 +429,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 48543c5642ea4..4b95b044f12b5 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -4,7 +4,8 @@ # https://github.com/THUDM/CogAgent """Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace -from typing import Literal, Mapping, Optional, TypedDict, Union +from collections.abc import Mapping +from typing import Literal, Optional, TypedDict, Union import torch from torch import nn diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 776c03f652bdc..9c88675031b73 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -18,7 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-2 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -290,10 +291,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if ".attn.bias" in name or ".attn.masked_bias" in name: # Skip attention mask. diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 43f3d4f6dc9cc..7bb67a6fc94b3 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -19,7 +19,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GPTBigCode model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -313,10 +314,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: continue diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 752aec0b223dd..1fb91b17fce46 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -17,7 +17,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-J model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -283,8 +284,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -294,7 +295,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "attn.bias" in name or "attn.masked_bias" in name: continue diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 4b30c7bb30359..79780a8a02aa9 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -17,7 +17,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-NeoX model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -295,10 +296,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if ("attention.bias" in name or "attention.masked_bias" in name or "rotary_emb.inv_freq" in name): diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 201e15d3a30f8..6f892fc5c4eee 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only IBM Granite model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -98,7 +99,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -231,7 +232,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -427,8 +428,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -438,7 +439,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 9b56874a8add8..a978bdc142549 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GraniteMoe model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -402,8 +403,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: new_weights = {} for n, p in weights: if n.endswith('.block_sparse_moe.input_linear.weight'): diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index f2e82017f6530..846569ac0a284 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Grok1 model.""" -from typing import Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn.functional as F @@ -264,7 +265,7 @@ def forward( kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -339,7 +340,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], inputs_embeds: Optional[torch.Tensor] = None, @@ -431,7 +432,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -458,8 +459,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -477,7 +478,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index bab9c256b9aa0..7fb0c6cc130ec 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -7,7 +7,8 @@ # Copyright (c) 2024 H2O.AI # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -from typing import Mapping, Optional +from collections.abc import Mapping +from typing import Optional import torch from PIL import Image diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index f9c2175b29881..e26c7d2b816a5 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -17,7 +17,8 @@ # limitations under the License. 
"""PyTorch Idefics2 model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -318,8 +319,8 @@ def forward( last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -327,7 +328,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 0a8763cf910ca..a22211c40abbb 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -16,8 +16,8 @@ """Inference-only Idefics3 model compatible with HuggingFace weights.""" import math -from typing import (Dict, Iterable, List, Literal, Mapping, Optional, Set, - Tuple, TypedDict, Union) +from collections.abc import Iterable, Mapping +from typing import Literal, Optional, TypedDict, Union import torch import torch.utils.checkpoint @@ -84,7 +84,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): def get_hf_processor( self, *, - size: Optional[Dict[str, int]] = None, + size: Optional[dict[str, int]] = None, **kwargs: object, ) -> Idefics3Processor: if size is not None: @@ -426,8 +426,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.image_token_id = self.config.image_token_id def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -676,8 +676,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 47bd05f140c81..b7fee00694659 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, - Protocol, Type, Union, overload, runtime_checkable) +from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, + Union, overload, runtime_checkable) import torch from typing_extensions import TypeIs, TypeVar @@ -88,7 +88,7 @@ class _SupportsMultiModalType(Protocol): @overload def supports_multimodal( - model: Type[object]) -> TypeIs[Type[SupportsMultiModal]]: + model: type[object]) -> TypeIs[type[SupportsMultiModal]]: ... 
@@ -98,8 +98,8 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: def supports_multimodal( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]: if isinstance(model, type): return isinstance(model, _SupportsMultiModalType) @@ -120,9 +120,9 @@ class SupportsLoRA(Protocol): """ # The `embedding_module` and `embedding_padding_modules` # are empty by default. - embedding_modules: ClassVar[Dict[str, str]] = {} - embedding_padding_modules: ClassVar[List[str]] = [] - packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {} + embedding_modules: ClassVar[dict[str, str]] = {} + embedding_padding_modules: ClassVar[list[str]] = [] + packed_modules_mapping: ClassVar[dict[str, list[str]]] = {} # We can't use runtime_checkable with ClassVar for issubclass checks @@ -131,13 +131,13 @@ class SupportsLoRA(Protocol): class _SupportsLoRAType(Protocol): supports_lora: Literal[True] - packed_modules_mapping: Dict[str, List[str]] - embedding_modules: Dict[str, str] - embedding_padding_modules: List[str] + packed_modules_mapping: dict[str, list[str]] + embedding_modules: dict[str, str] + embedding_padding_modules: list[str] @overload -def supports_lora(model: Type[object]) -> TypeIs[Type[SupportsLoRA]]: +def supports_lora(model: type[object]) -> TypeIs[type[SupportsLoRA]]: ... @@ -147,8 +147,8 @@ def supports_lora(model: object) -> TypeIs[SupportsLoRA]: def supports_lora( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsLoRA]], TypeIs[SupportsLoRA]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsLoRA]], TypeIs[SupportsLoRA]]: result = _supports_lora(model) if not result: @@ -177,7 +177,7 @@ def supports_lora( return result -def _supports_lora(model: Union[Type[object], object]) -> bool: +def _supports_lora(model: Union[type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsLoRAType) @@ -242,7 +242,7 @@ def forward( @overload -def supports_pp(model: Type[object]) -> TypeIs[Type[SupportsPP]]: +def supports_pp(model: type[object]) -> TypeIs[type[SupportsPP]]: ... @@ -252,8 +252,8 @@ def supports_pp(model: object) -> TypeIs[SupportsPP]: def supports_pp( - model: Union[Type[object], object], -) -> Union[bool, TypeIs[Type[SupportsPP]], TypeIs[SupportsPP]]: + model: Union[type[object], object], +) -> Union[bool, TypeIs[type[SupportsPP]], TypeIs[SupportsPP]]: supports_attributes = _supports_pp_attributes(model) supports_inspect = _supports_pp_inspect(model) @@ -284,14 +284,14 @@ def supports_pp( return supports_attributes and supports_inspect -def _supports_pp_attributes(model: Union[Type[object], object]) -> bool: +def _supports_pp_attributes(model: Union[type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsPPType) return isinstance(model, SupportsPP) -def _supports_pp_inspect(model: Union[Type[object], object]) -> bool: +def _supports_pp_inspect(model: Union[type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): return False @@ -322,13 +322,13 @@ def has_inner_state(model: object) -> TypeIs[HasInnerState]: @overload -def has_inner_state(model: Type[object]) -> TypeIs[Type[HasInnerState]]: +def has_inner_state(model: type[object]) -> TypeIs[type[HasInnerState]]: ... 
def has_inner_state( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[HasInnerState]], TypeIs[HasInnerState]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[HasInnerState]], TypeIs[HasInnerState]]: if isinstance(model, type): return isinstance(model, _HasInnerStateType) @@ -359,13 +359,13 @@ def is_attention_free(model: object) -> TypeIs[IsAttentionFree]: @overload -def is_attention_free(model: Type[object]) -> TypeIs[Type[IsAttentionFree]]: +def is_attention_free(model: type[object]) -> TypeIs[type[IsAttentionFree]]: ... def is_attention_free( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[IsAttentionFree]], TypeIs[IsAttentionFree]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[IsAttentionFree]], TypeIs[IsAttentionFree]]: if isinstance(model, type): return isinstance(model, _IsAttentionFreeType) @@ -396,13 +396,13 @@ def is_hybrid(model: object) -> TypeIs[IsHybrid]: @overload -def is_hybrid(model: Type[object]) -> TypeIs[Type[IsHybrid]]: +def is_hybrid(model: type[object]) -> TypeIs[type[IsHybrid]]: ... def is_hybrid( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[IsHybrid]], TypeIs[IsHybrid]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[IsHybrid]], TypeIs[IsHybrid]]: if isinstance(model, type): return isinstance(model, _IsHybridType) @@ -418,7 +418,7 @@ class SupportsCrossEncoding(Protocol): @overload def supports_cross_encoding( - model: Type[object]) -> TypeIs[Type[SupportsCrossEncoding]]: + model: type[object]) -> TypeIs[type[SupportsCrossEncoding]]: ... @@ -428,8 +428,8 @@ def supports_cross_encoding(model: object) -> TypeIs[SupportsCrossEncoding]: def _supports_cross_encoding( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: if isinstance(model, type): return isinstance(model, SupportsCrossEncoding) @@ -438,15 +438,15 @@ def _supports_cross_encoding( def supports_cross_encoding( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: return is_pooling_model(model) and _supports_cross_encoding(model) class SupportsQuant: """The interface required for all models that support quantization.""" - packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {} + packed_modules_mapping: ClassVar[dict[str, list[str]]] = {} quant_config: Optional[QuantizationConfig] = None def __new__(cls, *args, **kwargs) -> "SupportsQuant": @@ -482,7 +482,7 @@ class SupportsTranscription(Protocol): @overload def supports_transcription( - model: Type[object]) -> TypeIs[Type[SupportsTranscription]]: + model: type[object]) -> TypeIs[type[SupportsTranscription]]: ... 
@@ -492,8 +492,8 @@ def supports_transcription(model: object) -> TypeIs[SupportsTranscription]: def supports_transcription( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsTranscription]], TypeIs[SupportsTranscription]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsTranscription]], TypeIs[SupportsTranscription]]: if isinstance(model, type): return isinstance(model, SupportsTranscription) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 22c9287509ed7..55e31803903bc 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import (TYPE_CHECKING, Optional, Protocol, Type, Union, overload, +from typing import (TYPE_CHECKING, Optional, Protocol, Union, overload, runtime_checkable) import torch @@ -21,7 +21,7 @@ # The type of hidden states # Currently, T = torch.Tensor for all models except for Medusa -# which has T = List[torch.Tensor] +# which has T = list[torch.Tensor] T = TypeVar("T", default=torch.Tensor) T_co = TypeVar("T_co", default=torch.Tensor, covariant=True) @@ -49,12 +49,12 @@ def forward( ... -def _check_vllm_model_init(model: Union[Type[object], object]) -> bool: +def _check_vllm_model_init(model: Union[type[object], object]) -> bool: model_init = model.__init__ return supports_kw(model_init, "vllm_config") -def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: +def _check_vllm_model_forward(model: Union[type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): return False @@ -76,7 +76,7 @@ def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: @overload -def is_vllm_model(model: Type[object]) -> TypeIs[Type[VllmModel]]: +def is_vllm_model(model: type[object]) -> TypeIs[type[VllmModel]]: ... @@ -86,8 +86,8 @@ def is_vllm_model(model: object) -> TypeIs[VllmModel]: def is_vllm_model( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModel]], TypeIs[VllmModel]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[VllmModel]], TypeIs[VllmModel]]: return _check_vllm_model_init(model) and _check_vllm_model_forward(model) @@ -114,7 +114,7 @@ def sample( @overload def is_text_generation_model( - model: Type[object]) -> TypeIs[Type[VllmModelForTextGeneration]]: + model: type[object]) -> TypeIs[type[VllmModelForTextGeneration]]: ... @@ -125,8 +125,8 @@ def is_text_generation_model( def is_text_generation_model( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModelForTextGeneration]], + model: Union[type[object], object], +) -> Union[TypeIs[type[VllmModelForTextGeneration]], TypeIs[VllmModelForTextGeneration]]: if not is_vllm_model(model): return False @@ -151,7 +151,7 @@ def pooler( @overload -def is_pooling_model(model: Type[object]) -> TypeIs[Type[VllmModelForPooling]]: +def is_pooling_model(model: type[object]) -> TypeIs[type[VllmModelForPooling]]: ... 
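# The interfaces.py and interfaces_base.py hunks around this point swap
# typing.Type for the builtin type inside the overloaded TypeIs narrowing
# helpers. A small sketch of that pattern with a hypothetical SupportsPing
# protocol (not one of the patch's interfaces), assuming typing_extensions
# provides TypeIs as it does elsewhere in this diff:
from typing import Protocol, Union, overload, runtime_checkable

from typing_extensions import TypeIs


@runtime_checkable
class SupportsPing(Protocol):

    def ping(self) -> str:
        ...


@overload
def supports_ping(model: type[object]) -> TypeIs[type[SupportsPing]]:
    ...


@overload
def supports_ping(model: object) -> TypeIs[SupportsPing]:
    ...


def supports_ping(
    model: Union[type[object], object],
) -> Union[TypeIs[type[SupportsPing]], TypeIs[SupportsPing]]:
    # Works for both a class object and an instance; the overloads let type
    # checkers narrow to type[SupportsPing] or SupportsPing respectively.
    return isinstance(model, SupportsPing)


class Pinger:

    def ping(self) -> str:
        return "pong"


assert supports_ping(Pinger) and supports_ping(Pinger())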
@@ -161,8 +161,8 @@ def is_pooling_model(model: object) -> TypeIs[VllmModelForPooling]: def is_pooling_model( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]: if not is_vllm_model(model): return False diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 0499f339b2465..bb467f40118ef 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -6,8 +6,9 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- +from collections.abc import Iterable from functools import partial -from typing import Iterable, Optional, Set, Tuple +from typing import Optional import torch import torch.nn as nn @@ -463,10 +464,10 @@ def forward( return encoder_outputs - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 41ca399b9efbc..f31cc2d9ec842 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Iterable from functools import partial -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Type, Union +from typing import Any, Optional, Union import torch from torch import nn @@ -82,7 +83,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -226,7 +227,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -253,7 +254,7 @@ def __init__( *, vllm_config: VllmConfig, prefix: str = "", - layer_type: Type[InternLMDecoderLayer] = InternLMDecoderLayer): + layer_type: type[InternLMDecoderLayer] = InternLMDecoderLayer): super().__init__() config = vllm_config.model_config.hf_config @@ -318,7 +319,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", - model_type: Type[InternLM2Model] = InternLM2Model): + model_type: type[InternLM2Model] = InternLM2Model): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config @@ -372,15 +373,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w1", 0), ("gate_up_proj", "w3", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + 
loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -418,7 +419,7 @@ def __init__( *, vllm_config: VllmConfig, prefix: str = "", - model_type: Type[InternLM2Model] = InternLM2Model, + model_type: type[InternLM2Model] = InternLM2Model, ): super().__init__(vllm_config=vllm_config, prefix=prefix, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 69b0caab8f8ec..6893d0239121d 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -66,7 +66,7 @@ def forward( hidden_states: torch.Tensor, residual: Optional[torch.Tensor], visual_token_mask: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 52ddb279cca39..a47265afe71f2 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -7,9 +7,9 @@ # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from abc import ABC, abstractmethod +from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, TypeVar, Union) +from typing import Literal, Optional, TypedDict, TypeVar, Union import torch import torch.nn as nn @@ -55,7 +55,7 @@ class InternVLImagePixelInputs(TypedDict): Shape: `(batch_size * num_images * (1 + num_patches), num_channels, height, width)` """ - patches_per_image: List[int] + patches_per_image: list[int] """ List of number of total patches for each image in the batch. 
""" @@ -976,7 +976,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 78fe6588eddce..4ccc5c9605eba 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -21,7 +21,8 @@ """Inference-only Jais model compatible with HuggingFace weights.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -343,10 +344,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: # GPT-2 ties the weights of the embedding layer and the final diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 14e56df6cadf8..707e8a3571e0c 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Jamba model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -35,7 +36,7 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = tuple[torch.Tensor, torch.Tensor] class JambaMoE(nn.Module): @@ -437,7 +438,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size conv_state_shape = ( @@ -467,8 +468,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -485,7 +486,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -586,7 +587,7 @@ def pooler( logits = self.score(hidden_states) return self._pooler(logits, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # TODO: The reward weights themselves have float32 accuracy data, we # would like to load them in fp32 to get that extra precision. 
super().load_weights(weights) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index a0aff9e609d9e..9fe844ec36b12 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Type, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -101,7 +102,7 @@ def __init__(self, num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -267,7 +268,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -292,7 +293,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", - layer_type: Type[LlamaDecoderLayer] = LlamaDecoderLayer): + layer_type: type[LlamaDecoderLayer] = LlamaDecoderLayer): super().__init__() config = vllm_config.model_config.hf_config @@ -367,8 +368,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -378,7 +379,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -545,8 +546,8 @@ def sample(self, logits: torch.Tensor, next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -562,7 +563,7 @@ def maybe_remap_mistral( self, name: str, loaded_weight: torch.Tensor, - ) -> Tuple[str, torch.Tensor]: + ) -> tuple[str, torch.Tensor]: def permute(w: torch.Tensor, n_heads: int): attn_in = self.config.head_dim * n_heads diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 72b1591306f26..00cad32caed6b 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 from abc import abstractmethod +from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Final, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, TypedDict, TypeVar, Union) +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union) import torch import torch.nn as nn @@ -47,7 +48,7 @@ class LlavaImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] 
""" Shape: `(batch_size * num_images, num_channels, height, width)` @@ -729,8 +730,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 6a050d7798a20..052f07afd83ba 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 from abc import abstractmethod +from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Final, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, TypedDict, TypeVar, Union) +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union) import torch import torch.nn as nn @@ -32,7 +33,7 @@ class LlavaNextImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -275,8 +276,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -458,7 +459,7 @@ def _process_image_pixels( def _process_image_input( self, image_input: LlavaNextImageInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: if image_input["type"] == "image_embeds": return [image_input["data"]] @@ -587,7 +588,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 807d6977ed409..731c9ba09c883 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import math +from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -36,7 +36,7 @@ class LlavaNextVideoPixelInputs(TypedDict): type: Literal["pixel_values_videos"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size, num_frames, num_channels, height, width)` @@ -320,8 +320,8 @@ def sampler(self): return get_sampler() def _validate_video_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = 
self.config.vision_config.image_size expected_dims = (3, h, w) @@ -346,7 +346,7 @@ def _parse_and_validate_video_input( A legal video input should have the following dimensions: { "pixel_values_videos" : - List[b, Tensor(nb_frames, nb_channels, height, width)] + list[b, Tensor(nb_frames, nb_channels, height, width)] } """ pixel_values = kwargs.pop("pixel_values_videos", None) @@ -485,8 +485,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # This model doesn't support images for now diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index e57eea4286e94..faeeae7d42a6c 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import math +from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Final, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, TypedDict, Union) +from typing import Final, Literal, Optional, Protocol, TypedDict, Union import torch import torch.nn as nn @@ -42,7 +42,7 @@ class LlavaOnevisionVideoPixelInputs(TypedDict): type: Literal["pixel_values_videos"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size, num_videos, num_frames, num_channels, height, width)` @@ -54,7 +54,7 @@ class LlavaOnevisionVideoPixelInputs(TypedDict): class LlavaOnevisionImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -480,8 +480,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_image_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -539,8 +539,8 @@ def _parse_and_validate_image_input( raise AssertionError("This line should be unreachable.") def _validate_video_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -566,7 +566,7 @@ def _parse_and_validate_video_input( A legal video input should have the following dimensions: { "pixel_values_videos" : - List[b, Tensor(nb_frames, nb_channels, height, width)] + list[b, Tensor(nb_frames, nb_channels, height, width)] } """ pixel_values = kwargs.pop("pixel_values_videos", None) @@ -719,7 +719,7 @@ def _merge_image_patch_embeddings(self, def _process_image_pixels( self, inputs: LlavaOnevisionImagePixelInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: assert self.vision_tower is not None pixel_values = inputs["data"] @@ -748,7 +748,7 @@ def _process_image_pixels( def _process_image_input( self, image_input: LlavaOnevisionImageInputs, - ) -> Union[torch.Tensor, 
List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: if image_input["type"] == "image_embeds": return [image_input["data"]] @@ -972,7 +972,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 9f1cd8c29a5a0..28a770abec6ae 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """PyTorch MAMBA model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -30,7 +31,7 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = tuple[torch.Tensor, torch.Tensor] class MambaDecoderLayer(nn.Module): @@ -228,7 +229,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() conv_state_shape = ( self.config.intermediate_size // world_size, @@ -254,10 +255,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 266cdc243ac44..fd37c45f6b872 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """PyTorch MAMBA2 model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -33,7 +34,7 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = tuple[torch.Tensor, torch.Tensor] class Mamba2DecoderLayer(nn.Module): @@ -248,7 +249,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() conv_state_shape, temporal_state_shape = None, None @@ -294,10 +295,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = 
name.replace("A_log", "A") diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py index d529833093cea..ae1fc937b989c 100644 --- a/vllm/model_executor/models/mamba_cache.py +++ b/vllm/model_executor/models/mamba_cache.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List, Tuple import torch @@ -24,8 +23,8 @@ def at_layer_idx(self, layer_idx): class MambaCacheManager: def __init__(self, vllm_config: VllmConfig, dtype: torch.dtype, - num_mamba_layers: int, conv_state_shape: Tuple[int, int], - temporal_state_shape: Tuple[int, int]): + num_mamba_layers: int, conv_state_shape: tuple[int, int], + temporal_state_shape: tuple[int, int]): # Determine max batch size to set size of MambaCache max_batch_size = vllm_config.scheduler_config.max_num_seqs @@ -45,7 +44,7 @@ def __init__(self, vllm_config: VllmConfig, dtype: torch.dtype, # Maps between the request id and a dict that maps between the seq_id # and its index inside the self.mamba_cache - self.mamba_cache_indices_mapping: Dict[str, Dict[int, int]] = {} + self.mamba_cache_indices_mapping: dict[str, dict[int, int]] = {} self.free_cache_indices = list(range(max_batch_size)) def current_run_tensors(self, **kwargs) -> MambaCacheParams: @@ -147,8 +146,8 @@ def _assign_seq_id_to_cache_index(self, cur_rid: str, seq_id: int, return self.mamba_cache_indices_mapping[cur_rid][seq_id] def _prepare_current_run_mamba_cache( - self, request_ids_to_seq_ids: Dict[str, list[int]], - finished_requests_ids: List[str]) -> List[int]: + self, request_ids_to_seq_ids: dict[str, list[int]], + finished_requests_ids: list[str]) -> list[int]: return [ self._assign_seq_id_to_cache_index(req_id, seq_id, finished_requests_ids) @@ -157,7 +156,7 @@ def _prepare_current_run_mamba_cache( ] def _release_finished_requests(self, - finished_seq_groups_req_ids: List[str]): + finished_seq_groups_req_ids: list[str]): for req_id in finished_seq_groups_req_ids: if req_id in self.mamba_cache_indices_mapping: for seq_id in self.mamba_cache_indices_mapping[req_id]: diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index a19d7da5654b6..ac0b281f359c3 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, List, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -96,13 +97,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: # checkpoint file has token_map tensor. 
self.token_map = None - def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]: + def forward(self, hidden_states: torch.Tensor) -> list[torch.Tensor]: return [block(hidden_states) for block in self.blocks] def compute_logits( - self, hidden_states: List[torch.Tensor], - sampling_metadata: SamplingMetadata) -> List[torch.Tensor]: - logits_lst: List[torch.Tensor] = [] + self, hidden_states: list[torch.Tensor], + sampling_metadata: SamplingMetadata) -> list[torch.Tensor]: + logits_lst: list[torch.Tensor] = [] for hs, lm_head in zip(hidden_states, self.lm_heads): _logits = self.logits_processor(lm_head, hs, sampling_metadata) @@ -127,9 +128,9 @@ def compute_logits( def sample( self, - logits: List[torch.Tensor], + logits: list[torch.Tensor], sampling_metadata: SamplingMetadata, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: logits = torch.stack(logits, dim=0).float() logprobs = torch.log_softmax(logits, dim=-1) token_ids = logits.argmax(-1) # support only top-1 for now @@ -144,7 +145,7 @@ def sample( token_prob_list.append(probs[:, seq_group.sample_indices]) token_logprob_list.append(logprobs[:, seq_group.sample_indices]) - outputs: List[Optional[SamplerOutput]] = [] + outputs: list[Optional[SamplerOutput]] = [] for idx in range(len(sampling_metadata.seq_groups)): outputs.append( SamplerOutput( @@ -160,7 +161,7 @@ def generate_proposals( self, previous_hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: return self.sample( logits=self.compute_logits( hidden_states=self.forward(previous_hidden_states), @@ -169,10 +170,10 @@ def generate_proposals( sampling_metadata=sampling_metadata, ) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() weights_map = {} diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 34e1f3927a9af..938b1e40899d8 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -23,7 +23,8 @@ # limitations under the License. 
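# The mamba_cache.py hunk above drops its typing import entirely because
# builtin generics now cover every annotation it needs, e.g.
# dict[str, dict[int, int]] and tuple[int, int]. A self-contained sketch in
# the same style; the buffer names and shapes below are illustrative only:
import torch


def make_state_buffers(
    max_batch_size: int,
    conv_state_shape: tuple[int, int],
    temporal_state_shape: tuple[int, int],
    dtype: torch.dtype = torch.float32,
) -> tuple[torch.Tensor, torch.Tensor]:
    # One conv state and one temporal (SSM) state slot per batch entry.
    conv_state = torch.zeros(max_batch_size, *conv_state_shape, dtype=dtype)
    temporal_state = torch.zeros(max_batch_size,
                                 *temporal_state_shape,
                                 dtype=dtype)
    return conv_state, temporal_state


# request id -> {seq_id -> cache index}, spelled with builtin dict generics.
indices_mapping: dict[str, dict[int, int]] = {}
free_cache_indices: list[int] = list(range(8))

conv, temporal = make_state_buffers(8, (16, 4), (16, 64))
assert conv.shape == (8, 16, 4) and temporal.shape == (8, 16, 64)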
"""Inference-only MiniCPM model compatible with HuggingFace weights.""" import math -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -191,7 +192,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -330,7 +331,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -430,8 +431,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -448,7 +449,7 @@ def load_weights(self, weights: Iterable[Tuple[str, for weight_name in ["w1", "w2", "w3"] ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -593,8 +594,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 1b24c38cef1b0..2a6867d12d993 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -23,7 +23,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only MiniCPM3 model compatible with HuggingFace weights.""" -from typing import Any, Dict, Optional +from typing import Any, Optional import torch from torch import nn @@ -58,7 +58,7 @@ def __init__( q_lora_rank: int, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index e354e5323327f..981ed7ab95c0e 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -22,9 +22,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" +from collections.abc import Iterable, Mapping from functools import partial -from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Set, Tuple, TypedDict, Union) +from typing import Any, Callable, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -80,7 +80,7 @@ class MiniCPMOAudioFeatureInputs(TypedDict): class MiniCPMOAudioEmbeddingInputs(TypedDict): type: Literal["audio_embeds"] - data: List[torch.Tensor] + data: list[torch.Tensor] """ Shape: `(batch_size * num_images * num_slices, hidden_size)` @@ -152,7 +152,7 @@ def _parse_audio_data( class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo): audio_pattern = "()" - def get_supported_mm_modalities(self) -> List[str]: + def get_supported_mm_modalities(self) -> list[str]: return ["image", "video", "audio"] def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: @@ -261,7 +261,7 @@ def get_audio_prompt_texts(self, return self.info.get_hf_processor().get_audio_placeholder( audio_lens, chunk_input, chunk_length) - def get_special_tokens(self) -> Dict[str, torch.Tensor]: + def get_special_tokens(self) -> dict[str, torch.Tensor]: tokenizer = self.info.get_tokenizer() special_tokens = super().get_special_tokens() if hasattr(tokenizer, "audio_start_id"): @@ -272,7 +272,7 @@ def get_special_tokens(self) -> Dict[str, torch.Tensor]: return special_tokens def process_audios(self, mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + mm_kwargs: Mapping[str, object]) -> dict[str, object]: audios = mm_data.pop("audios", []) audio_embeds = mm_data.pop("audio_embeds", []) if isinstance(audios, (list, torch.Tensor)) and len(audios) > 0: @@ -343,13 +343,13 @@ def get_modality_num_counter(self, modality: str) -> str: return "audio_lens" return super().get_modality_num_counter(modality) - def get_num_slices_by_modality(self, inputs: Dict[str, object], + def get_num_slices_by_modality(self, inputs: dict[str, object], modality: str, index: int) -> int: if modality == "audio": return inputs["audio"]["audio_num_segments"][index] return super().get_num_slices_by_modality(inputs, modality, index) - def get_prompt_texts_by_modality(self, inputs: Dict[str, object], + def get_prompt_texts_by_modality(self, inputs: dict[str, object], modality: str, index: int) -> str: if modality == "audio": return self.get_audio_prompt_texts( @@ -359,7 +359,7 @@ def get_prompt_texts_by_modality(self, inputs: Dict[str, object], def _get_prompt_replacements( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs) -> List[PromptReplacement]: + out_mm_kwargs: MultiModalKwargs) -> list[PromptReplacement]: placeholder = { "image": self.info.image_pattern, "video": self.info.video_pattern, @@ -579,8 +579,8 @@ def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""): self.audio_encoder_layer = -1 return model - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["tts"]) return loader.load_weights(weights) @@ -742,7 +742,7 @@ def _get_audio_bounds(self, input_ids: torch.Tensor, def _parse_and_validate_audio_inputs( self, input_ids: torch.Tensor, - **kwargs: object) -> Tuple[MiniCPMOAudioInputs]: + **kwargs: object) -> tuple[MiniCPMOAudioInputs]: audio_features = 
kwargs.pop("audio_features", []) audio_feature_lens = kwargs.pop("audio_feature_lens", []) audio_embeds = kwargs.pop("audio_embeds", None) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 2699958331f3d..154da20d33e98 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -25,9 +25,9 @@ import math import re from collections import Counter +from collections.abc import Iterable, Mapping from functools import cached_property, partial -from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Set, Tuple, TypedDict, Union) +from typing import Any, Callable, Literal, Optional, TypedDict, Union import numpy as np import torch @@ -72,7 +72,7 @@ class MiniCPMVImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: List[torch.Tensor] + data: list[torch.Tensor] """ Shape: `(batch_size * num_images * num_slices, num_channels, height, width)` @@ -128,7 +128,7 @@ def __init__(self, num_heads: int, kv_dim: Optional[int] = None, norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, - max_size: Tuple[int, int] = (70, 70), + max_size: tuple[int, int] = (70, 70), quant_config: Optional[QuantizationConfig] = None, prefix: str = "") -> None: super().__init__(num_queries, @@ -143,7 +143,7 @@ def __init__(self, self._set_2d_pos_cache(self.max_size) def _set_2d_pos_cache(self, - max_size: Tuple[int, int], + max_size: tuple[int, int], device: torch.types.Device = "cpu") -> None: pos_embed_arr = get_2d_sincos_pos_embed(self.embed_dim, max_size, @@ -213,7 +213,7 @@ def forward(self, x: torch.Tensor, return x -def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]: +def get_version_by_config(config: PretrainedConfig) -> tuple[int, ...]: version_float = getattr(config, "version", None) # The old configs do not include version number @@ -352,7 +352,7 @@ def get_image_processor(self): def get_model_version(self): return get_version_by_config(self.get_hf_config()) - def get_supported_mm_modalities(self) -> List[str]: + def get_supported_mm_modalities(self) -> list[str]: if self.get_model_version() == (2, 6): return ["image", "video"] else: @@ -394,7 +394,7 @@ def get_max_slice_num(self) -> int: return max_slice_num def get_sliced_grid(self, image_size: ImageSize, - max_slice_num: int) -> Tuple[int, int]: + max_slice_num: int) -> tuple[int, int]: if self.get_model_version() == (2, 6): slice_grid = self.get_image_processor().get_sliced_grid( image_size, max_slice_num) @@ -536,7 +536,7 @@ def get_video_prompt_texts(self, image_size: ImageSize, use_image_id=False) for image_idx in range(num_frames)) return prompt_texts - def get_special_tokens(self) -> Dict[str, torch.Tensor]: + def get_special_tokens(self) -> dict[str, torch.Tensor]: tokenizer = self.info.get_tokenizer() special_tokens = { "im_start_id": torch.tensor(tokenizer.im_start_id), @@ -556,7 +556,7 @@ def repack_processor_outputs(outputs: Any) -> BatchFeature: return outputs def process_images(self, mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + mm_kwargs: Mapping[str, object]) -> dict[str, object]: images = mm_data.pop("images", []) image_embeds = mm_data.pop("image_embeds", []) if isinstance(images, Image.Image): @@ -579,7 +579,7 @@ def process_images(self, mm_data: Mapping[str, object], return image_outputs def process_videos(self, mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + mm_kwargs: Mapping[str, object]) -> dict[str, 
object]: videos = mm_data.pop("videos", []) video_embeds = mm_data.pop("video_embeds", []) if len(videos) > 0 and isinstance(videos[0], Image.Image): @@ -639,7 +639,7 @@ def process_mm_inputs(self, mm_data, mm_kwargs) -> object: "video": self.process_videos(mm_data, mm_kwargs) } - def get_input_modalities(self, mm_data) -> List[str]: + def get_input_modalities(self, mm_data) -> list[str]: supported_mm_modalities = self.info.get_supported_mm_modalities() input_modalities = [] for modality in supported_mm_modalities: @@ -653,7 +653,7 @@ def get_modality_num_counter(self, modality: str) -> str: elif modality == "video": return "video_image_sizes" - def get_num_slices_by_modality(self, inputs: Dict[str, object], + def get_num_slices_by_modality(self, inputs: dict[str, object], modality: str, index: int) -> int: if modality == "image": return self.info.get_image_slice_nums( @@ -667,8 +667,8 @@ def get_num_slices_by_modality(self, inputs: Dict[str, object], else: raise ValueError(f"Unexpected modality: {modality}") - def check_mm_inputs(self, inputs: Dict[str, object], - matches: List[str]) -> None: + def check_mm_inputs(self, inputs: dict[str, object], + matches: list[str]) -> None: counts = Counter(matches) for modality, count in counts.items(): if modality not in inputs or not inputs[modality]: @@ -680,7 +680,7 @@ def check_mm_inputs(self, inputs: Dict[str, object], f"{modality} inputs while you pass " f"{len(inputs[modality][counter_key])}") - def get_prompt_texts_by_modality(self, inputs: Dict[str, object], + def get_prompt_texts_by_modality(self, inputs: dict[str, object], modality: str, index: int) -> str: if modality == "image": return self.get_image_prompt_texts( @@ -743,7 +743,7 @@ def _hf_processor_applies_repl( def _get_prompt_replacements( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs) -> List[PromptReplacement]: + out_mm_kwargs: MultiModalKwargs) -> list[PromptReplacement]: placeholder = { "image": self.info.image_pattern, "video": self.info.video_pattern, @@ -775,7 +775,7 @@ def _get_mm_fields_config( def apply( self, - prompt: Union[str, List[int]], + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputs: @@ -851,7 +851,7 @@ def get_embedding_with_vision( self, input_ids: torch.Tensor, image_inputs: Optional[MiniCPMVImageInputs], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: vlm_embedding: torch.Tensor = self.llm.get_input_embeddings(input_ids) if image_inputs is None: # No image @@ -977,8 +977,8 @@ def _parse_and_validate_image_inputs( f"{len(modality_mm_data['pixel_values'])} vs. 
" f"{len(modality_mm_data['tgt_sizes'])}") - pixel_values_flat: List[torch.Tensor] = [] - tgt_sizes_flat: List[torch.Tensor] = [] + pixel_values_flat: list[torch.Tensor] = [] + tgt_sizes_flat: list[torch.Tensor] = [] for b in range(batch_size): mm_counts = {"image": 0, "video": 0} if self.version == (2, 6) \ else {"image": 0} @@ -1068,8 +1068,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) @@ -1105,7 +1105,7 @@ def init_resampler(self, def get_vision_embedding( self, - pixel_values: List[torch.Tensor], + pixel_values: list[torch.Tensor], patch_attn_mask: Optional[torch.Tensor] = None, tgt_sizes: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -1185,7 +1185,7 @@ def init_resampler(self, def get_vision_embedding( self, - pixel_values: List[torch.Tensor], + pixel_values: list[torch.Tensor], patch_attn_mask: Optional[torch.Tensor] = None, tgt_sizes: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -1268,7 +1268,7 @@ def init_resampler(self, def get_vision_embedding( self, - pixel_values: List[torch.Tensor], + pixel_values: list[torch.Tensor], patch_attn_mask: Optional[torch.Tensor] = None, tgt_sizes: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -1363,7 +1363,7 @@ def init_resampler(self, def get_vision_embedding( self, - pixel_values: List[torch.Tensor], + pixel_values: list[torch.Tensor], patch_attn_mask: Optional[torch.Tensor] = None, tgt_sizes: Optional[torch.Tensor] = None, ) -> torch.Tensor: diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c8dea557e5715..ef8c3e60357ad 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Mixtral model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -390,8 +391,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -408,7 +409,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 21b52d9f54c76..ac441cc79bac8 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Mixtral model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import numpy as np import torch @@ -403,8 +404,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -413,7 +414,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 459928fe3fb0e..491abcfb1cbb5 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -15,8 +15,8 @@ # limitations under the License. """PyTorch Mllama model.""" import math -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from collections.abc import Iterable, Mapping +from typing import Literal, Optional, TypedDict, Union import numpy as np import torch @@ -318,8 +318,8 @@ def __init__( self, in_channels: int, out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]], + kernel_size: Union[int, tuple[int, int]], + stride: Union[int, tuple[int, int]], bias: bool = False, ) -> None: super().__init__() @@ -551,7 +551,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> Union[tuple, BaseModelOutput]: encoder_states = () for i, encoder_layer in enumerate(self.layers): @@ -824,7 +824,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor], - kv_range_for_decode: Optional[List[Tuple[int, int]]], + kv_range_for_decode: Optional[list[tuple[int, int]]], cross_attention_states: Optional[torch.Tensor], ) -> torch.Tensor: qkv_dec, _ = self.qkv_proj(hidden_states) @@ -860,7 +860,7 @@ def _attention_with_mask( k: torch.Tensor, v: torch.Tensor, attention_mask: torch.Tensor, - kv_range_for_decode: List[Tuple[int, int]], + kv_range_for_decode: list[tuple[int, int]], ) -> torch.Tensor: kv_cache = self.attn.kv_cache[self.pipeline_parallel_rank] attn_metadata: AttentionMetadata = get_forward_context().attn_metadata @@ -973,7 +973,7 @@ def forward( hidden_states: torch.Tensor, cross_attention_states: torch.Tensor, cross_attention_mask: torch.Tensor, - kv_range_for_decode: Optional[List[Tuple[int, int]]], + kv_range_for_decode: Optional[list[tuple[int, int]]], full_text_row_masked_out_mask: torch.Tensor, ) -> torch.Tensor: residual = hidden_states @@ -1044,8 +1044,8 @@ def forward( positions: Optional[torch.LongTensor], cross_attention_states: Optional[torch.LongTensor], cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[List[Tuple[int, int]]], - full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, + kv_range_for_decode: Optional[list[tuple[int, int]]], + full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, torch.Tensor]], skip_cross_attention: bool, ) -> torch.Tensor: @@ -1108,8 +1108,8 @@ def forward( positions: Optional[torch.LongTensor], cross_attention_states: Optional[torch.LongTensor], 
cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[List[Tuple[int, int]]], - full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, + kv_range_for_decode: Optional[list[tuple[int, int]]], + full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, torch.Tensor]], skip_cross_attention: bool, ) -> torch.Tensor: @@ -1188,26 +1188,26 @@ def sample( def _parse_and_validate_image_input(self, **kwargs: object): # tensor with the same shape will be batched together by # MultiModalKwargs.batch, so pixel_values here can be: - # - List[List[torch.Tensor]]: + # - list[list[torch.Tensor]]: # with shape (num_tiles, 3, image_res, image_res) - # - List[torch.Tensor]: + # - list[torch.Tensor]: # with shape (num_image, num_tiles, 3, image_res, image_res) # - torch.Tensor: # with shape (bs, num_image, num_tiles, 3, image_res, image_res) - pixel_values: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + pixel_values: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "pixel_values", None) - image_embeds: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + image_embeds: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "image_embeds", None) - aspect_ratio_ids: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + aspect_ratio_ids: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "aspect_ratio_ids", None) - aspect_ratio_mask: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + aspect_ratio_mask: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "aspect_ratio_mask", None) @@ -1236,7 +1236,7 @@ def _parse_and_validate_image_input(self, **kwargs: object): def flat_encoder_result(self, cross_attention_states: torch.Tensor, attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: List[int]): + actual_encoder_seq_lens: list[int]): cross_attention_states_flat = torch.zeros( sum(actual_encoder_seq_lens), @@ -1257,8 +1257,8 @@ def get_cross_attention_states( self, image_inputs: MllamaImagePixelInputs, attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: List[int], - ) -> Tuple[torch.Tensor]: + actual_encoder_seq_lens: list[int], + ) -> tuple[torch.Tensor]: # NOTE: llama's reference implementation runs vision model on CPU pixel_values = image_inputs['data'] aspect_ratio_ids = image_inputs['aspect_ratio_ids'] @@ -1282,10 +1282,10 @@ def get_cross_attention_mask( self, input_ids: torch.Tensor, attn_metadata: AttentionMetadata, - num_tiles: List[List[int]], + num_tiles: list[list[int]], num_tokens_per_tile: int, dtype: torch.dtype, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: token_ids = input_ids.tolist() start = 0 batch_token_ids = [] @@ -1337,7 +1337,7 @@ def forward( input_ids: torch.Tensor, positions: torch.Tensor, **kwargs: object, - ) -> Union[Tuple, CausalLMOutputWithPast]: + ) -> Union[tuple, CausalLMOutputWithPast]: attn_metadata = get_forward_context().attn_metadata if attn_metadata.num_prefill_tokens > 0 and \ attn_metadata.num_decode_tokens > 0: @@ -1397,8 +1397,8 @@ def forward( return outputs - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1408,7 +1408,7 @@ def load_weights(self, weights: 
Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - updated_params: Set[str] = set() + updated_params: set[str] = set() for name, loaded_weight in weights: if 'patch_embedding.weight' in name: name = name.replace('patch_embedding.weight', @@ -1450,7 +1450,7 @@ def load_weights(self, weights: Iterable[Tuple[str, return updated_params -def skip_attention_mask(sparse_mask: List[List[int]]) -> bool: +def skip_attention_mask(sparse_mask: list[list[int]]) -> bool: for mask in sparse_mask: # Skip text-only samples. if len(mask) == 0: @@ -1468,10 +1468,10 @@ def skip_attention_mask(sparse_mask: List[List[int]]) -> bool: def convert_sparse_cross_attention_mask_to_dense( - sparse_mask: List[List[List[int]]], - num_tiles: List[List[int]], - lengths: List[int], -) -> Tuple[np.ndarray, List[Tuple[int, int]]]: + sparse_mask: list[list[list[int]]], + num_tiles: list[list[int]], + lengths: list[int], +) -> tuple[np.ndarray, list[tuple[int, int]]]: total_length = sum(lengths) total_tiles = sum([sum(tiles) for tiles in num_tiles]) dense_mask = np.zeros(shape=(total_length, total_tiles), dtype=np.int64) diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 2920427f94f7b..a7d7aa7d44ef2 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math -from typing import Iterable, List, Set, Tuple +from collections.abc import Iterable import torch import torch.nn as nn @@ -148,7 +148,7 @@ def generate_proposals( previous_hidden_states: torch.Tensor, num_predict_tokens: int, sampling_metadata: SamplingMetadata, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: if num_predict_tokens > self.max_speculative_tokens: raise ValueError(f"Max speculative tokens for model is " f"{self.max_speculative_tokens}, but " @@ -190,10 +190,10 @@ def generate_proposals( return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: name = name.replace("speculator.", "") param = params_dict.get(name) diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index 23814e6322d2e..25e6f594069ef 100644 --- a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -4,7 +4,7 @@ # https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py from dataclasses import dataclass, field -from typing import List, Union +from typing import Union @dataclass @@ -46,17 +46,17 @@ class ModelKeys: @dataclass class MultiModelKeys(ModelKeys): - language_model: List[str] = field(default_factory=list) - connector: List[str] = field(default_factory=list) + language_model: list[str] = field(default_factory=list) + connector: list[str] = field(default_factory=list) # vision tower and audio tower - tower_model: List[str] = field(default_factory=list) - generator: List[str] = field(default_factory=list) + tower_model: list[str] = field(default_factory=list) + generator: list[str] = field(default_factory=list) @staticmethod - def from_string_field(language_model: Union[str, List[str]] = None, - connector: Union[str, List[str]] = None, - tower_model: Union[str, 
List[str]] = None, - generator: Union[str, List[str]] = None, + def from_string_field(language_model: Union[str, list[str]] = None, + connector: Union[str, list[str]] = None, + tower_model: Union[str, list[str]] = None, + generator: Union[str, list[str]] = None, **kwargs) -> 'MultiModelKeys': def to_list(value): diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index cc4d38d8740b2..ec0a239abd662 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import math +from collections.abc import Iterable, Mapping from dataclasses import dataclass from functools import cached_property, partial -from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, - Union, cast) +from typing import Optional, TypedDict, Union, cast import numpy as np import torch @@ -71,13 +71,13 @@ class MolmoImageInputs(TypedDict): - images: Union[torch.Tensor, List[torch.Tensor]] + images: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size, num_crops, num_patch, patch_dim)`""" - image_masks: Optional[Union[torch.Tensor, List[torch.Tensor]]] + image_masks: Optional[Union[torch.Tensor, list[torch.Tensor]]] """Shape: `(batch_size, num_crops, num_patch)`""" - feat_is_patch: Union[torch.Tensor, List[torch.Tensor]] + feat_is_patch: Union[torch.Tensor, list[torch.Tensor]] """ A boolean mask indicating which image features correspond to patch tokens. @@ -85,7 +85,7 @@ class MolmoImageInputs(TypedDict): Shape: `(batch_size, num_crops, num_patch)` """ - embed_is_patch: Union[torch.Tensor, List[torch.Tensor]] + embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] """ A boolean mask indicating which image embeddings correspond to patch tokens. 
@@ -99,7 +99,7 @@ class MolmoImageInputs(TypedDict): @dataclass class VisionBackboneConfig: - image_default_input_size: Tuple[int, int] = (336, 336) + image_default_input_size: tuple[int, int] = (336, 336) image_patch_size: int = 14 image_pos_patch_size: int = 14 image_emb_dim: int = 1024 @@ -276,7 +276,7 @@ def __init__( for _ in range(config.image_num_layers) ]) - def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + def forward(self, x: torch.Tensor) -> list[torch.Tensor]: hidden_states = [] for r in self.resblocks: x = r(x) @@ -343,7 +343,7 @@ def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor: def forward(self, x: torch.Tensor, - patch_num: Optional[int] = None) -> List[torch.Tensor]: + patch_num: Optional[int] = None) -> list[torch.Tensor]: """ : param x: (batch_size, num_patch, n_pixels) """ @@ -443,7 +443,7 @@ def __init__( ) def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: if self.tp_size > 1: q = tensor_model_parallel_all_gather(q.contiguous()) k = tensor_model_parallel_all_gather(k.contiguous()) @@ -579,7 +579,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: # Self Attention if residual is None: residual = hidden_states @@ -605,7 +605,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: # Self Attention residual = hidden_states hidden_states = self.self_attn( @@ -697,7 +697,7 @@ def encode_image(self, images: torch.Tensor) -> torch.Tensor: def forward( self, images: torch.Tensor, image_masks: torch.Tensor - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: # image_features: (batch_size, num_crops(=num_image), num_patch, nximage_emb_dim) # noqa: E501 batch_size, num_image = images.shape[:2] @@ -748,15 +748,15 @@ def forward( # image_features: (batch_size, num_image, num_patch, d_model) return image_features - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("merged_linear", "gate_proj", 0), ("merged_linear", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -863,10 +863,10 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name.endswith(".bias") and name not in params_dict: @@ -1508,7 +1508,7 @@ def _parse_and_validate_image_input( def _process_image_input( self, image_input: MolmoImageInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: if 
isinstance(image_input["images"], list): # Call the vision backbone on the whole batch at once images_flat = flatten_bn(image_input["images"], concat=True) @@ -1665,7 +1665,7 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) weights = _get_weights_with_merged_embedding(weights) @@ -1683,8 +1683,8 @@ def get_mm_mapping(self) -> MultiModelKeys: def _get_weights_with_merged_embedding( - weights: Iterable[Tuple[str, torch.Tensor]] -) -> Iterable[Tuple[str, torch.Tensor]]: + weights: Iterable[tuple[str, torch.Tensor]] +) -> Iterable[tuple[str, torch.Tensor]]: embedding_weights = {} for name, weight in weights: if "wte.embedding" in name: diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index d716818f31c03..aa63897d22c6b 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -2,7 +2,8 @@ # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -316,10 +317,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 3b86b91465ca8..b882392720813 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Nemotron model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -70,7 +71,7 @@ def _cast_if_autocast_enabled(*args): class NemotronLayerNorm1P(nn.LayerNorm): def __init__(self, - normalized_shape: Union[int, List[int], torch.Size], + normalized_shape: Union[int, list[int], torch.Size], eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True, @@ -134,7 +135,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -268,7 +269,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -453,8 +454,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -462,7 +463,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".qkv_proj", ".v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 5de8eeb3fffed..dafb0e45ea921 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -6,7 +6,8 @@ # Copyright (c) 2024 NVIDIA # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -from typing import Mapping, Optional +from collections.abc import Mapping +from typing import Optional import torch import torch.nn as nn diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 4a341c97d6cdf..e77f344097bd8 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OLMo model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -210,7 +211,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: # Attention block. 
residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -348,8 +349,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -359,7 +360,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 54cc851de9347..fe8bb677aea60 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -23,8 +23,9 @@ # limitations under the License. """Inference-only OLMo2 model compatible with HuggingFace weights.""" +from collections.abc import Iterable from functools import partial -from typing import Iterable, Optional, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -136,7 +137,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: if self.tp_size > 1: q = tensor_model_parallel_all_gather(q.contiguous()) k = tensor_model_parallel_all_gather(k.contiguous()) @@ -371,7 +372,7 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index e27ff5deace29..b53bed453d0eb 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only OLMoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -102,7 +103,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 4096, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -357,8 +358,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -377,7 +378,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index e4775478a54d1..921039d2b5222 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -18,7 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OPT model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -371,8 +372,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -380,7 +381,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name and self.config.tie_word_embeddings: continue diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 6668ede91eecb..a8155c63c0ef3 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -5,7 +5,8 @@ # Copyright (c) OrionStar Inc. 
# LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE """Inference-only Orion-14B model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -73,7 +74,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -187,7 +188,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -313,8 +314,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -324,7 +325,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 02d1861b8027c..caa895a5adf68 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import (Iterable, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from collections.abc import Iterable, Mapping +from typing import Literal, Optional, TypedDict, Union import torch from torch import nn @@ -323,7 +323,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index db8d170a8c91b..6906720e623a3 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only persimmon model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -313,10 +314,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 6ee80210c2b4d..cf53dc39c58f2 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -36,7 +36,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """Inference-only Phi-1.5 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -315,8 +316,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -324,7 +325,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v") ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 33984f54ae271..e092fb06eb120 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -231,8 +232,8 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], + Optional[tuple[torch.Tensor]]]: qkv, _ = self.query_key_value(hidden_states) qkv = qkv.view(qkv.shape[:-1] + @@ -446,11 +447,11 @@ def sample( sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 0f45f131065a8..389ae88bd0483 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -16,7 +16,7 @@ # limitations under the License. 
from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -96,7 +96,7 @@ def _init_img_processor(hf_config: PretrainedConfig, class Phi3VImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -115,7 +115,7 @@ class Phi3VImagePixelInputs(TypedDict): class Phi3VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. @@ -569,8 +569,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size expected_dims = (3, h, w) @@ -708,8 +708,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) autoloaded_weights = loader.load_weights(weights, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index c35c7e9fcce74..1104dd705c693 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only PhiMoE model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -581,8 +582,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -597,7 +598,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 87b1d50749a2c..c0ad3e122db3e 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import math +from collections.abc import Iterable, Mapping from dataclasses import dataclass, fields from functools import cached_property -from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union +from typing import Optional, Union import torch import torch.nn as nn @@ -295,10 +296,10 @@ def forward( def _parse_and_validate_image_input( self, - images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor], + images: Optional[Union[list[list[torch.Tensor]], list[torch.Tensor], torch.Tensor]] = None, image_tokens: Optional[torch.Tensor] = None, - ) -> Tuple[Optional[List[torch.Tensor]], Optional[torch.Tensor]]: + ) -> tuple[Optional[list[torch.Tensor]], Optional[torch.Tensor]]: if images is None: return None, None @@ -331,7 +332,7 @@ def _parse_and_validate_image_input( return images, image_tokens def _process_image_input(self, - image_input: List[torch.Tensor]) -> torch.Tensor: + image_input: list[torch.Tensor]) -> torch.Tensor: return self.vision_language_adapter(self.vision_encoder(image_input)) def compute_logits( @@ -349,12 +350,12 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - def is_vision_encoder_weights(weight: Tuple[str, torch.Tensor]): + def is_vision_encoder_weights(weight: tuple[str, torch.Tensor]): return weight[0].startswith("vision_encoder") - def is_vision_lang_adapter_weights(weight: Tuple[str, torch.Tensor]): + def is_vision_lang_adapter_weights(weight: tuple[str, torch.Tensor]): return weight[0].startswith("vision_language_adapter") # Get references to parameters for direct loading @@ -453,7 +454,7 @@ def apply_rotary_emb_vit( xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) assert freqs_cis.dtype == torch.complex64 @@ -558,7 +559,7 @@ def forward( return x -def position_meshgrid(patch_embeds_list: List[torch.Tensor], ) -> torch.Tensor: +def position_meshgrid(patch_embeds_list: list[torch.Tensor], ) -> torch.Tensor: positions = torch.cat([ torch.stack( torch.meshgrid( @@ -620,7 +621,7 @@ def 
freqs_cis(self) -> torch.Tensor: def forward( self, - images: List[torch.Tensor], + images: list[torch.Tensor], ) -> torch.Tensor: """ Args: @@ -856,7 +857,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, position_embeddings: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: batch, patches, _ = hidden_states.size() qkv_states, _ = self.qkv_proj(hidden_states) @@ -1017,7 +1018,7 @@ def __init__( def forward( self, - pixel_values: List[torch.Tensor], + pixel_values: list[torch.Tensor], feature_sample_layers: Optional[list[int]] = None, ) -> torch.Tensor: """ @@ -1077,8 +1078,8 @@ def forward( # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1088,7 +1089,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.transformer.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 3d95e949e71da..a60fe05333cb6 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -15,7 +15,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only IBM/NASA Prithvi Geospatial model.""" -from typing import Iterable, Mapping, Optional, Set, Tuple, Union +from collections.abc import Iterable, Mapping +from typing import Optional, Union import torch import torch.nn as nn @@ -158,7 +159,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): "by PrithviGeospatialMAE.") def _parse_and_validate_multimodal_data( - self, **kwargs) -> Tuple[torch.Tensor, torch.Tensor | None]: + self, **kwargs) -> tuple[torch.Tensor, torch.Tensor | None]: pixel_values = kwargs.pop("pixel_values", None) if not isinstance(pixel_values, torch.Tensor): @@ -199,8 +200,8 @@ def pooler( ) -> Optional[PoolerOutput]: return PoolerOutput([PoolingSequenceGroupOutput(hidden_states)]) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_list = [] model_buffers = dict(self.named_buffers()) loaded_buffers = [] diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 96abfb9d1096c..b5f5079b6d416 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -6,7 +6,8 @@ # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE """Inference-only QWen model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -77,7 +78,7 @@ def __init__( num_heads: int, max_position_embeddings: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -167,7 +168,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -294,15 +295,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w2", 0), ("gate_up_proj", "w1", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index fe615c41aeaa1..491053ae296a5 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -23,7 +23,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -108,7 +109,7 @@ def __init__(self, rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[Tuple] = None, + rope_scaling: Optional[tuple] = None, prefix: str = "", attn_type: str = AttentionType.DECODER) -> None: super().__init__() @@ -232,7 +233,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -349,8 +350,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -360,7 +361,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -481,8 +482,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -550,7 +551,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 858cf28d2b873..4b1ff026c1d37 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -24,9 +24,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" +from collections.abc import Iterable, Mapping from functools import cached_property, partial -from typing import (Callable, Iterable, List, Literal, Mapping, Optional, Set, - Tuple, TypedDict, Union) +from typing import Callable, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -90,7 +90,7 @@ class Qwen2_5_VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] image_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all images' features. + - list[`torch.Tensor`]: A list of tensors holding all images' features. Each tensor holds an image's features. - `torch.Tensor`: A tensor holding all images' features (concatenation of all images' feature tensors). 
@@ -136,7 +136,7 @@ class Qwen2_5_VLVideoEmbeddingInputs(TypedDict): type: Literal["video_embeds"] video_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all videos' features. + - list[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features (concatenation of all videos' feature tensors). @@ -647,8 +647,8 @@ def forward( hidden_states = hidden_states[reverse_indices, :] return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -656,7 +656,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -688,7 +688,7 @@ def get_hf_processor( min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, List[float]]] = None, + fps: Optional[Union[float, list[float]]] = None, **kwargs: object, ) -> Qwen2_5_VLProcessor: if fps is not None: @@ -1064,8 +1064,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index f0dc8573ee14e..a8aafe3a39883 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -21,9 +21,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" +from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Any, Iterable, Mapping, Optional, Set, Tuple, TypedDict, - Union) +from typing import Any, Optional, TypedDict, Union import torch import torch.nn as nn @@ -416,7 +416,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 41536b34b2f2d..8011ebb1df87a 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -23,7 +23,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2MoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch import torch.nn.functional as F @@ -168,7 +169,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -429,8 +430,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -449,7 +450,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 21cc9e8ed1c6b..4ab6c75b639a0 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -5,7 +5,8 @@ # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. """Inference-only Qwen2-RM model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -94,8 +95,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 849ef7293bb7f..a0dfbe68292b9 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -23,9 +23,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" +from collections.abc import Iterable, Mapping from functools import cached_property, partial -from typing import (Any, Callable, Iterable, Literal, Mapping, Optional, Set, - Tuple, Type, TypedDict, Union) +from typing import Any, Callable, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -100,7 +100,7 @@ class Qwen2VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] image_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all images' features. + - list[`torch.Tensor`]: A list of tensors holding all images' features. Each tensor holds an image's features. - `torch.Tensor`: A tensor holding all images' features (concatenation of all images' feature tensors). 
@@ -140,7 +140,7 @@ class Qwen2VLVideoEmbeddingInputs(TypedDict): type: Literal["video_embeds"] video_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all videos' features. + - list[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features (concatenation of all videos' feature tensors). @@ -169,7 +169,7 @@ def __init__( self, in_features: int, hidden_features: int, - act_layer: Type[nn.Module] = QuickGELU, + act_layer: type[nn.Module] = QuickGELU, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): @@ -383,7 +383,7 @@ def __init__( dim: int, num_heads: int, mlp_ratio: float, - act_layer: Type[nn.Module] = QuickGELU, + act_layer: type[nn.Module] = QuickGELU, norm_layer: Optional[Callable[[int], nn.Module]] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -629,8 +629,8 @@ def forward( return x - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -638,7 +638,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -1371,8 +1371,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index e0d8bf2fa3d25..f662db2225c84 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -9,9 +9,9 @@ import math import re import unicodedata +from collections.abc import Collection, Mapping, Set from functools import lru_cache, partial -from typing import (AbstractSet, Callable, Collection, List, Literal, Mapping, - Optional, TypedDict, Union) +from typing import Callable, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -393,7 +393,7 @@ class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore def tokenize( self, text: str, - allowed_special: Union[AbstractSet[str], str] = "all", + allowed_special: Union[Set[str], str] = "all", disallowed_special: Union[Collection[str], str] = (), **kwargs, ) -> list[Union[bytes, str]]: @@ -409,7 +409,7 @@ def tokenize( def _decode( self, - token_ids: Union[int, List[int]], + token_ids: Union[int, list[int]], skip_special_tokens: bool = False, errors: Optional[str] = None, **kwargs, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 75e31d557dd10..bd0e1ced63632 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -10,10 +10,10 @@ import sys import tempfile from abc import ABC, abstractmethod +from collections.abc import Set from dataclasses import dataclass, field from functools import lru_cache -from typing import (AbstractSet, 
Callable, Dict, List, Optional, Tuple, Type, - TypeVar, Union) +from typing import Callable, Optional, TypeVar, Union import cloudpickle import torch.nn as nn @@ -230,7 +230,7 @@ class _ModelInfo: supports_transcription: bool @staticmethod - def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": + def from_model_cls(model: type[nn.Module]) -> "_ModelInfo": return _ModelInfo( architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), @@ -251,7 +251,7 @@ def inspect_model_cls(self) -> _ModelInfo: raise NotImplementedError @abstractmethod - def load_model_cls(self) -> Type[nn.Module]: + def load_model_cls(self) -> type[nn.Module]: raise NotImplementedError @@ -262,10 +262,10 @@ class _RegisteredModel(_BaseRegisteredModel): """ interfaces: _ModelInfo - model_cls: Type[nn.Module] + model_cls: type[nn.Module] @staticmethod - def from_model_cls(model_cls: Type[nn.Module]): + def from_model_cls(model_cls: type[nn.Module]): return _RegisteredModel( interfaces=_ModelInfo.from_model_cls(model_cls), model_cls=model_cls, @@ -274,7 +274,7 @@ def from_model_cls(model_cls: Type[nn.Module]): def inspect_model_cls(self) -> _ModelInfo: return self.interfaces - def load_model_cls(self) -> Type[nn.Module]: + def load_model_cls(self) -> type[nn.Module]: return self.model_cls @@ -291,7 +291,7 @@ def inspect_model_cls(self) -> _ModelInfo: return _run_in_subprocess( lambda: _ModelInfo.from_model_cls(self.load_model_cls())) - def load_model_cls(self) -> Type[nn.Module]: + def load_model_cls(self) -> type[nn.Module]: mod = importlib.import_module(self.module_name) return getattr(mod, self.class_name) @@ -300,7 +300,7 @@ def load_model_cls(self) -> Type[nn.Module]: def _try_load_model_cls( model_arch: str, model: _BaseRegisteredModel, -) -> Optional[Type[nn.Module]]: +) -> Optional[type[nn.Module]]: from vllm.platforms import current_platform current_platform.verify_model_arch(model_arch) try: @@ -327,15 +327,15 @@ def _try_inspect_model_cls( @dataclass class _ModelRegistry: # Keyed by model_arch - models: Dict[str, _BaseRegisteredModel] = field(default_factory=dict) + models: dict[str, _BaseRegisteredModel] = field(default_factory=dict) - def get_supported_archs(self) -> AbstractSet[str]: + def get_supported_archs(self) -> Set[str]: return self.models.keys() def register_model( self, model_arch: str, - model_cls: Union[Type[nn.Module], str], + model_cls: Union[type[nn.Module], str], ) -> None: """ Register an external model to be used in vLLM. 
@@ -374,7 +374,7 @@ def register_model( self.models[model_arch] = model - def _raise_for_unsupported(self, architectures: List[str]): + def _raise_for_unsupported(self, architectures: list[str]): all_supported_archs = self.get_supported_archs() if any(arch in all_supported_archs for arch in architectures): @@ -387,7 +387,7 @@ def _raise_for_unsupported(self, architectures: List[str]): f"Supported architectures: {all_supported_archs}") def _try_load_model_cls(self, - model_arch: str) -> Optional[Type[nn.Module]]: + model_arch: str) -> Optional[type[nn.Module]]: if model_arch not in self.models: return None @@ -401,8 +401,8 @@ def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]: def _normalize_archs( self, - architectures: Union[str, List[str]], - ) -> List[str]: + architectures: Union[str, list[str]], + ) -> list[str]: if isinstance(architectures, str): architectures = [architectures] if not architectures: @@ -417,8 +417,8 @@ def _normalize_archs( def inspect_model_cls( self, - architectures: Union[str, List[str]], - ) -> Tuple[_ModelInfo, str]: + architectures: Union[str, list[str]], + ) -> tuple[_ModelInfo, str]: architectures = self._normalize_archs(architectures) for arch in architectures: @@ -430,8 +430,8 @@ def inspect_model_cls( def resolve_model_cls( self, - architectures: Union[str, List[str]], - ) -> Tuple[Type[nn.Module], str]: + architectures: Union[str, list[str]], + ) -> tuple[type[nn.Module], str]: architectures = self._normalize_archs(architectures) for arch in architectures: @@ -443,63 +443,63 @@ def resolve_model_cls( def is_text_generation_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_text_generation_model def is_pooling_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_pooling_model def is_cross_encoder_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_cross_encoding def is_multimodal_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_multimodal def is_pp_supported_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_pp def model_has_inner_state( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.has_inner_state def is_attention_free_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_attention_free def is_hybrid_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_hybrid def is_transcription_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_transcription diff --git a/vllm/model_executor/models/roberta.py 
b/vllm/model_executor/models/roberta.py index f86fa268072db..97e1bb3eb913f 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import itertools -from typing import Iterable, Optional, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -23,8 +24,8 @@ def roberta_task_weights_filter( - all_weights: Iterable[Tuple[str, torch.Tensor]] -) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[str, + all_weights: Iterable[tuple[str, torch.Tensor]] +) -> tuple[Iterable[tuple[str, torch.Tensor]], Iterable[tuple[str, torch.Tensor]]]: """ Separate task-specific weights that are applied on top @@ -178,7 +179,7 @@ def _build_model(self, prefix=prefix, embedding_class=RobertaEmbedding) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) # Separate weights in "roberta"-prefixed and all else (not in memory). # For use with models like FacebookAI/roberta-base. @@ -217,7 +218,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.classifier = RobertaClassificationHead(config) self._pooler = CrossEncodingPooler(config, self.classifier) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): bert_weights, task_weights = roberta_task_weights_filter(weights) self.roberta.load_weights(bert_weights) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 2892f696107be..eecae4175561f 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -3,7 +3,8 @@ within a vision language model.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from PIL import Image @@ -333,7 +334,7 @@ def __init__( def forward( self, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, None]: + ) -> tuple[torch.Tensor, None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -548,8 +549,8 @@ def forward( feature_sample_layers=feature_sample_layers, ) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -557,7 +558,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 0f9e517aeb557..94f44ff21291e 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -23,7 +23,8 @@ # limitations under the License. 
"""Inference-only Solar model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -102,7 +103,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -237,7 +238,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -449,8 +450,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -460,7 +461,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index a15faec547b95..3afcccc18af07 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -20,7 +20,8 @@ # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json """Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -181,7 +182,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -306,8 +307,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -317,7 +318,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 90098af9dde0e..2665367286841 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -19,7 +19,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" PyTorch Starcoder2 model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -319,8 +320,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -329,7 +330,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index a38035e37ec73..2175488116dc6 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -19,7 +19,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Iterable, Set, Tuple +from collections.abc import Iterable import torch @@ -48,14 +48,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): layer.mlp.gate_up_proj.bias = None layer.mlp.gate_up_proj.skip_bias_add = True - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ('gate_up_proj', 'gate_proj', 0), ('gate_up_proj', 'up_proj', 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() total_num_heads = self.config.n_head head_dim = self.config.hidden_size // total_num_heads for name, loaded_weight in weights: @@ -123,8 +123,8 @@ class TeleChat2ForCausalLM(LlamaForCausalLM): def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): return TeleChat2Model(vllm_config=vllm_config, prefix=prefix) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 1c3c443b29413..27d4ad15a829b 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -15,7 +15,8 @@ # limitations under the License. 
"""Wrapper around `transformers` models""" import re -from typing import Iterable, Literal, Optional, Union +from collections.abc import Iterable +from typing import Literal, Optional, Union import torch from torch import nn diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index b8d4aef252e5f..b1905348e78ef 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,9 +3,9 @@ # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" import math +from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Any, Iterable, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import Any, Literal, Optional, TypedDict, Union import torch import torch.utils.checkpoint @@ -551,8 +551,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["audio_tower."]) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index fff4be34ddbeb..97433f1dde050 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import itertools +from collections.abc import Iterable, Mapping from dataclasses import dataclass, field -from typing import (Callable, Dict, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, Union, overload) +from typing import Callable, Literal, Optional, Protocol, Union, overload import torch import torch.nn as nn @@ -56,8 +56,8 @@ def _map_name(self, key: str) -> Optional[str]: return key def apply( - self, weights: Iterable[Tuple[str, torch.Tensor]] - ) -> Iterable[Tuple[str, torch.Tensor]]: + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[tuple[str, torch.Tensor]]: return ((out_name, data) for name, data in weights if (out_name := self._map_name(name)) is not None) @@ -82,8 +82,8 @@ def __init__( self, module: nn.Module, *, - skip_prefixes: Optional[List[str]] = None, - ignore_unexpected_prefixes: Optional[List[str]] = None, + skip_prefixes: Optional[list[str]] = None, + ignore_unexpected_prefixes: Optional[list[str]] = None, ) -> None: super().__init__() @@ -93,8 +93,8 @@ def __init__( def _groupby_prefix( self, - weights: Iterable[Tuple[str, torch.Tensor]], - ) -> Iterable[Tuple[str, Iterable[Tuple[str, torch.Tensor]]]]: + weights: Iterable[tuple[str, torch.Tensor]], + ) -> Iterable[tuple[str, Iterable[tuple[str, torch.Tensor]]]]: weights_by_parts = ((weight_name.split(".", 1), weight_data) for weight_name, weight_data in weights) @@ -127,7 +127,7 @@ def _load_param( self, base_prefix: str, param: nn.Parameter, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], ) -> Iterable[str]: for weight_name, weight_data in weights: weight_qualname = self._get_qualname(base_prefix, weight_name) @@ -160,7 +160,7 @@ def _load_module( self, base_prefix: str, module: nn.Module, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], ) -> Iterable[str]: if isinstance(module, PPMissingLayer): return @@ -225,10 +225,10 @@ def _load_module( def 
load_weights( self, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], *, mapper: Optional[WeightsMapper] = None, - ) -> Set[str]: + ) -> set[str]: if mapper is not None: weights = mapper.apply(weights) @@ -266,13 +266,13 @@ def flatten_bn(x: torch.Tensor) -> torch.Tensor: @overload -def flatten_bn(x: List[torch.Tensor]) -> List[torch.Tensor]: +def flatten_bn(x: list[torch.Tensor]) -> list[torch.Tensor]: ... @overload def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor], + x: Union[list[torch.Tensor], torch.Tensor], *, concat: Literal[True], ) -> torch.Tensor: @@ -281,18 +281,18 @@ def flatten_bn( @overload def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor], + x: Union[list[torch.Tensor], torch.Tensor], *, concat: bool = False, -) -> Union[List[torch.Tensor], torch.Tensor]: +) -> Union[list[torch.Tensor], torch.Tensor]: ... def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor], + x: Union[list[torch.Tensor], torch.Tensor], *, concat: bool = False, -) -> Union[List[torch.Tensor], torch.Tensor]: +) -> Union[list[torch.Tensor], torch.Tensor]: """ Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs. @@ -416,7 +416,7 @@ def merge_multimodal_embeddings( input_ids: torch.Tensor, inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors, - placeholder_token_id: Union[int, List[int]], + placeholder_token_id: Union[int, list[int]], ) -> torch.Tensor: """ Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the @@ -544,7 +544,7 @@ def make_layers( num_hidden_layers: int, layer_fn: LayerFn, prefix: str, -) -> Tuple[int, int, torch.nn.ModuleList]: +) -> tuple[int, int, torch.nn.ModuleList]: """Make a list of layers with the given layer function, taking pipeline parallelism into account. 
""" @@ -562,10 +562,10 @@ def make_layers( # NOTE: don't use lru_cache here because it can prevent garbage collection -_model_to_pp_missing_layer_names: Dict[int, List[str]] = {} +_model_to_pp_missing_layer_names: dict[int, list[str]] = {} -def get_pp_missing_layer_names(model: torch.nn.Module) -> List[str]: +def get_pp_missing_layer_names(model: torch.nn.Module) -> list[str]: """Get the names of the missing layers in a pipeline parallel model.""" model_id = id(model) if model_id in _model_to_pp_missing_layer_names: @@ -593,7 +593,7 @@ def is_pp_missing_parameter(name: str, model: torch.nn.Module) -> bool: for missing_layer_name in get_pp_missing_layer_names(model)) -def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int): +def make_empty_intermediate_tensors_factory(keys: list[str], hidden_size: int): def make_empty_intermediate_tensors( batch_size: int, @@ -632,7 +632,7 @@ def extract_layer_index(layer_name: str) -> int: - "model.encoder.layers.0.sub.1" -> ValueError """ subnames = layer_name.split(".") - int_vals: List[int] = [] + int_vals: list[int] = [] for subname in subnames: try: int_vals.append(int(subname)) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index e5f77e08c4035..e855db13c63ae 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import math -from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, - Union) +from collections.abc import Iterable, Mapping +from typing import Optional, TypedDict, Union import torch from torch import nn @@ -389,7 +389,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_positions.weight.copy_( sinusoids(*self.embed_positions.weight.shape)) - def forward(self, input_features: Union[torch.Tensor, List[torch.Tensor]]): + def forward(self, input_features: Union[torch.Tensor, list[torch.Tensor]]): hidden_states = [] for features in input_features: embeds = nn.functional.gelu(self.conv1(features)) @@ -467,7 +467,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def forward( self, - input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], + input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]], input_ids: Optional[torch.Tensor], positions: torch.Tensor, ) -> torch.Tensor: @@ -481,14 +481,14 @@ def forward( def get_encoder_outputs( self, - input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], + input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]], ) -> Optional[torch.Tensor]: if input_features is None: return None return self.encoder(input_features) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), @@ -498,7 +498,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -737,8 +737,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> 
Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) # add fake zeros bias for k_proj to state_dict @@ -747,8 +747,8 @@ def load_weights(self, weights: Iterable[Tuple[str, def _create_fake_bias_for_k_proj( - weights: Iterable[Tuple[str, torch.Tensor]] -) -> Iterable[Tuple[str, torch.Tensor]]: + weights: Iterable[tuple[str, torch.Tensor]] +) -> Iterable[tuple[str, torch.Tensor]]: """ Create full zeros bias for k_proj weight in self-attention layers. So that the bias for k_proj in qkv_proj can be initialized with zeros. diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py index dea8b0e9d471d..4c5db7396c03c 100644 --- a/vllm/model_executor/pooling_metadata.py +++ b/vllm/model_executor/pooling_metadata.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Any, Dict, List, Tuple +from typing import Any import torch @@ -23,9 +23,9 @@ class PoolingMetadata: def __init__( self, - seq_groups: List[Tuple[List[int], PoolingParams]], - seq_data: Dict[int, Any], # Specific data related to sequences - prompt_lens: List[int], + seq_groups: list[tuple[list[int], PoolingParams]], + seq_data: dict[int, Any], # Specific data related to sequences + prompt_lens: list[int], ) -> None: self.seq_groups = seq_groups self.seq_data = seq_data diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 0a580a4e907de..3a18e4d43c550 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -2,7 +2,7 @@ from array import array from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Optional import torch @@ -25,10 +25,10 @@ class SequenceGroupToSample: # |-- query_len ---| # Sequence ids for the sequence group in a previous step. - seq_ids: List[int] + seq_ids: list[int] sampling_params: SamplingParams # seq_id -> sequence data. - seq_data: Dict[int, SequenceData] + seq_data: dict[int, SequenceData] # The length of the sequence (all tokens seen in the past + new token to # compute attention) of the sequence group. None if it is in a decode # stage. @@ -44,9 +44,9 @@ class SequenceGroupToSample: is_prompt: bool # Query token indices from logits. to compute prompt logprob. Empty if # prompt logprob is not required. - prompt_logprob_indices: List[int] + prompt_logprob_indices: list[int] # Sample token indices from logits. Empty if sampling is not required. 
- sample_indices: List[int] + sample_indices: list[int] @property def do_sample(self): @@ -78,7 +78,7 @@ class SamplingMetadataCache: """Used to cache SamplingMetadata objects between scheduler iterations""" def __init__(self): - self._seq_group_to_sample_cache: Dict[int, PyObjectCache] = {} + self._seq_group_to_sample_cache: dict[int, PyObjectCache] = {} def get_cached_seq_group_to_sample(self, num_seqs): if num_seqs not in self._seq_group_to_sample_cache: @@ -130,9 +130,9 @@ def sample(logits): def __init__( self, - seq_groups: List[SequenceGroupToSample], + seq_groups: list[SequenceGroupToSample], selected_token_indices: torch.Tensor, - categorized_sample_indices: Dict[SamplingType, torch.Tensor], + categorized_sample_indices: dict[SamplingType, torch.Tensor], num_prompts: int, skip_sampler_cpu_output: bool = False, reuse_sampling_tensors: bool = False, @@ -146,12 +146,12 @@ def __init__( @staticmethod def prepare( - seq_group_metadata_list: List[SequenceGroupMetadata], - seq_lens: List[int], - query_lens: List[int], + seq_group_metadata_list: list[SequenceGroupMetadata], + seq_lens: list[int], + query_lens: list[int], device: str, pin_memory: bool, - generators: Optional[Dict[str, torch.Generator]] = None, + generators: Optional[dict[str, torch.Generator]] = None, cache: Optional[SamplingMetadataCache] = None, ) -> "SamplingMetadata": ( @@ -195,16 +195,16 @@ def __repr__(self) -> str: def _prepare_seq_groups( - seq_group_metadata_list: List[SequenceGroupMetadata], - seq_lens: List[int], - query_lens: List[int], + seq_group_metadata_list: list[SequenceGroupMetadata], + seq_lens: list[int], + query_lens: list[int], device: str, - generators: Optional[Dict[str, torch.Generator]] = None, + generators: Optional[dict[str, torch.Generator]] = None, cache: Optional[SamplingMetadataCache] = None, -) -> Tuple[ - List[SequenceGroupToSample], - List[int], - Dict[SamplingType, List[int]], +) -> tuple[ + list[SequenceGroupToSample], + list[int], + dict[SamplingType, list[int]], int, ]: """Prepare sequence groups and indices for sampling. @@ -227,17 +227,17 @@ def _prepare_seq_groups( num_prompts: Total number of prompts from `seq_group_metadata_list`. """ # Batched sequence groups for the current model forward stsep. - seq_groups: List[SequenceGroupToSample] = [] + seq_groups: list[SequenceGroupToSample] = [] # A list of token indices to sample/compute logprob. It is used to # prune the outcome logits from the model for the performance. - selected_token_indices: List[int] = [] + selected_token_indices: list[int] = [] # Used for selected_token_indices. model_output_idx = 0 # Sampling type -> ( # indices to sample/prompt logprob within pruned output logits, # indices to sample within pruned logits) - categorized_sample_indices: Dict[SamplingType, List[int]] = { + categorized_sample_indices: dict[SamplingType, list[int]] = { t: [] for t in SamplingType } @@ -265,9 +265,9 @@ def _prepare_seq_groups( # If the current seq group is in decode stage, it is None. 
seq_len: Optional[int] = None query_len: Optional[int] = None - prompt_logprob_indices: List[int] = (sample_obj.prompt_logprob_indices + prompt_logprob_indices: list[int] = (sample_obj.prompt_logprob_indices if cache is not None else []) - sample_indices: List[int] = (sample_obj.sample_indices + sample_indices: list[int] = (sample_obj.sample_indices if cache is not None else []) do_sample = seq_group_metadata.do_sample @@ -389,16 +389,16 @@ def from_sampling_metadata( vocab_size: int, device: torch.device, dtype: torch.dtype, - ) -> Tuple["SamplingTensors", bool, bool, bool]: - prompt_tokens: List[array] = [] - output_tokens: List[array] = [] - top_ks: List[int] = [] - temperatures: List[float] = [] - top_ps: List[float] = [] - min_ps: List[float] = [] - presence_penalties: List[float] = [] - frequency_penalties: List[float] = [] - repetition_penalties: List[float] = [] + ) -> tuple["SamplingTensors", bool, bool, bool]: + prompt_tokens: list[array] = [] + output_tokens: list[array] = [] + top_ks: list[int] = [] + temperatures: list[float] = [] + top_ps: list[float] = [] + min_ps: list[float] = [] + presence_penalties: list[float] = [] + frequency_penalties: list[float] = [] + repetition_penalties: list[float] = [] do_penalties = False do_top_p_top_k = False do_min_p = False @@ -496,15 +496,15 @@ def from_sampling_metadata( @classmethod def from_lists( cls, - temperatures: List[float], - top_ps: List[float], - top_ks: List[int], - min_ps: List[float], - presence_penalties: List[float], - frequency_penalties: List[float], - repetition_penalties: List[float], - prompt_tokens: List[array], - output_tokens: List[array], + temperatures: list[float], + top_ps: list[float], + top_ks: list[int], + min_ps: list[float], + presence_penalties: list[float], + frequency_penalties: list[float], + repetition_penalties: list[float], + prompt_tokens: list[array], + output_tokens: list[array], vocab_size: int, device: torch.device, dtype: torch.dtype, diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 04f922dfd77aa..f9d89e64bd9db 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Utils for model executor.""" -from typing import Any, Dict, Optional +from typing import Any, Optional import torch @@ -12,7 +12,7 @@ def set_random_seed(seed: int) -> None: def set_weight_attrs( weight: torch.Tensor, - weight_attrs: Optional[Dict[str, Any]], + weight_attrs: Optional[dict[str, Any]], ): """Set attributes on a weight tensor. 
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index c48d07ba365ba..e0b160a65047a 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -2,9 +2,10 @@ from abc import ABC, abstractmethod from collections import defaultdict +from collections.abc import Sequence from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, Generic, NamedTuple, - Optional, Sequence, Tuple, Type, TypeVar, Union) + Optional, TypeVar, Union) from torch import nn @@ -39,7 +40,7 @@ """ _T = TypeVar("_T") -N = TypeVar("N", bound=Type[nn.Module]) +N = TypeVar("N", bound=type[nn.Module]) class MultiModalPlugin(ABC): @@ -274,7 +275,7 @@ def __init__(self): @classmethod def from_seq_group( cls, seq_group: "SequenceGroupMetadata", positions: range - ) -> Tuple[Optional[MultiModalDataDict], dict[str, + ) -> tuple[Optional[MultiModalDataDict], dict[str, "MultiModalPlaceholderMap"]]: """ Returns the multi-modal items that intersect with the portion of a diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 7d277fd67deca..11665ef667538 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import pickle -from typing import TYPE_CHECKING, Iterable, Mapping, Optional +from collections.abc import Iterable, Mapping +from typing import TYPE_CHECKING, Optional import numpy as np import torch diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 98ece8f806f1d..f76982ef8d729 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -3,7 +3,7 @@ import base64 from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Optional import torch from PIL import Image @@ -31,7 +31,7 @@ def get_data_key(self) -> str: def _get_hf_image_processor( self, model_config: "ModelConfig", - mm_processor_kwargs: Optional[Dict[str, Any]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ): if mm_processor_kwargs is None: mm_processor_kwargs = {} diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 1882ffe9bf69f..d79290ef98322 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -2,9 +2,9 @@ import functools from collections import UserDict +from collections.abc import Mapping, Sequence from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Dict, Generic, Mapping, Optional, - Protocol, Sequence, Type, TypeVar) +from typing import TYPE_CHECKING, Any, Generic, Optional, Protocol, TypeVar import torch.nn as nn @@ -29,7 +29,7 @@ logger = init_logger(__name__) -N = TypeVar("N", bound=Type[nn.Module]) +N = TypeVar("N", bound=type[nn.Module]) _I = TypeVar("_I", bound=BaseProcessingInfo) _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True) @@ -83,13 +83,13 @@ def build_processor( return self.processor(info, dummy_inputs_builder, cache=cache) -class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): +class _MultiModalLimits(UserDict["ModelConfig", dict[str, int]]): """ Wraps `_limits_by_model` for a more informative error message when attempting to access a model that does not exist. 
""" - def __getitem__(self, key: "ModelConfig") -> Dict[str, int]: + def __getitem__(self, key: "ModelConfig") -> dict[str, int]: try: return super().__getitem__(key) except KeyError as exc: @@ -170,7 +170,7 @@ def map_input( self, model_config: "ModelConfig", data: MultiModalDataDict, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ) -> MultiModalKwargs: """ Apply an input mapper to the data passed to the model. @@ -184,7 +184,7 @@ def map_input( Note: This should be called after :meth:`init_mm_limits_per_prompt`. """ - merged_dict: Dict[str, NestedTensors] = {} + merged_dict: dict[str, NestedTensors] = {} for data_key, data_value in data.items(): plugin = self._get_plugin(data_key) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 8004377191b38..0b3d3f8c79d72 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -4,7 +4,7 @@ from functools import partial from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Optional import numpy as np import numpy.typing as npt @@ -39,7 +39,7 @@ def get_data_key(self) -> str: def _get_hf_video_processor( self, model_config: "ModelConfig", - mm_processor_kwargs: Optional[Dict[str, Any]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, ): if mm_processor_kwargs is None: mm_processor_kwargs = {} diff --git a/vllm/outputs.py b/vllm/outputs.py index 030119710a187..8c355c89e3e9b 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import time +from collections.abc import MutableSequence +from collections.abc import Sequence as GenericSequence from dataclasses import dataclass -from typing import Dict, Generic, List, MutableSequence, Optional -from typing import Sequence as GenericSequence -from typing import Union +from typing import Generic, Optional, Union import torch from typing_extensions import TypeVar, deprecated @@ -109,14 +109,14 @@ def __init__( self, request_id: str, prompt: Optional[str], - prompt_token_ids: Optional[List[int]], + prompt_token_ids: Optional[list[int]], prompt_logprobs: Optional[PromptLogprobs], - outputs: List[CompletionOutput], + outputs: list[CompletionOutput], finished: bool, metrics: Optional[RequestMetrics] = None, lora_request: Optional[LoRARequest] = None, encoder_prompt: Optional[str] = None, - encoder_prompt_token_ids: Optional[List[int]] = None, + encoder_prompt_token_ids: Optional[list[int]] = None, num_cached_tokens: Optional[int] = None, *, multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None, @@ -139,9 +139,9 @@ def new( cls, request_id: str, prompt: Optional[str], - prompt_token_ids: Optional[List[int]], + prompt_token_ids: Optional[list[int]], text: str, - token_ids: List[int], + token_ids: list[int], logprobs: Optional[SampleLogprobs], prompt_logprobs: Optional[PromptLogprobs], cumulative_logprob: Optional[float], @@ -189,7 +189,7 @@ def add(self, next_output: "RequestOutput") -> None: @classmethod def from_seq_group( cls, seq_group: SequenceGroup, use_cache: bool, - seq_id_to_seq_group: Dict[str, SequenceGroupBase] + seq_id_to_seq_group: dict[str, SequenceGroupBase] ) -> Optional["RequestOutput"]: finished = seq_group.is_finished() @@ -363,12 +363,12 @@ class PoolingRequestOutput(Generic[_O]): Args: request_id (str): A unique identifier for the pooling request. outputs (PoolingOutput): The pooling results for the given input. 
- prompt_token_ids (List[int]): A list of token IDs used in the prompt. + prompt_token_ids (list[int]): A list of token IDs used in the prompt. finished (bool): A flag indicating whether the pooling is completed. """ def __init__(self, request_id: str, outputs: _O, - prompt_token_ids: List[int], finished: bool): + prompt_token_ids: list[int], finished: bool): self.request_id = request_id self.prompt_token_ids = prompt_token_ids self.finished = finished @@ -407,7 +407,7 @@ class RequestOutputFactory: @staticmethod def create(seq_group: SequenceGroup, - seq_id_to_seq_group: Dict[str, SequenceGroupBase], + seq_id_to_seq_group: dict[str, SequenceGroupBase], use_cache: bool = False): if seq_group.pooled_data is not None: return PoolingRequestOutput.from_seq_group(seq_group) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index c6f3ccf0a3c49..cb880a23bd673 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -5,8 +5,7 @@ import os from functools import lru_cache, wraps -from typing import (TYPE_CHECKING, Callable, List, Optional, Tuple, TypeVar, - Union) +from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union import torch from typing_extensions import ParamSpec @@ -100,7 +99,7 @@ def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: return True @classmethod - def is_full_nvlink(cls, device_ids: List[int]) -> bool: + def is_full_nvlink(cls, device_ids: list[int]) -> bool: raise NotImplementedError @classmethod @@ -287,7 +286,7 @@ def get_device_capability(cls, @with_nvml_context def has_device_capability( cls, - capability: Union[Tuple[int, int], int], + capability: Union[tuple[int, int], int], device_id: int = 0, ) -> bool: try: @@ -320,7 +319,7 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: @classmethod @with_nvml_context - def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: + def is_full_nvlink(cls, physical_device_ids: list[int]) -> bool: """ query if the set of gpus are fully connected by nvlink (1 hop) """ @@ -385,7 +384,7 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: return device_props.total_memory @classmethod - def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: + def is_full_nvlink(cls, physical_device_ids: list[int]) -> bool: logger.exception( "NVLink detection not possible, as context support was" " not found. Assuming no NVLink available.") diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 0e4988a4fa74d..fb46e00c0bb0d 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -4,7 +4,7 @@ import platform import random from platform import uname -from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union +from typing import TYPE_CHECKING, NamedTuple, Optional, Union import numpy as np import torch @@ -162,7 +162,7 @@ def get_device_capability( @classmethod def has_device_capability( cls, - capability: Union[Tuple[int, int], int], + capability: Union[tuple[int, int], int], device_id: int = 0, ) -> bool: """ diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index a4f18cbfc5871..09aca00fd9e54 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -2,7 +2,7 @@ import os from functools import lru_cache, wraps -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import TYPE_CHECKING, Optional import torch @@ -36,7 +36,7 @@ logger.warning("Failed to import from vllm._rocm_C with %r", e) # Models not supported by ROCm. 
-_ROCM_UNSUPPORTED_MODELS: List[str] = [] +_ROCM_UNSUPPORTED_MODELS: list[str] = [] # Models partially supported by ROCm. # Architecture -> Reason. @@ -44,7 +44,7 @@ "Triton flash attention. For half-precision SWA support, " "please use CK flash attention by setting " "`VLLM_USE_TRITON_FLASH_ATTN=0`") -_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { +_ROCM_PARTIALLY_SUPPORTED_MODELS: dict[str, str] = { "Qwen2ForCausalLM": _ROCM_SWA_REASON, "MistralForCausalLM": diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 389cb87281031..d72ab2bd088c7 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -2,7 +2,7 @@ import logging import os -from typing import Callable, Dict +from typing import Callable import torch @@ -14,7 +14,7 @@ plugins_loaded = False -def load_plugins_by_group(group: str) -> Dict[str, Callable]: +def load_plugins_by_group(group: str) -> dict[str, Callable]: import sys if sys.version_info < (3, 10): from importlib_metadata import entry_points diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 6351ef63da2be..6934d328a87ef 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -3,7 +3,7 @@ import copy from collections import defaultdict from dataclasses import asdict, dataclass, field -from typing import Any, Callable, Dict, List, Optional, Tuple, TypeAlias, Union +from typing import Any, Callable, Optional, TypeAlias, Union import pandas as pd from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult @@ -20,7 +20,7 @@ class _ModuleTreeNode: event: _ProfilerEvent parent: Optional['_ModuleTreeNode'] = None - children: List['_ModuleTreeNode'] = field(default_factory=list) + children: list['_ModuleTreeNode'] = field(default_factory=list) trace: str = "" @property @@ -60,19 +60,19 @@ class ModelStatsEntry: @dataclass class _StatsTreeNode: entry: StatsEntry - children: List[StatsEntry] + children: list[StatsEntry] parent: Optional[StatsEntry] @dataclass class LayerwiseProfileResults(profile): _kineto_results: _ProfilerResult - _kineto_event_correlation_map: Dict[int, - List[_KinetoEvent]] = field(init=False) - _event_correlation_map: Dict[int, List[FunctionEvent]] = field(init=False) - _module_tree: List[_ModuleTreeNode] = field(init=False) - _model_stats_tree: List[_StatsTreeNode] = field(init=False) - _summary_stats_tree: List[_StatsTreeNode] = field(init=False) + _kineto_event_correlation_map: dict[int, + list[_KinetoEvent]] = field(init=False) + _event_correlation_map: dict[int, list[FunctionEvent]] = field(init=False) + _module_tree: list[_ModuleTreeNode] = field(init=False) + _model_stats_tree: list[_StatsTreeNode] = field(init=False) + _summary_stats_tree: list[_StatsTreeNode] = field(init=False) # profile metadata num_running_seqs: Optional[int] = None @@ -82,7 +82,7 @@ def __post_init__(self): self._build_module_tree() self._build_stats_trees() - def print_model_table(self, column_widths: Dict[str, int] = None): + def print_model_table(self, column_widths: dict[str, int] = None): _column_widths = dict(name=60, cpu_time_us=12, cuda_time_us=12, @@ -100,7 +100,7 @@ def print_model_table(self, column_widths: Dict[str, int] = None): filtered_model_table, indent_style=lambda indent: "|" + "-" * indent + " ")) - def print_summary_table(self, column_widths: Dict[str, int] = None): + def print_summary_table(self, column_widths: dict[str, int] = None): _column_widths = dict(name=80, cuda_time_us=12, pct_cuda_time=12, @@ -142,7 +142,7 @@ def 
convert_stats_to_dict(self) -> dict[str, Any]: } @staticmethod - def _indent_row_names_based_on_depth(depths_rows: List[Tuple[int, + def _indent_row_names_based_on_depth(depths_rows: list[tuple[int, StatsEntry]], indent_style: Union[Callable[[int], str], @@ -229,7 +229,7 @@ def _total_cuda_time(self): [self._cumulative_cuda_time(root) for root in self._module_tree]) def _build_stats_trees(self): - summary_dict: Dict[str, _StatsTreeNode] = {} + summary_dict: dict[str, _StatsTreeNode] = {} total_cuda_time = self._total_cuda_time() def pct_cuda_time(cuda_time_us): @@ -238,7 +238,7 @@ def pct_cuda_time(cuda_time_us): def build_summary_stats_tree_df( node: _ModuleTreeNode, parent: Optional[_StatsTreeNode] = None, - summary_trace: Tuple[str] = ()): + summary_trace: tuple[str] = ()): if event_has_module(node.event): name = event_module_repr(node.event) @@ -313,8 +313,8 @@ def build_model_stats_tree_df(node: _ModuleTreeNode, self._model_stats_tree.append(build_model_stats_tree_df(root)) def _flatten_stats_tree( - self, tree: List[_StatsTreeNode]) -> List[Tuple[int, StatsEntry]]: - entries: List[Tuple[int, StatsEntry]] = [] + self, tree: list[_StatsTreeNode]) -> list[tuple[int, StatsEntry]]: + entries: list[tuple[int, StatsEntry]] = [] def df_traversal(node: _StatsTreeNode, depth=0): entries.append((depth, node.entry)) @@ -327,10 +327,10 @@ def df_traversal(node: _StatsTreeNode, depth=0): return entries def _convert_stats_tree_to_dict(self, - tree: List[_StatsTreeNode]) -> List[Dict]: - root_dicts: List[Dict] = [] + tree: list[_StatsTreeNode]) -> list[dict]: + root_dicts: list[dict] = [] - def df_traversal(node: _StatsTreeNode, curr_json_list: List[Dict]): + def df_traversal(node: _StatsTreeNode, curr_json_list: list[dict]): curr_json_list.append({ "entry": asdict(node.entry), "children": [] diff --git a/vllm/profiler/utils.py b/vllm/profiler/utils.py index 62b39f510703e..b26fd4dd8c071 100644 --- a/vllm/profiler/utils.py +++ b/vllm/profiler/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import Callable, Dict, List, Type, Union +from typing import Callable, Union from torch._C._profiler import _EventType, _ProfilerEvent, _TensorMetadata @@ -30,14 +30,14 @@ def trim_string_back(string, width): class TablePrinter: - def __init__(self, row_cls: Type[dataclasses.dataclass], - column_widths: Dict[str, int]): + def __init__(self, row_cls: type[dataclasses.dataclass], + column_widths: dict[str, int]): self.row_cls = row_cls self.fieldnames = [x.name for x in dataclasses.fields(row_cls)] self.column_widths = column_widths assert set(self.column_widths.keys()) == set(self.fieldnames) - def print_table(self, rows: List[dataclasses.dataclass]): + def print_table(self, rows: list[dataclasses.dataclass]): self._print_header() self._print_line() for row in rows: diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py index 795591606f259..bd1dc51c468b1 100644 --- a/vllm/prompt_adapter/models.py +++ b/vllm/prompt_adapter/models.py @@ -2,7 +2,7 @@ import logging import math -from typing import Any, Callable, Dict, List, Optional, Type +from typing import Any, Callable, Optional import torch from torch import nn @@ -45,7 +45,7 @@ def convert_to_embedding_indices(indices): def convert_mapping( mapping: PromptAdapterMapping, - prompt_adapter_index_to_id: List[Optional[int]], + prompt_adapter_index_to_id: list[Optional[int]], ) -> torch.Tensor: """Converts PromptAdapterMapping to index tensors. 
@@ -127,8 +127,8 @@ def __init__( prompt_adapter_config: the PromptAdapter config, """ self.model: nn.Module = model - # Dict instead of a Set for compatibility with LRUCache. - self.prompt_adapter_index_to_id: List[ + # dict instead of a Set for compatibility with LRUCache. + self.prompt_adapter_index_to_id: list[ Optional[int]] = [None] * self.prompt_adapter_slots self.max_num_seqs = max_num_seqs self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 @@ -139,7 +139,7 @@ def __init__( self.base_indices = torch.tensor([-1]) self.base_embedding_indices = torch.tensor([]) - self.modules: Dict[str, nn.Module] = {} + self.modules: dict[str, nn.Module] = {} self._create_prompt_adapter_modules() self._last_mapping: Optional[PromptAdapterMapping] = None @@ -252,7 +252,7 @@ def remove_adapter(self, adapter_id: int) -> bool: return remove_adapter(adapter_id, self._registered_adapters, self.deactivate_adapter) - def list_adapters(self) -> Dict[int, Any]: + def list_adapters(self) -> dict[int, Any]: return list_adapters(self._registered_adapters) def get_adapter(self, adapter_id: int) -> Optional[Any]: @@ -284,7 +284,7 @@ def __init__( self._active_adapters = PromptAdapterLRUCache( self.prompt_adapter_slots, self._deactivate_adapter) - def list_adapters(self) -> Dict[int, PromptAdapterModel]: + def list_adapters(self) -> dict[int, PromptAdapterModel]: """List all registered PromptAdapterModel.""" return dict(self._registered_adapters.cache) @@ -344,7 +344,7 @@ def create_prompt_adapter_manager( max_num_seqs: int, max_num_batched_tokens: int, prompt_adapter_config: PromptAdapterConfig, - prompt_adapter_manager_cls: Type[ + prompt_adapter_manager_cls: type[ PromptAdapterModelManager] = PromptAdapterModelManager, **kwargs) -> PromptAdapterModelManager: """Create a PromptAdapterModel for a given model.""" diff --git a/vllm/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py index 28dcc16871120..dbf82ab698480 100644 --- a/vllm/prompt_adapter/worker_manager.py +++ b/vllm/prompt_adapter/worker_manager.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Any, Optional, Set, Type +from typing import Any, Optional import torch @@ -28,7 +28,7 @@ class WorkerPromptAdapterManager(AbstractWorkerManager): loaded (unless they are already loaded), and every other prompt_adapter will be unloaded.""" - _manager_cls: Type[PromptAdapterModelManager] = PromptAdapterModelManager + _manager_cls: type[PromptAdapterModelManager] = PromptAdapterModelManager def __init__( self, @@ -36,7 +36,7 @@ def __init__( max_num_batched_tokens: int, device: torch.device, prompt_adapter_config: PromptAdapterConfig, - prompt_adapter_model_cls: Type[PromptAdapterModel] = PromptAdapterModel + prompt_adapter_model_cls: type[PromptAdapterModel] = PromptAdapterModel ): self._adapter_manager: PromptAdapterModelManager self.max_num_seqs = max_num_seqs @@ -90,7 +90,7 @@ def add_dummy_prompt_adapter( def pin_adapter(self, adapter_id: int) -> bool: return self._adapter_manager.pin_adapter(adapter_id) - def set_active_adapters(self, requests: Set[Any], + def set_active_adapters(self, requests: set[Any], mapping: Optional[Any]) -> None: set_active_adapters_worker(requests, mapping, self._apply_adapters, self._adapter_manager.set_adapter_mapping) @@ -101,7 +101,7 @@ def add_adapter(self, adapter_request: Any) -> bool: self._adapter_manager.add_adapter, self._adapter_manager.activate_adapter) - def _apply_adapters(self, adapter_requests: Set[Any]) -> None: + def 
_apply_adapters(self, adapter_requests: set[Any]) -> None: apply_adapters_worker(adapter_requests, self.list_adapters, self._adapter_manager.adapter_slots, self.remove_adapter, self.add_adapter) @@ -112,7 +112,7 @@ def remove_adapter(self, adapter_id: int) -> bool: def remove_all_adapters(self): self._adapter_manager.remove_all_adapters() - def list_adapters(self) -> Set[int]: + def list_adapters(self) -> set[int]: return list_adapters_worker(self._adapter_manager.list_adapters) @@ -125,7 +125,7 @@ class LRUCacheWorkerPromptAdapterManager(WorkerPromptAdapterManager): and least recently used prompt_adapters will be unloaded if the cache is above capacity.""" - _prompt_adapter_manager_cls: Type[ + _prompt_adapter_manager_cls: type[ LRUCachePromptAdapterModelManager] = LRUCachePromptAdapterModelManager def create_prompt_adapter_manager( @@ -143,7 +143,7 @@ def create_prompt_adapter_manager( return prompt_adapter_manager.model def _apply_adapters( - self, prompt_adapter_requests: Set[PromptAdapterRequest]) -> None: + self, prompt_adapter_requests: set[PromptAdapterRequest]) -> None: prompt_adapters_map = { prompt_adapter_request.prompt_adapter_id: prompt_adapter_request for prompt_adapter_request in prompt_adapter_requests diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 2ce87283df756..17e4e43387dd4 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -4,11 +4,10 @@ from dataclasses import dataclass from enum import Enum, IntEnum from functools import cached_property -from typing import Any, Dict, List, Optional, Set, Union +from typing import Annotated, Any, Optional, Union import msgspec from pydantic import BaseModel -from typing_extensions import Annotated from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor @@ -29,9 +28,9 @@ class SamplingType(IntEnum): @dataclass class GuidedDecodingParams: """One of these fields will be used to build a logit processor.""" - json: Optional[Union[str, Dict]] = None + json: Optional[Union[str, dict]] = None regex: Optional[str] = None - choice: Optional[List[str]] = None + choice: Optional[list[str]] = None grammar: Optional[str] = None json_object: Optional[bool] = None """These are other options that can be set""" @@ -40,9 +39,9 @@ class GuidedDecodingParams: @staticmethod def from_optional( - json: Optional[Union[Dict, BaseModel, str]] = None, + json: Optional[Union[dict, BaseModel, str]] = None, regex: Optional[str] = None, - choice: Optional[List[str]] = None, + choice: Optional[list[str]] = None, grammar: Optional[str] = None, json_object: Optional[bool] = None, backend: Optional[str] = None, @@ -72,7 +71,7 @@ def backend_name(self) -> str: """ return (self.backend or "").split(":")[0] - def backend_options(self) -> List[str]: + def backend_options(self) -> list[str]: """Return the backend options as a list of strings.""" if not self.backend or ":" not in self.backend: return [] @@ -144,12 +143,12 @@ class SamplingParams( considered, relative to the probability of the most likely token. Must be in [0, 1]. Set to 0 to disable this. seed: Random seed to use for the generation. - stop: List of strings that stop the generation when they are generated. + stop: list of strings that stop the generation when they are generated. The returned output will not contain the stop strings. - stop_token_ids: List of tokens that stop the generation when they are + stop_token_ids: list of tokens that stop the generation when they are generated. 
The returned output will contain the stop tokens unless the stop tokens are special tokens. - bad_words: List of words that are not allowed to be generated. + bad_words: list of words that are not allowed to be generated. More precisely, only the last token of a corresponding token sequence is not allowed when the next generated token can complete the sequence. @@ -172,7 +171,7 @@ class SamplingParams( skip_special_tokens: Whether to skip special tokens in the output. spaces_between_special_tokens: Whether to add spaces between special tokens in the output. Defaults to True. - logits_processors: List of functions that modify logits based on + logits_processors: list of functions that modify logits based on previously generated tokens, and optionally prompt tokens as a first argument. truncate_prompt_tokens: If set to an integer k, will use only the last k @@ -198,9 +197,9 @@ class SamplingParams( top_k: int = -1 min_p: float = 0.0 seed: Optional[int] = None - stop: Optional[Union[str, List[str]]] = None - stop_token_ids: Optional[List[int]] = None - bad_words: Optional[List[str]] = None + stop: Optional[Union[str, list[str]]] = None + stop_token_ids: Optional[list[int]] = None + bad_words: Optional[list[str]] = None ignore_eos: bool = False max_tokens: Optional[int] = 16 min_tokens: int = 0 @@ -212,8 +211,8 @@ class SamplingParams( detokenize: bool = True skip_special_tokens: bool = True spaces_between_special_tokens: bool = True - # Optional[List[LogitsProcessor]] type. We use Any here because - # Optional[List[LogitsProcessor]] type is not supported by msgspec. + # Optional[list[LogitsProcessor]] type. We use Any here because + # Optional[list[LogitsProcessor]] type is not supported by msgspec. logits_processors: Optional[Any] = None include_stop_str_in_output: bool = False truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None @@ -222,12 +221,12 @@ class SamplingParams( # The below fields are not supposed to be used as an input. # They are set in post_init. 
output_text_buffer_length: int = 0 - _all_stop_token_ids: Set[int] = msgspec.field(default_factory=set) + _all_stop_token_ids: set[int] = msgspec.field(default_factory=set) # Fields used to construct logits processors guided_decoding: Optional[GuidedDecodingParams] = None - logit_bias: Optional[Dict[int, float]] = None - allowed_token_ids: Optional[List[int]] = None + logit_bias: Optional[dict[int, float]] = None + allowed_token_ids: Optional[list[int]] = None @staticmethod def from_optional( @@ -241,9 +240,9 @@ def from_optional( top_k: int = -1, min_p: float = 0.0, seed: Optional[int] = None, - stop: Optional[Union[str, List[str]]] = None, - stop_token_ids: Optional[List[int]] = None, - bad_words: Optional[List[str]] = None, + stop: Optional[Union[str, list[str]]] = None, + stop_token_ids: Optional[list[int]] = None, + bad_words: Optional[list[str]] = None, include_stop_str_in_output: bool = False, ignore_eos: bool = False, max_tokens: Optional[int] = 16, @@ -253,13 +252,13 @@ def from_optional( detokenize: bool = True, skip_special_tokens: bool = True, spaces_between_special_tokens: bool = True, - logits_processors: Optional[List[LogitsProcessor]] = None, + logits_processors: Optional[list[LogitsProcessor]] = None, truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None, output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, guided_decoding: Optional[GuidedDecodingParams] = None, - logit_bias: Optional[Union[Dict[int, float], Dict[str, float]]] = None, - allowed_token_ids: Optional[List[int]] = None, + logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None, + allowed_token_ids: Optional[list[int]] = None, ) -> "SamplingParams": if logit_bias is not None: # Convert token_id to integer @@ -435,7 +434,7 @@ def _verify_greedy_sampling(self) -> None: def update_from_generation_config( self, - generation_config: Dict[str, Any], + generation_config: dict[str, Any], model_eos_token_id: Optional[int] = None) -> None: """Update if there are non-default values from generation_config""" @@ -468,7 +467,7 @@ def sampling_type(self) -> SamplingType: return SamplingType.RANDOM @property - def all_stop_token_ids(self) -> Set[int]: + def all_stop_token_ids(self) -> set[int]: return self._all_stop_token_ids def clone(self) -> "SamplingParams": diff --git a/vllm/sequence.py b/vllm/sequence.py index c0425ba33c9af..6a7b1e62a6045 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -5,11 +5,11 @@ from abc import ABC, abstractmethod from array import array from collections import defaultdict +from collections.abc import Mapping +from collections.abc import Sequence as GenericSequence from dataclasses import dataclass, field from functools import reduce -from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional -from typing import Sequence as GenericSequence -from typing import Set, Tuple, Union +from typing import Any, Callable, Optional, Union import msgspec import torch @@ -50,9 +50,9 @@ class Logprob: # {token_id -> logprob} per each sequence group. None if the corresponding # sequence group doesn't require prompt logprob. -PromptLogprobs = List[Optional[Dict[int, Logprob]]] +PromptLogprobs = list[Optional[dict[int, Logprob]]] # {token_id -> logprob} for each sequence group. 
-SampleLogprobs = List[Dict[int, Logprob]] +SampleLogprobs = list[dict[int, Logprob]] class SequenceStatus(enum.IntEnum): @@ -129,7 +129,7 @@ class SequenceDataDelta( omit_defaults=True): # type: ignore[call-arg] """Delta SequenceData to send to workers per step.""" # A new token to be appended to existing SequenceData. - new_output_token_ids: List[int] + new_output_token_ids: list[int] # Overwriting existing `cumulative_logprob` new_cumulative_logprob: float # Overwriting existing `num_computed_tokens`. @@ -152,7 +152,7 @@ class SequenceData(msgspec.Struct, output_token_ids: The token IDs of the output. cumulative_logprob: The cumulative log probability of the output. """ - # NOTE: we cannot use Union[List, array] because msgspec cannot support + # NOTE: we cannot use Union[list, array] because msgspec cannot support # union of 2 list types. _prompt_token_ids: array _output_token_ids: array = msgspec.field( @@ -160,25 +160,25 @@ class SequenceData(msgspec.Struct, ### The below fields should not be passed as an argument ### _cumulative_logprob: float = 0.0 - _prompt_token_ids_tuple: Tuple[int, + _prompt_token_ids_tuple: tuple[int, ...] = msgspec.field(default_factory=tuple) # The number of tokens that are computed (that run against the model). _num_computed_tokens: int = 0 # The number of tokens with prefix cache hit. _num_cached_tokens: int = 0 _stage: SequenceStage = SequenceStage.PREFILL - _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) + _cached_all_token_ids: list[int] = msgspec.field(default_factory=list) # It is used to get delta input. It is reset when `get_delta_and_reset` # is called. - _new_appended_tokens: List[int] = msgspec.field(default_factory=list) + _new_appended_tokens: list[int] = msgspec.field(default_factory=list) # It is used to compute mrope_position_ids. _mrope_position_delta: Optional[int] = None @staticmethod def from_prompt_token_counts( - *token_counts: Tuple[int, int]) -> "SequenceData": + *token_counts: tuple[int, int]) -> "SequenceData": """ Construct a :class:`SequenceData` instance by concatenating prompt token sequences. @@ -220,14 +220,14 @@ def from_seqs( def __post_init__(self) -> None: assert self._prompt_token_ids.typecode == "l" assert self._output_token_ids.typecode == "l" - self._prompt_token_ids_tuple: Tuple[int, ...] = tuple( + self._prompt_token_ids_tuple: tuple[int, ...] 
= tuple( self._prompt_token_ids) self._update_cached_all_tokens() def _update_cached_all_tokens(self): assert isinstance(self._prompt_token_ids, array) assert isinstance(self._output_token_ids, array) - self._cached_all_token_ids: List[int] = list(self._prompt_token_ids + + self._cached_all_token_ids: list[int] = list(self._prompt_token_ids + self._output_token_ids) @property @@ -235,7 +235,7 @@ def cumulative_logprob(self) -> float: return self._cumulative_logprob @property - def prompt_token_ids(self) -> Tuple[int, ...]: + def prompt_token_ids(self) -> tuple[int, ...]: return self._prompt_token_ids_tuple @prompt_token_ids.setter @@ -252,7 +252,7 @@ def prompt_token_ids_array(self) -> array: return self._prompt_token_ids @property - def output_token_ids(self) -> Tuple[int, ...]: + def output_token_ids(self) -> tuple[int, ...]: return tuple(self._output_token_ids) @output_token_ids.setter @@ -295,12 +295,12 @@ def get_prompt_len(self) -> int: def get_output_len(self) -> int: return len(self._output_token_ids) - def get_token_ids(self) -> List[int]: + def get_token_ids(self) -> list[int]: return self._cached_all_token_ids def get_prefix_token_ids( self, num_tokens: int - ) -> Tuple[Tuple[int, ...], Optional[Tuple[int, ...]]]: + ) -> tuple[tuple[int, ...], Optional[tuple[int, ...]]]: """Get prefix tokens, and make the return value hashable""" prompt_length = self.get_prompt_len() if num_tokens > prompt_length: @@ -351,10 +351,10 @@ def get_last_token_id(self) -> int: return self._prompt_token_ids[-1] return self._output_token_ids[-1] - def get_prompt_token_ids(self) -> Tuple[int, ...]: + def get_prompt_token_ids(self) -> tuple[int, ...]: return self.prompt_token_ids - def get_output_token_ids(self) -> Tuple[int, ...]: + def get_output_token_ids(self) -> tuple[int, ...]: return self.output_token_ids def get_delta_and_reset(self) -> SequenceDataDelta: @@ -432,7 +432,7 @@ def __init__( self.prefix_offset = 0 self.read_offset = 0 # Input + output tokens - self.tokens: Optional[List[str]] = None + self.tokens: Optional[list[str]] = None @property def n_blocks(self) -> int: @@ -443,7 +443,7 @@ def prompt(self) -> Optional[str]: return self.inputs.prompt @property - def prompt_token_ids(self) -> List[int]: + def prompt_token_ids(self) -> list[int]: return self.inputs.prompt_token_ids @property @@ -451,7 +451,7 @@ def prompt_embeds(self) -> Optional[torch.Tensor]: return self.inputs.prompt_embeds @property - def token_type_ids(self) -> List[int]: + def token_type_ids(self) -> list[int]: return self.inputs.token_type_ids @property @@ -463,7 +463,7 @@ def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: return self.inputs.multi_modal_placeholders @property - def mm_processor_kwargs(self) -> Dict[str, Any]: + def mm_processor_kwargs(self) -> dict[str, Any]: return self.inputs.mm_processor_kwargs @property @@ -548,7 +548,7 @@ def reset_state_for_recompute(self): """Reset the sequence states for recomputation.""" self.data.reset_state_for_recompute() - def append_token_id(self, token_id: int, logprobs: Dict[int, + def append_token_id(self, token_id: int, logprobs: dict[int, Logprob]) -> None: assert token_id in logprobs self.output_logprobs.append(logprobs) @@ -563,16 +563,16 @@ def get_prompt_len(self) -> int: def get_output_len(self) -> int: return self.data.get_output_len() - def get_token_ids(self) -> List[int]: + def get_token_ids(self) -> list[int]: return self.data.get_token_ids() - def get_prompt_token_ids(self) -> Tuple[int, ...]: + def get_prompt_token_ids(self) -> tuple[int, ...]: 
return self.data.get_prompt_token_ids() def get_last_token_id(self) -> int: return self.data.get_last_token_id() - def get_output_token_ids(self) -> Tuple[int, ...]: + def get_output_token_ids(self) -> tuple[int, ...]: return self.data.get_output_token_ids() def get_cumulative_logprob(self) -> float: @@ -644,7 +644,7 @@ class SequenceGroup: def __init__( self, request_id: str, - seqs: List[Sequence], + seqs: list[Sequence], arrival_time: float, sampling_params: Optional[SamplingParams] = None, lora_request: Optional[LoRARequest] = None, @@ -686,7 +686,7 @@ def prompt(self) -> Optional[str]: return self.first_seq.prompt @property - def prompt_token_ids(self) -> List[int]: + def prompt_token_ids(self) -> list[int]: return self.first_seq.prompt_token_ids @property @@ -698,7 +698,7 @@ def encoder_prompt(self) -> Optional[str]: if self.encoder_seq is not None else None) @property - def encoder_prompt_token_ids(self) -> Optional[List[int]]: + def encoder_prompt_token_ids(self) -> Optional[list[int]]: # There are either 0 or 1 encoder sequences # If one is present, its prompt token ids are # distinct from the decoder's. @@ -706,7 +706,7 @@ def encoder_prompt_token_ids(self) -> Optional[List[int]]: if self.encoder_seq is not None else None) @property - def token_type_ids(self) -> Optional[List[int]]: + def token_type_ids(self) -> Optional[list[int]]: return self.first_seq.token_type_ids @property @@ -726,7 +726,7 @@ def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: return {} @property - def mm_processor_kwargs(self) -> Dict[str, Any]: + def mm_processor_kwargs(self) -> dict[str, Any]: if self.first_seq.multi_modal_data: return self.first_seq.mm_processor_kwargs elif self.encoder_seq is not None: @@ -823,7 +823,7 @@ def get_max_num_running_seqs(self) -> int: def get_seqs( self, status: Optional[SequenceStatus] = None, - ) -> List[Sequence]: + ) -> list[Sequence]: if status is None: return self.seqs @@ -838,7 +838,7 @@ def is_encoder_decoder(self) -> bool: def get_encoder_seq(self) -> Optional[Sequence]: return self.encoder_seq - def get_finished_seqs(self) -> List[Sequence]: + def get_finished_seqs(self) -> list[Sequence]: if self.is_single_seq: return self.seqs if self.first_seq.is_finished() else [] @@ -897,13 +897,13 @@ class SequenceGroupMetadataDelta( After sending the first SequenceGroupMetadata, vLLM scheduler only sends delta to reduce the data payload size. """ - seq_data_delta: Dict[int, SequenceDataDelta] + seq_data_delta: dict[int, SequenceDataDelta] request_id: str - block_tables: Dict[int, List[int]] + block_tables: dict[int, list[int]] is_prompt: bool do_sample: bool = True token_chunk_size: Optional[int] = None - computed_block_nums: Optional[List[int]] = None + computed_block_nums: Optional[list[int]] = None state: Optional[SequenceGroupState] = msgspec.field( default_factory=lambda: SequenceGroupState()) @@ -947,23 +947,23 @@ class SequenceGroupMetadata( request_id: str is_prompt: bool - seq_data: Dict[int, SequenceData] + seq_data: dict[int, SequenceData] sampling_params: Optional[SamplingParams] - block_tables: Dict[int, List[int]] + block_tables: dict[int, list[int]] do_sample: bool = True pooling_params: Optional[PoolingParams] = None lora_request: Optional[LoRARequest] = None - computed_block_nums: Optional[List[int]] = None + computed_block_nums: Optional[list[int]] = None state: Optional[SequenceGroupState] = msgspec.field( default_factory=lambda: SequenceGroupState()) # "MultiModalDataDict" types. 
We have to use Any due to msgspec # doesn't allow to have union of 2 different dicts. - token_type_ids: Optional[List[int]] = None + token_type_ids: Optional[list[int]] = None multi_modal_data: Optional[Any] = None multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None - mm_processor_kwargs: Optional[Dict[str, Any]] = None + mm_processor_kwargs: Optional[dict[str, Any]] = None encoder_seq_data: Optional[SequenceData] = None - cross_block_table: Optional[List[int]] = None + cross_block_table: Optional[list[int]] = None prompt_adapter_request: Optional[PromptAdapterRequest] = None token_chunk_size: Optional[int] = None @@ -1042,7 +1042,7 @@ class SequenceOutput( """ parent_seq_id: int output_token: int - logprobs: Dict[int, Logprob] + logprobs: dict[int, Logprob] def __repr__(self) -> str: return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, " @@ -1076,7 +1076,7 @@ class CompletionSequenceGroupOutput( array_like=True): # type: ignore[call-arg] """The model output associated with a completion sequence group.""" __metaclass__ = SequenceGroupOutput - samples: List[SequenceOutput] + samples: list[SequenceOutput] # Prompt logprob for each prompt query token. prompt_logprobs: Optional[PromptLogprobs] @@ -1119,7 +1119,7 @@ class IntermediateTensors: contains the hidden states and residuals for a request. """ - tensors: Dict[str, torch.Tensor] + tensors: dict[str, torch.Tensor] def __init__(self, tensors): # manually define this function, so that @@ -1155,7 +1155,7 @@ class PoolerOutput( omit_defaults=True, # type: ignore[call-arg] array_like=True): # type: ignore[call-arg] """The output from a pooling operation in the pooling model.""" - outputs: List[PoolingSequenceGroupOutput] + outputs: list[PoolingSequenceGroupOutput] def __getitem__(self, idx: int) -> PoolingSequenceGroupOutput: return self.outputs[idx] @@ -1172,7 +1172,7 @@ def __eq__(self, other: object): def get_all_seq_ids( - seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: + seq_group_metadata_list: list[SequenceGroupMetadata]) -> list[int]: """Given a list of SequenceGroupMetadata, create a list of all sequence ids. """ @@ -1180,13 +1180,13 @@ def get_all_seq_ids( def get_all_seq_ids_and_request_ids( - seq_group_metadata_list: List[SequenceGroupMetadata] -) -> Tuple[List[int], Dict[str, Set[int]]]: + seq_group_metadata_list: list[SequenceGroupMetadata] +) -> tuple[list[int], dict[str, set[int]]]: """Given a list of SequenceGroupMetadata, create a list of all sequence ids. """ - seq_ids: List[int] = [] - request_id_seq_ids_mapping: DefaultDict[str, Set[int]] = defaultdict(set) + seq_ids: list[int] = [] + request_id_seq_ids_mapping: defaultdict[str, set[int]] = defaultdict(set) for sg in seq_group_metadata_list: for seq_id in sg.seq_data: seq_ids.append(seq_id) @@ -1206,14 +1206,14 @@ class HiddenStates(msgspec.Struct, array_like=True, # all tokens, whereas for decode step, it use used for last accepted tokens. hidden_states: torch.Tensor # The sequence group metadata list. Only needed for decode step. - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None + seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] = None # Scorer hidden states of the 2nd last token proposed by the proposer ( # irrespective of whether it was accepted or not). Only used for cases when # last proposed token is accepted (i.e., in case of bonus tokens). For the # case of no bonus tokens, these are ignored. 
second_last_token_hidden_states: Optional[torch.Tensor] = None - _seq_ids: List[int] = msgspec.field(default_factory=list) + _seq_ids: list[int] = msgspec.field(default_factory=list) def __post_init__(self): if self.seq_group_metadata_list is not None: @@ -1221,12 +1221,12 @@ def __post_init__(self): self._seq_ids = get_all_seq_ids(self.seq_group_metadata_list) @property - def seq_ids(self) -> List[int]: + def seq_ids(self) -> list[int]: return self._seq_ids def update(self, hidden_states: torch.Tensor, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], second_last_token_hidden_states: Optional[torch.Tensor] = None): """Update hidden states from target model invocation. Only used for decode steps""" @@ -1244,7 +1244,7 @@ def update(self, ]) def prune(self, - seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: + seq_group_metadata_list: list[SequenceGroupMetadata]) -> None: """Prune to provided list of sequence ids. Only used for decode steps. """ # Currently this prunes all seq_ids not present in @@ -1287,16 +1287,16 @@ class ExecuteModelRequest( """The model execution request, containing CPU metadata only. The LLM engine should create an instance of this class for each request batch.""" # The sequence group metadata list. - seq_group_metadata_list: List[Union[SequenceGroupMetadata, + seq_group_metadata_list: list[Union[SequenceGroupMetadata, SequenceGroupMetadataDelta]] # Blocks to swap in. List of CPU -> GPU block number. - blocks_to_swap_in: List[Tuple[int, + blocks_to_swap_in: list[tuple[int, int]] = msgspec.field(default_factory=list) # Blocks to swap out. List of GPU -> CPU block number. - blocks_to_swap_out: List[Tuple[int, + blocks_to_swap_out: list[tuple[int, int]] = msgspec.field(default_factory=list) # Blocks to copy. Source to dest block. - blocks_to_copy: List[Tuple[int, int]] = msgspec.field(default_factory=list) + blocks_to_copy: list[tuple[int, int]] = msgspec.field(default_factory=list) # Virtual engine ID for pipeline parallel. virtual_engine: int = 0 # The number of slots for lookahead decoding. @@ -1310,7 +1310,7 @@ class ExecuteModelRequest( # The step index for spec model input. spec_step_idx: Optional[int] = None # Finished request ids since last step. - finished_requests_ids: List[str] = msgspec.field(default_factory=list) + finished_requests_ids: list[str] = msgspec.field(default_factory=list) # The last sampled token ids for multi step decoding. 
last_sampled_token_ids: Optional[torch.Tensor] = None # Async callback @@ -1344,7 +1344,7 @@ def current_step(self) -> int: return state.current_step def clone( - self, seq_group_metadata_list: List[Union[SequenceGroupMetadata, + self, seq_group_metadata_list: list[Union[SequenceGroupMetadata, SequenceGroupMetadataDelta]] ) -> "ExecuteModelRequest": """Clone the request with a new sequence group metadata list.""" @@ -1371,13 +1371,13 @@ class SequenceGroupBase: assembled_seq_group: Optional[SequenceGroup] = None # seq id to a unique index inside this group - seq_id_to_index: Dict[str, int] = field(default_factory=dict) + seq_id_to_index: dict[str, int] = field(default_factory=dict) # seq ids to be finished - to_be_finished: Dict[str, SequenceGroup] = field(default_factory=dict) + to_be_finished: dict[str, SequenceGroup] = field(default_factory=dict) # seq id to finished sequences - finished_reqs: Dict[str, SequenceGroup] = field(default_factory=dict) + finished_reqs: dict[str, SequenceGroup] = field(default_factory=dict) streaming: bool = False diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index e08ed742a5225..9d2524656a52d 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 from array import array +from collections.abc import Iterator from itertools import chain, count -from typing import Iterator, List, Optional, Tuple +from typing import Optional import torch @@ -103,10 +104,10 @@ def score_proposals( def _expand_batch( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_token_ids_list: List[List[TokenId]], - proposal_lens_list: List[int], - ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]: + seq_group_metadata_list: list[SequenceGroupMetadata], + proposal_token_ids_list: list[list[TokenId]], + proposal_lens_list: list[int], + ) -> tuple[list[int], list[int], list[SequenceGroupMetadata], int]: """Given the input sequences and potentially multiple corresponding proposal tokens, create a new batch where each sequence has a single query token. @@ -139,8 +140,8 @@ def _expand_batch( def _contract_non_speculative( self, scores: SpeculativeScores, - seq_group_metadata_list: List[SequenceGroupMetadata], - non_spec_indices: List[int], non_spec_outputs: SpeculativeScores, + seq_group_metadata_list: list[SequenceGroupMetadata], + non_spec_indices: list[int], non_spec_outputs: SpeculativeScores, has_prompt_log: bool) -> SpeculativeScores: """ Augment input `scores` with non-speculative requests outputs. @@ -183,10 +184,10 @@ def _contract_non_speculative( def _contract_batch( self, - contracted_seq_group_metadata_list: List[SequenceGroupMetadata], + contracted_seq_group_metadata_list: list[SequenceGroupMetadata], target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, num_scoring_tokens: int, - non_spec_indices: List[int], spec_indices: List[int], + non_spec_indices: list[int], spec_indices: list[int], k: int) -> SpeculativeScores: """Contract the expanded batch back into its original size. 
This maps the scores of speculative tokens back to their original @@ -314,10 +315,10 @@ def _contract_batch_all_spec( def _create_scoring_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] + seq_group_metadata_list: list[SequenceGroupMetadata], + proposal_token_ids: list[list[TokenId]], # shape: [batch_size, k] target_seq_ids_iter: Iterator[TargetSeqId], - ) -> List[SequenceGroupMetadata]: + ) -> list[SequenceGroupMetadata]: """Given the original input sequences and proposed tokens from the draft model, create a list of target sequences that can be used for scoring. @@ -344,10 +345,10 @@ def _create_scoring_model_input( def _create_target_seq_group_metadata( self, input_seq_group_metadata: SequenceGroupMetadata, - proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] + proposal_token_ids: list[list[TokenId]], # shape: [batch_size, k] batch_index: int, target_seq_ids_iter: Iterator[TargetSeqId], - ) -> List[SequenceGroupMetadata]: + ) -> list[SequenceGroupMetadata]: """Given an input sequence group metadata and a list of draft tokens, create a list of target SequenceGroupMetadata, one for each token id that needs to be scored. @@ -367,7 +368,7 @@ def _create_target_seq_group_metadata( proposal_token_ids[batch_index]) sampling_params = input_seq_group_metadata.sampling_params - target_seq_group_metadata_list: List[SequenceGroupMetadata] = [] + target_seq_group_metadata_list: list[SequenceGroupMetadata] = [] for i, token_ids in enumerate(token_ids_to_score): target_seq_group_metadata_list.append( self._create_single_target_seq_group_metadata( @@ -385,7 +386,7 @@ def _create_single_target_seq_group_metadata( seq_group_metadata: SequenceGroupMetadata, seq_id: SeqId, target_seq_id: TargetSeqId, - token_ids: List[TokenId], + token_ids: list[TokenId], sampling_params: SamplingParams, ) -> SequenceGroupMetadata: """Create a single target SequenceGroupMetadata. @@ -433,7 +434,7 @@ def _create_single_target_seq_group_metadata( @staticmethod def _split_scoring_output( sampler_output: SamplerOutput, num_scoring_tokens: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """Split the target model output into speculative and non-speculative @@ -468,7 +469,7 @@ def _split_scoring_output( @staticmethod def _create_target_seq_id_iterator( - seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: + seq_ids: list[SeqId]) -> Iterator[TargetSeqId]: """Create an iterator for creating target sequence ids. Target sequence ids are distinct from sequence ids because we create a distinct target sequence id for each proposal token to be scored. @@ -480,8 +481,8 @@ def _create_target_seq_id_iterator( @staticmethod def _get_token_ids_to_score( - full_spec_token_ids: List[TokenId] # shape: [k] - ) -> List[List[TokenId]]: + full_spec_token_ids: list[TokenId] # shape: [k] + ) -> list[list[TokenId]]: """Given an int tensor of proposal token ids, return a list of token ids that should be scored. 
@@ -497,7 +498,7 @@ def _get_token_ids_to_score( [0, 1, 2] [0, 1, 2, 3] """ - empty_token_ids: List[TokenId] = [] + empty_token_ids: list[TokenId] = [] token_ids_to_score = [empty_token_ids] token_ids_to_score.extend(full_spec_token_ids[:i + 1] diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index c54e6abe18d73..eb1bb04cdaa8f 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from typing import Optional import torch @@ -171,12 +171,12 @@ def set_indices_of_seq_with_bonus_tokens(self, def execute_model( self, model_input: ModelRunnerInputBase, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], previous_hidden_states: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, **kwargs, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: """Executes num_steps forward passes with advacement of input tensors on the GPU. Look at supports_gpu_multi_step(..) for pre-conditions. @@ -268,7 +268,7 @@ def execute_model( model_executable = self.model hidden_states = previous_hidden_states - outputs: List[SamplerOutput] = [] + outputs: list[SamplerOutput] = [] for step in range(num_steps): multi_modal_kwargs = model_input.multi_modal_kwargs or {} diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index dd085ad776384..3ae9c10f31e4b 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import List, Optional, Set, Union +from typing import Optional, Union import torch @@ -58,7 +58,7 @@ class SpeculativeScores: # Scoring model may also return logprobs for prompt tokens # for each request, when chunked prefill is enabled. - prompt_logprobs: Optional[List[PromptLogprobs]] = None + prompt_logprobs: Optional[list[PromptLogprobs]] = None def __repr__(self): return (f"SpeculativeScores(" @@ -74,7 +74,7 @@ def get_spec_proposals( execute_model_req: ExecuteModelRequest, # If set, this contains all sequence IDs that were assigned # bonus tokens in their last forward pass. - seq_ids_with_bonus_token_in_last_step: Set[int], + seq_ids_with_bonus_token_in_last_step: set[int], ) -> SpeculativeProposals: raise NotImplementedError diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py index 0b62a988e8b26..9aa3bda34b669 100644 --- a/vllm/spec_decode/medusa_worker.py +++ b/vllm/spec_decode/medusa_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import weakref -from typing import List, Optional, Set, Tuple +from typing import Optional import torch @@ -45,8 +45,8 @@ def sampler_output( execute_model_req: ExecuteModelRequest, sample_len: int, # Unused parameter. - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[List[SamplerOutput], bool]: + seq_ids_with_bonus_token_in_last_step: set[int], + ) -> tuple[list[SamplerOutput], bool]: """Run the model forward pass to generate sample_len future tokens. 
Returns the list of sampler output, one per layer, along with indicator of whether torch tensor in sampler output need to be transposed in @@ -76,13 +76,13 @@ def sampler_output( def _prepare_input_tensors( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[List[int], List[int]]: + seq_group_metadata_list: Optional[list[SequenceGroupMetadata]], + ) -> tuple[list[int], list[int]]: if not seq_group_metadata_list: return [], [] - seq_lens: List[int] = [] - query_lens: List[int] = [] + seq_lens: list[int] = [] + query_lens: list[int] = [] for seq_group_metadata in seq_group_metadata_list: is_prompt = seq_group_metadata.is_prompt @@ -105,7 +105,7 @@ def _prepare_input_tensors( def get_spec_proposals( self, execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: Set[int], + seq_ids_with_bonus_token_in_last_step: set[int], ) -> SpeculativeProposals: """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py index bdaf31895e25d..5074a4f93fdd1 100644 --- a/vllm/spec_decode/mlp_speculator_worker.py +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Set, Tuple +from typing import Optional import torch @@ -24,8 +24,8 @@ def sampler_output( sample_len: int, # Unused parameter. MLPSpeculatorWorker does not use the KV Cache and # therefore does not need this parameter. - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[List[SamplerOutput], bool]: + seq_ids_with_bonus_token_in_last_step: set[int], + ) -> tuple[list[SamplerOutput], bool]: """Run the model forward pass to generate sample_len future tokens. Returns the list of sampler output, one per layer, along with indicator of whether torch tensor in sampler output need to be transposed in @@ -59,14 +59,14 @@ def sampler_output( def _prepare_input_tensors( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[torch.Tensor, List[int], List[int]]: + seq_group_metadata_list: Optional[list[SequenceGroupMetadata]], + ) -> tuple[torch.Tensor, list[int], list[int]]: if not seq_group_metadata_list: return torch.empty(0, device=self.device), [], [] - input_tokens: List[int] = [] - seq_lens: List[int] = [] - query_lens: List[int] = [] + input_tokens: list[int] = [] + seq_lens: list[int] = [] + query_lens: list[int] = [] for seq_group_metadata in seq_group_metadata_list: is_prompt = seq_group_metadata.is_prompt diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index c28d413efe747..0b414fa505741 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -2,7 +2,6 @@ import copy import weakref -from typing import Dict, List, Set, Tuple import torch @@ -61,8 +60,8 @@ def sampler_output( self, execute_model_req: ExecuteModelRequest, sample_len: int, - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[List[SamplerOutput], bool]: + seq_ids_with_bonus_token_in_last_step: set[int], + ) -> tuple[list[SamplerOutput], bool]: """Run the model forward pass sample_len times. 
Returns the list of sampler output, one per model forward pass, along with indicator of whether torch tensor in sampler output need to be transposed in latter @@ -79,7 +78,7 @@ def sampler_output( execute_model_req, seq_ids_with_bonus_token_in_last_step) # Run model sample_len times. - model_outputs: List[SamplerOutput] = [] + model_outputs: list[SamplerOutput] = [] if current_platform.is_cuda_alike() and isinstance( self.model_runner, TP1DraftModelRunner ) and self.model_runner.supports_gpu_multi_step(expanded_request): @@ -97,7 +96,7 @@ def sampler_output( # and other restrictions that are part of DraftModelRunner's # supports_gpu_multi_step(..) for _ in range(sample_len): - model_output: List[SamplerOutput] = self.worker.execute_model( + model_output: list[SamplerOutput] = self.worker.execute_model( execute_model_req=expanded_request) assert (len(model_output) == 1 ), "composing multistep workers not supported" @@ -119,7 +118,7 @@ def sampler_output( def _expand_execute_model_request( execute_model_req: ExecuteModelRequest, seq_with_bonus_token_in_last_step: set, - ) -> Tuple[ExecuteModelRequest, List[int]]: + ) -> tuple[ExecuteModelRequest, list[int]]: """ Expands the execute model request based on sequences with bonus tokens. @@ -136,11 +135,11 @@ def _expand_execute_model_request( contain bonus tokens. Returns: - Tuple[ExecuteModelRequest, List[int]]: The updated execute model + tuple[ExecuteModelRequest, list[int]]: The updated execute model request with expanded sequences and a list of indices corresponding to the original sequence groups. """ - updated_seq_group_metadata_list: List[SequenceGroupMetadata] = [] + updated_seq_group_metadata_list: list[SequenceGroupMetadata] = [] updated_execute_model_req = execute_model_req.clone( updated_seq_group_metadata_list) indices_of_original_sequence_groups = [] @@ -179,8 +178,8 @@ def _expand_execute_model_request( @staticmethod def _filter_model_output( - expanded_batch_outputs: List[SamplerOutput], - output_indices_to_retain: torch.Tensor) -> List[SamplerOutput]: + expanded_batch_outputs: list[SamplerOutput], + output_indices_to_retain: torch.Tensor) -> list[SamplerOutput]: """ Filters the model output to include only the specified sequence outputs. This method contracts the expanded batch output from the @@ -188,13 +187,13 @@ def _filter_model_output( provided indices. Args: - expanded_batch_output (List[SamplerOutput]): The expanded output + expanded_batch_output (list[SamplerOutput]): The expanded output batch from the model. output_indices_to_retain (torch.Tensor): Indices of the model outputs to retain. Returns: - List[SamplerOutput]: A list containing the filtered model + list[SamplerOutput]: A list containing the filtered model outputs for the specified indices. """ return [ @@ -231,9 +230,9 @@ def get_spec_proposals( @staticmethod def _append_new_tokens( - model_output: List[SamplerOutput], - seq_group_metadata_list: List[SequenceGroupMetadata], - indices_of_seq_with_bonus_tokens: List[int]) -> None: + model_output: list[SamplerOutput], + seq_group_metadata_list: list[SequenceGroupMetadata], + indices_of_seq_with_bonus_tokens: list[int]) -> None: """Given model output from a single run, append the tokens to the sequences. This is normally done outside of the worker, but it is required if the worker is to perform multiple forward passes. 
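# Hedged sketch of the idea documented above: when the draft worker runs several
# forward passes back to back, each pass must see the token sampled by the
# previous one, so the worker appends tokens itself instead of leaving that to
# the engine. `run_draft_pass` and the plain list-of-ints sequences below are
# hypothetical stand-ins, not vLLM objects.
from collections.abc import Callable


def multi_step_draft(seqs: list[list[int]],
                     run_draft_pass: Callable[[list[list[int]]], list[int]],
                     sample_len: int) -> list[list[int]]:
    """Run sample_len draft passes, feeding each pass the tokens of the last."""
    proposals: list[list[int]] = [[] for _ in seqs]
    for _ in range(sample_len):
        sampled = run_draft_pass(seqs)  # one sampled token id per sequence
        for seq, props, tok in zip(seqs, proposals, sampled):
            seq.append(tok)   # append so the next pass conditions on it
            props.append(tok)
    return proposals


# Toy draft model that always proposes last_token + 1:
print(multi_step_draft([[3], [10]], lambda s: [x[-1] + 1 for x in s], 3))
# -> [[4, 5, 6], [11, 12, 13]]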
@@ -280,7 +279,7 @@ def _shallow_copy_seq_group_metadata( new_seq_group_metadata = copy.copy(seq_group_metadata) # We must shallow-copy seq_data as we will append token ids - new_seq_data: Dict[int, SequenceData] = {} + new_seq_data: dict[int, SequenceData] = {} for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): new_seq_data[seq_id] = copy.copy(old_seq_data) new_seq_data[seq_id].output_token_ids =\ @@ -292,7 +291,7 @@ def _shallow_copy_seq_group_metadata( @staticmethod def _copy_seq_metadata_excluding_last_token( seq_group_metadata: SequenceGroupMetadata, - seq_ids_to_copy: Set[int], + seq_ids_to_copy: set[int], ) -> SequenceGroupMetadata: """ Creates a shallow copy of the given SequenceGroupMetadata, retaining @@ -303,7 +302,7 @@ def _copy_seq_metadata_excluding_last_token( Parameters: seq_group_metadata (SequenceGroupMetadata): The original sequence group metadata. - seq_ids_to_copy (Set[int]): The set of sequence IDs to include in the + seq_ids_to_copy (set[int]): The set of sequence IDs to include in the copy. Returns: @@ -313,7 +312,7 @@ def _copy_seq_metadata_excluding_last_token( # Shallow-copy the SequenceGroupMetadata. new_seq_group_metadata = copy.copy(seq_group_metadata) # Shallow-copy seq_data and modify the output_token_ids. - new_seq_data: Dict[int, SequenceData] = {} + new_seq_data: dict[int, SequenceData] = {} for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): if (seq_id in seq_ids_to_copy): new_seq_data[seq_id] = copy.copy(old_seq_data) @@ -332,7 +331,7 @@ def _copy_seq_metadata_excluding_last_token( return new_seq_group_metadata def _assert_enough_kv_space( - self, seq_group_metadata_list: List[SequenceGroupMetadata], + self, seq_group_metadata_list: list[SequenceGroupMetadata], num_steps: int) -> None: """Assert there are enough physical blocks per sequence to store the current KV plus additional KV from num_steps tokens. diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 57ae173af6744..7e89e824b67b9 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import weakref -from typing import List, Optional, Set, Tuple +from typing import Optional import torch import torch.nn as nn @@ -71,8 +71,8 @@ def sampler_output( sample_len: int, # Unused parameter. NGramWorker does not use the KV Cache and # therefore does not need this parameter. - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]: + seq_ids_with_bonus_token_in_last_step: set[int], + ) -> tuple[Optional[list[Optional[SamplerOutput]]], bool]: """NGram match algo to pick proposal candidate. Returns the list of sampler output, one per SequenceGroupMetadata. 
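# Hedged sketch of the "NGram match algo" referenced in the docstring above:
# prompt-lookup proposal, i.e. find the most recent earlier occurrence of the
# current n-token suffix and propose the tokens that followed it. The function
# name and window arguments are illustrative only, not vLLM's API; token ids
# are plain ints rather than tensors.
from typing import Optional


def propose_by_ngram_lookup(token_ids: list[int],
                            max_ngram: int,
                            k: int) -> Optional[list[int]]:
    """Return up to k tokens that followed a matching suffix n-gram, if any."""
    for n in range(max_ngram, 0, -1):
        if len(token_ids) <= n:
            continue
        suffix = token_ids[-n:]
        # Walk backwards so the most recent earlier match wins.
        for start in range(len(token_ids) - n - 1, -1, -1):
            if token_ids[start:start + n] == suffix:
                continuation = token_ids[start + n:start + n + k]
                if continuation:
                    return continuation
    return None  # no match -> no speculative proposal for this sequence


# The suffix [5, 6] was seen earlier followed by 7, 8, so those are proposed:
print(propose_by_ngram_lookup([1, 5, 6, 7, 8, 2, 5, 6], max_ngram=3, k=2))
# -> [7, 8]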
@@ -82,8 +82,8 @@ def sampler_output( self._raise_if_unsupported(execute_model_req) has_spec_out = False - token_id_list: List[Optional[torch.Tensor]] = [] - token_prob_list: List[Optional[torch.Tensor]] = [] + token_id_list: list[Optional[torch.Tensor]] = [] + token_prob_list: list[Optional[torch.Tensor]] = [] for idx, seq_group_metadata in enumerate( execute_model_req.seq_group_metadata_list): seq_data = next(iter(seq_group_metadata.seq_data.values())) @@ -142,7 +142,7 @@ def sampler_output( if not has_spec_out: return None, False - outputs: List[Optional[SamplerOutput]] = [] + outputs: list[Optional[SamplerOutput]] = [] for idx in range(len(execute_model_req.seq_group_metadata_list)): if token_id_list[idx] is None: outputs.append(None) @@ -164,7 +164,7 @@ def get_spec_proposals( execute_model_req: ExecuteModelRequest, # Unused parameter. NGramWorker does not use the KV Cache and # therefore does not need this parameter. - seq_ids_with_bonus_token_in_last_step: Set[int], + seq_ids_with_bonus_token_in_last_step: set[int], ) -> SpeculativeProposals: """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index 2829d631b49ee..bb1b11465e525 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import List, Optional, Set, Tuple +from typing import Optional from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest @@ -23,8 +23,8 @@ def sampler_output( # This parameter is only used by the MultiStepWorker, which relies on # the KV cache for token generation. It is not used by workers that # do not utilize the KV cache. - seq_ids_with_bonus_token_in_last_step: Set[int] - ) -> Tuple[Optional[List[SamplerOutput]], bool]: + seq_ids_with_bonus_token_in_last_step: set[int] + ) -> tuple[Optional[list[SamplerOutput]], bool]: raise NotImplementedError def set_include_gpu_probs_tensor(self) -> None: @@ -42,11 +42,11 @@ class NonLLMProposerWorkerBase(ProposerWorkerBase, ABC): def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: """get_spec_proposals is used to get the proposals""" return [] - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """This is never called on the proposer, only the target model""" raise NotImplementedError diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 6919562465097..87f5803b33d2d 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Set, Tuple +from typing import Optional import torch import torch.nn as nn @@ -48,12 +48,12 @@ def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, logger.info("Wrapping {%s} in {%s}", type(worker), cls) return cls(worker, draft_ranks) - def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]): + def __init__(self, worker: MultiStepWorker, draft_ranks: list[int]): """Create a SmallerTpProposerWorker. 
Args: worker (MultiStepWorker): an actual worker wrapped with this class - draft_ranks (List[int]): if this value is given, only the GPU ranks + draft_ranks (list[int]): if this value is given, only the GPU ranks written in this value participate in draft generation """ self._worker = worker @@ -105,7 +105,7 @@ def load_model(self) -> None: with self._patch_tensor_parallel_group(): self._worker.load_model() - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: if self._is_dummy: # this case is not used now return -1, -1 @@ -125,8 +125,8 @@ def sampler_output( self, execute_model_req: ExecuteModelRequest, sample_len: int, - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[List[SamplerOutput], bool]: + seq_ids_with_bonus_token_in_last_step: set[int], + ) -> tuple[list[SamplerOutput], bool]: # Do not check _is_dummy, as it's always called by get_spec_proposals return self._worker.sampler_output( execute_model_req, sample_len, @@ -135,7 +135,7 @@ def sampler_output( def get_spec_proposals( self, execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: Set[int], + seq_ids_with_bonus_token_in_last_step: set[int], ) -> SpeculativeProposals: """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. @@ -157,7 +157,7 @@ def get_model(self) -> nn.Module: def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: if self._is_dummy: return [] diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 871a3aee63063..9c48e231c2d4d 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -3,7 +3,7 @@ import copy from collections import defaultdict from functools import cached_property -from typing import Any, Dict, List, Optional, Set, Tuple, Type +from typing import Any, Optional import torch import torch.nn as nn @@ -148,7 +148,7 @@ class SpecDecodeWorker(LoRANotSupportedWorkerBase): def create_worker( cls, scorer_worker: WorkerBase, - draft_worker_kwargs: Dict[str, Any], + draft_worker_kwargs: dict[str, Any], disable_mqa_scorer: bool, disable_by_batch_size: Optional[int], draft_token_acceptance_method: str, @@ -324,10 +324,10 @@ def __init__( # Tracks the sequence IDs that received a bonus token ID in # their last forward pass. Needed only if KV cache is being # used for token generation such as in the case of MultiStepWorker. - self._seq_with_bonus_token_in_last_step: Set[int] = set() + self._seq_with_bonus_token_in_last_step: set[int] = set() # Tracks the currently active request ids and the sequence IDs # corresponding to them - self._request_id_seq_id_mapping: Dict[str, Set[int]] = defaultdict(set) + self._request_id_seq_id_mapping: dict[str, set[int]] = defaultdict(set) # Tracks if the proposer worker uses the KV cache or not. 
self.probs_dtype = self.spec_decode_sampler.probs_dtype @@ -374,7 +374,7 @@ def init_device(self) -> None: self.spec_decode_sampler.init_tensors(self.rank, device_type=self.device) - scorer_cls: Type[SpeculativeScorer] + scorer_cls: type[SpeculativeScorer] if self.disable_mqa_scorer: scorer_cls = BatchExpansionTop1Scorer logger.info("[Speculative Decoding] Use batch " @@ -419,7 +419,7 @@ def _configure_model_sampler_for_spec_decode(self): self.proposer_worker.set_include_gpu_probs_tensor() self.proposer_worker.set_should_modify_greedy_probs_inplace() - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Determine the number of cache blocks to use. This is done by profiling the scorer model (which is typically the @@ -456,7 +456,7 @@ def get_model(self) -> nn.Module: def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: """Perform speculative decoding on the input batch. """ if self.rank != self._driver_rank: @@ -560,7 +560,7 @@ def _should_disable_all_speculation( def _maybe_disable_speculative_tokens( self, disable_all_speculation: bool, - seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: + seq_group_metadata_list: list[SequenceGroupMetadata]) -> None: if not disable_all_speculation: return @@ -574,7 +574,7 @@ def _maybe_disable_speculative_tokens( def _serialize_sampler_output_no_logprobs( self, execute_model_req: ExecuteModelRequest, - sampler_output: SamplerOutput) -> List[SamplerOutput]: + sampler_output: SamplerOutput) -> list[SamplerOutput]: """ Creates and returns a `SamplerOutput` with only the token IDs being serialized to CPU and populated in `CompletionSequenceGroupOutput`. @@ -609,7 +609,7 @@ def _serialize_sampler_output_no_logprobs( execute_model_req.seq_group_metadata_list \ for seq_id, seq_data in sg.seq_data.items() ] - completion_seq_group_output_list: List[ + completion_seq_group_output_list: list[ CompletionSequenceGroupOutput] = [] output_index = 0 # Make sure the non-terminal prefill chunks are still aligned with @@ -664,7 +664,7 @@ def _serialize_sampler_output_no_logprobs( @nvtx_range("spec_decode_worker._run_no_spec") def _run_no_spec(self, execute_model_req: ExecuteModelRequest, - skip_proposer: bool) -> List[SamplerOutput]: + skip_proposer: bool) -> list[SamplerOutput]: """Run a single generation step without any speculation. The input is sent to the proposer and scorer model so that the KV cache is consistent between the two. When skip_proposer is True, the proposer model is @@ -759,7 +759,7 @@ def _run_non_driver_rank(self) -> bool: @nvtx_range("spec_decode_worker._run_speculative_decoding_step") def _run_speculative_decoding_step( self, execute_model_req: ExecuteModelRequest, - num_lookahead_slots: int) -> List[SamplerOutput]: + num_lookahead_slots: int) -> list[SamplerOutput]: """Execute a single step of speculative decoding. 
This invokes the proposer worker to get k speculative tokens for each @@ -838,11 +838,11 @@ def _run_speculative_decoding_step( @nvtx_range("spec_decode_worker._verify_tokens") def _verify_tokens( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], proposal_scores: SpeculativeScores, proposals: SpeculativeProposals, max_proposal_len: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: """Determine which speculative tokens are accepted using the probabilities of each token according to the proposer and scorer models. @@ -875,7 +875,7 @@ def _verify_tokens( proposal_token_ids = proposals.proposal_token_ids[spec_indices] # Sampler arguments - sampler_extra_kwargs: Dict[str, Any] = {} + sampler_extra_kwargs: dict[str, Any] = {} if self.generators and isinstance(self.spec_decode_sampler, SpecDecodeStochasticBaseSampler): sampler_extra_kwargs["seeded_seqs"] = { @@ -934,14 +934,14 @@ def _verify_tokens( def _create_output_sampler_list( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] target_logprobs: torch.Tensor, # shape: [batch_size, k+1, vocab_size] prompt_logprobs: Optional[ torch.Tensor], # shape: [nprompt_tokens, vocab_size] k: int, - stage_times: Tuple[float, float, float], - ) -> List[SamplerOutput]: + stage_times: tuple[float, float, float], + ) -> list[SamplerOutput]: """Given the accepted token ids, create a list of SamplerOutput. The output is padded with -1 tokens such that each sequence has @@ -984,7 +984,7 @@ def _create_output_sampler_list( # Non-terminal prefill chunks will end up here as rows with just -1s # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] while # terminal chunks will only have one generated token at time 0. - sampler_output_list: List[SamplerOutput] = [] + sampler_output_list: list[SamplerOutput] = [] # Prefills are not multi-step (return at most 1 token), in order to # avoid padding or repetition to fit decodes, we separate them. @@ -1058,7 +1058,7 @@ def _create_output_sampler_list( if not sg.is_prompt): break - step_output_token_ids: List[CompletionSequenceGroupOutput] = [] + step_output_token_ids: list[CompletionSequenceGroupOutput] = [] for sequence_index in range(batch_size): seq_meta = seq_group_metadata_list[sequence_index] # Prompts already processed above. @@ -1124,9 +1124,9 @@ def _create_dummy_logprob_lists( batch_size: int, num_steps: int, num_top_k: int, - ) -> Tuple[List[List[int]], List[List[float]], - List[List[List[Optional[float]]]], - List[List[List[Optional[int]]]]]: + ) -> tuple[list[list[int]], list[list[float]], + list[list[list[Optional[float]]]], + list[list[list[Optional[int]]]]]: """ Creates and returns four dummy lists representing token probabilities and their ranks. 
@@ -1153,10 +1153,10 @@ def _create_dummy_logprob_lists( for _ in range(num_steps)] accepted_token_id_logprobs_by_step = [[0.0] * batch_size for _ in range(num_steps)] - topk_logprobs_by_step: List[List[List[Optional[float]]]] = [[ + topk_logprobs_by_step: list[list[list[Optional[float]]]] = [[ [None] * num_top_k for _ in range(batch_size) ] for _ in range(num_steps)] - topk_indices_by_step: List[List[List[Optional[int]]]] = [[ + topk_indices_by_step: list[list[list[Optional[int]]]] = [[ [None] * num_top_k for _ in range(batch_size) ] for _ in range(num_steps)] return (accepted_token_id_ranks_by_step, @@ -1168,9 +1168,9 @@ def _create_logprob_lists_from_tensors( target_logprobs_by_step: torch.Tensor, accepted_token_ids_by_step: torch.Tensor, num_top_k: int, - ) -> Tuple[List[List[int]], List[List[float]], - List[List[List[Optional[float]]]], - List[List[List[Optional[int]]]]]: + ) -> tuple[list[list[int]], list[list[float]], + list[list[list[Optional[float]]]], + list[list[list[Optional[int]]]]]: """ Creates and returns four lists representing token probabilities and their ranks. @@ -1232,9 +1232,9 @@ def _track_finished_requests(self, execute_model_req: ExecuteModelRequest): del self._request_id_seq_id_mapping[finished_request] def _track_sequences_with_bonus_tokens( - self, seq_ids: List[int], - request_ids_seq_ids_mapping: Dict[str, Set[int]], - accepted_token_ids_by_step: List[List[int]]): + self, seq_ids: list[int], + request_ids_seq_ids_mapping: dict[str, set[int]], + accepted_token_ids_by_step: list[list[int]]): """ Updates the internal data structures which keep track of sequences which have been assigned bonus tokens in their last forward pass. diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py index 08e773c562bf8..b13a070c2e454 100644 --- a/vllm/spec_decode/target_model_runner.py +++ b/vllm/spec_decode/target_model_runner.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from typing import Optional from vllm.sequence import SequenceGroupMetadata from vllm.worker.model_runner_base import (ModelRunnerBase, @@ -28,9 +28,9 @@ def __init__(self, model_runner: ModelRunnerBase): def prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, + finished_requests_ids: Optional[list[str]] = None, ) -> ModelRunnerInputBase: model_input: ModelRunnerInputBase =\ self.model_runner.prepare_model_input( diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index b538923c03e74..14d5891c12a71 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Set, Tuple +from typing import Optional import torch @@ -44,7 +44,7 @@ def __init__( def get_spec_proposals( self, execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: Set[int], + seq_ids_with_bonus_token_in_last_step: set[int], ) -> SpeculativeProposals: """Get speculative proposals given the input batch. 
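# Hedged illustration of the annotation style adopted throughout this diff:
# built-in generics (PEP 585) such as list[int], dict[str, int], tuple[int, ...]
# and set[int], with Mapping/Sequence taken from collections.abc, while Optional
# and Union stay in typing. The function below is a made-up example used only to
# show the spelling; it is not a vLLM API.
from collections.abc import Mapping, Sequence
from typing import Optional


def describe_batch(seq_ids: list[int],
                   block_tables: dict[int, list[int]],
                   finished: set[int],
                   extras: Optional[Mapping[str, Sequence[int]]] = None
                   ) -> tuple[int, int, int]:
    """Return (num sequences, num finished, total blocks) for a toy batch."""
    total_blocks = sum(len(blocks) for blocks in block_tables.values())
    num_finished = sum(1 for s in seq_ids if s in finished)
    return len(seq_ids), num_finished, total_blocks


# Valid at runtime on Python 3.9+, so typing.List/Dict/Tuple/Set are unnecessary:
print(describe_batch([1, 2, 3], {1: [0, 4], 2: [5]}, {2}))
# -> (3, 1, 3)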
@@ -115,18 +115,18 @@ def get_spec_proposals( def _split_by_proposal_len( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], proposal_len: int, - ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]: + ) -> tuple[list[int], list[SequenceGroupMetadata], list[int]]: """Split sequences by two groups: 1. Sequences with non-zero proposal length. 2. Sequences with zero proposal length (due to disabled speculation or exceed the maximum model length). """ - proposal_lens: List[int] = [] - nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = [] - nonzero_proposal_len_indices: List[int] = [] + proposal_lens: list[int] = [] + nonzero_proposal_len_seqs: list[SequenceGroupMetadata] = [] + nonzero_proposal_len_indices: list[int] = [] for i, seq_group_metadata in enumerate(seq_group_metadata_list): # The speculative decoding for this request has either been disabled # (e.g. due to high traffic) or this is a prompt request. @@ -174,9 +174,9 @@ def _remove_no_proposal_seqs(proposal_lens, maybe_sampler_output, return (proposal_lens, maybe_sampler_output, nonzero_proposal_len_indices) - new_proposal_lens: List[int] = [] - new_nonzero_proposal_len_indices: List[int] = [] - new_maybe_sampler_output: List[SamplerOutput] = [] + new_proposal_lens: list[int] = [] + new_nonzero_proposal_len_indices: list[int] = [] + new_maybe_sampler_output: list[SamplerOutput] = [] nonzero_proposal_len_idx_ptr = 0 seq_idx = 0 while seq_idx < len( @@ -217,11 +217,11 @@ def _merge_outputs( self, batch_size: int, proposal_len: int, - maybe_sampler_output: Optional[List[SamplerOutput]], - proposal_lens: List[int], - nonzero_proposal_len_indices: List[int], + maybe_sampler_output: Optional[list[SamplerOutput]], + proposal_lens: list[int], + nonzero_proposal_len_indices: list[int], sampler_transposed: bool, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """After speculations are produced, merge the speculation results with the skipped sequences. """ diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 9c04680a6a7ab..1676be8ded73a 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import time +from collections.abc import Sequence from contextlib import contextmanager -from typing import Dict, List, Optional, Sequence, Tuple +from typing import Optional import torch @@ -16,14 +17,14 @@ def get_all_num_logprobs( - seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: + seq_group_metadata_list: list[SequenceGroupMetadata]) -> list[int]: """Given a list of SequenceGroupMetadata, create a list of all num_logprobs. If the sampling params do not call for any logprobs, return 0 for that sequence. """ - all_num_logprobs: List[int] = [] + all_num_logprobs: list[int] = [] for seq_group_metadata in seq_group_metadata_list: num_logprobs = seq_group_metadata.sampling_params.logprobs if num_logprobs is None: @@ -37,7 +38,7 @@ def get_sampled_token_logprobs( # shape [num_steps, batch_size, vocab_size] logprob_tensor: torch.Tensor, sampled_token_ids: torch.Tensor, # shape [num_steps, batch_size] -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: """Get the logprobs for the sampled tokens. Returns the ranks and logprobs. 
""" num_steps, batch_size, vocab_size = logprob_tensor.shape @@ -59,21 +60,21 @@ def create_logprobs_output( token_id: int, token_id_logprob_rank: int, token_id_logprob: float, - topk_token_ids: List[Optional[int]], - topk_logprobs: List[Optional[float]], -) -> Dict[int, Logprob]: - """Create a Logprob Dict for a token given the sampling results. + topk_token_ids: list[Optional[int]], + topk_logprobs: list[Optional[float]], +) -> dict[int, Logprob]: + """Create a Logprob dict for a token given the sampling results. Args: token_id (int): The sampled token for the sequence. token_id_logprob_rank (int): The logprob rank of the sampled token. token_id_logprob (float): The logprob value of the sampled token. - topk_token_ids (List[Optional[int]]): The list of top-k token ids. - topk_logprobs (List[Optional[float]]): The list of top-k logprobs. + topk_token_ids (list[Optional[int]]): The list of top-k token ids. + topk_logprobs (list[Optional[float]]): The list of top-k logprobs. """ # vLLM logprobs always include the sampled token. In addition, the user may # request topk-logprobs (where top-k varies per user up to max_logprobs). - logprobs: Dict[int, Logprob] = { + logprobs: dict[int, Logprob] = { token_id: Logprob( logprob=token_id_logprob, rank=token_id_logprob_rank, @@ -97,8 +98,8 @@ def create_sequence_group_output( token_id_logprob_rank: int, token_id_logprob: float, seq_id: SeqId, - topk_token_ids: List[Optional[int]], - topk_logprobs: List[Optional[float]], + topk_token_ids: list[Optional[int]], + topk_logprobs: list[Optional[float]], prompt_logprobs: Optional[PromptLogprobs] = None, ) -> CompletionSequenceGroupOutput: """Create a SequenceGroupOutput given the sampling results. @@ -108,8 +109,8 @@ def create_sequence_group_output( token_id_logprob_rank (int): The logprob rank of the sampled token. token_id_logprob (float): The logprob value of the sampled token. seq_id (int): The sequence id. - topk_token_ids (List[Optional[int]]): The list of top-k token ids. - topk_logprobs (List[Optional[float]]): The list of top-k logprobs. + topk_token_ids (list[Optional[int]]): The list of top-k token ids. + topk_logprobs (list[Optional[float]]): The list of top-k logprobs. """ logprobs = create_logprobs_output( @@ -131,17 +132,17 @@ def create_sequence_group_output( def split_batch_by_proposal_len( - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_lens: List[int], -) -> Tuple[Tuple[List[SequenceGroupMetadata], List[int]], Tuple[ - List[SequenceGroupMetadata], List[int]]]: + seq_group_metadata_list: list[SequenceGroupMetadata], + proposal_lens: list[int], +) -> tuple[tuple[list[SequenceGroupMetadata], list[int]], tuple[ + list[SequenceGroupMetadata], list[int]]]: """Utility function that splits a batch based on whether the proposal len is zero or not. We should remove this once vLLM supports per-sequence proposal lens in a batch. 
""" - nonzero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], []) - zero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], []) + nonzero_lists: tuple[list[SequenceGroupMetadata], list[int]] = ([], []) + zero_lists: tuple[list[SequenceGroupMetadata], list[int]] = ([], []) for i, (seq_group, proposal_len) in enumerate( zip(seq_group_metadata_list, proposal_lens)): seq_groups, indices = nonzero_lists if proposal_len else zero_lists @@ -152,7 +153,7 @@ def split_batch_by_proposal_len( def sampler_output_to_torch( sampler_output_list: Sequence[SamplerOutput], sampler_transposed: bool -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """Utility function which converts a list of SamplerOutput to tensors. sampler_transposed here is used as the indicator for whether diff --git a/vllm/tracing.py b/vllm/tracing.py index bf069ad84fd42..557ae40b87aee 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Mapping, Optional +from collections.abc import Mapping +from typing import Optional from vllm.logger import init_logger from vllm.utils import run_once diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 1937b13884711..a9f3625a90a9e 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,7 +6,7 @@ import time from functools import cache from pathlib import Path -from typing import Any, Callable, Dict, Literal, Optional, Type, Union +from typing import Any, Callable, Literal, Optional, Union import huggingface_hub from huggingface_hub import hf_hub_download @@ -53,11 +53,11 @@ logger = init_logger(__name__) -_CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, Type[PretrainedConfig]] = { +_CONFIG_REGISTRY_OVERRIDE_HF: dict[str, type[PretrainedConfig]] = { "mllama": MllamaConfig } -_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { +_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "chatglm": ChatGLMConfig, "cohere2": Cohere2Config, "dbrx": DbrxConfig, @@ -193,7 +193,7 @@ def patch_rope_scaling(config: PretrainedConfig) -> None: patch_rope_scaling_dict(rope_scaling) -def patch_rope_scaling_dict(rope_scaling: Dict[str, Any]) -> None: +def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: if "rope_type" in rope_scaling and "type" in rope_scaling: rope_type = rope_scaling["rope_type"] rope_type_legacy = rope_scaling["type"] @@ -701,7 +701,7 @@ def get_hf_image_processor_config( model: Union[str, Path], revision: Optional[str] = None, **kwargs, -) -> Dict[str, Any]: +) -> dict[str, Any]: # ModelScope does not provide an interface for image_processor if VLLM_USE_MODELSCOPE: return dict() diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 5ab70c0e41362..2261f0a9e9aac 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -8,7 +8,7 @@ """ Arctic model configuration""" from dataclasses import asdict, dataclass -from typing import Any, Dict +from typing import Any from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -192,14 +192,14 @@ def __init__( ) @classmethod - def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "ArcticConfig": + def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "ArcticConfig": result = super().from_dict(config_dict, **kwargs) 
config = result[0] if isinstance(result, tuple) else result if isinstance(config.quantization, dict): config.quantization = ArcticQuantizationConfig(**config.quantization) return result - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> dict[str, Any]: ret = super().to_dict() if isinstance(ret["quantization"], ArcticQuantizationConfig): ret["quantization"] = asdict(ret["quantization"]) diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py index e30409b3af5f0..21328d7675b82 100644 --- a/vllm/transformers_utils/configs/cohere2.py +++ b/vllm/transformers_utils/configs/cohere2.py @@ -61,7 +61,7 @@ class Cohere2Config(PretrainedConfig): Whether to tie weight embeddings rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): + rope_scaling (`dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. @@ -86,11 +86,11 @@ class Cohere2Config(PretrainedConfig): `beta_slow` (`float`, *optional*): Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): + `short_factor` (`list[float]`, *optional*): Only used with 'longrope'. The scaling factor to be applied to short contexts (< `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): + `long_factor` (`list[float]`, *optional*): Only used with 'longrope'. The scaling factor to be applied to long contexts (< `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py index 24d4052d87211..a54486fa41cd1 100644 --- a/vllm/transformers_utils/configs/deepseek_vl2.py +++ b/vllm/transformers_utils/configs/deepseek_vl2.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268 -from typing import Tuple from transformers.configuration_utils import PretrainedConfig @@ -191,12 +190,12 @@ class DeepseekVLV2Config(PretrainedConfig): tile_tag: str = "2D" global_view_pos: str = "head" - candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384), ) + candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), ) def __init__(self, tile_tag: str = "tile_tag", global_view_pos: str = "head", - candidate_resolutions: Tuple[Tuple[int, + candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), ), **kwargs): super().__init__(**kwargs) diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index 39364367e3031..76f6fffd7ee40 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -17,14 +17,12 @@ # limitations under the License. 
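For the `from_dict`/`to_dict` pair above, only the annotation spelling changes; the round-trip behaviour is untouched. An illustrative sketch under assumed, simplified field names (not Arctic's actual config):

```python
from dataclasses import asdict, dataclass
from typing import Any, Optional


@dataclass
class QuantSettings:
    # Hypothetical analogue of a quantization sub-config.
    q_bits: int = 8


class ConfigSketch:

    def __init__(self, quantization: Optional[QuantSettings] = None) -> None:
        self.quantization = quantization

    @classmethod
    def from_dict(cls, config_dict: dict[str, Any]) -> "ConfigSketch":
        quant = config_dict.get("quantization")
        if isinstance(quant, dict):
            quant = QuantSettings(**quant)
        return cls(quantization=quant)

    def to_dict(self) -> dict[str, Any]:
        quant = self.quantization
        return {"quantization": asdict(quant) if quant else None}


cfg = ConfigSketch.from_dict({"quantization": {"q_bits": 4}})
assert cfg.to_dict() == {"quantization": {"q_bits": 4}}
```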
"""Exaone model configuration""" -from typing import Dict - from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging logger = logging.get_logger(__name__) -EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {} +EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: dict[str, str] = {} class ExaoneConfig(PretrainedConfig): diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index be0f3b7e5e529..b947c6a9e2b4b 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -98,7 +98,7 @@ class JAISConfig(PretrainedConfig): Scale attention weights by dividing by hidden_size instead of sqrt(hidden_size). Need to set scale_attn_weights to `True` as well. - alibi_scaling (`Dict`, *optional*): + alibi_scaling (`dict`, *optional*): Dictionary containing the scaling configuration for ALiBi embeddings. Currently only supports linear scaling strategy. Can specify either the scaling `factor` (must be @@ -108,7 +108,7 @@ class JAISConfig(PretrainedConfig): formats are `{"type": strategy name, "factor": scaling factor}` or `{"type": strategy name, "train_seq_len": training sequence length}`. - architectures (`List`, *optional*, defaults to ['JAISLMHeadModel']): + architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']): architecture names for Jais. Example: diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py index c761f659e5b2c..70f60752905cb 100644 --- a/vllm/transformers_utils/configs/mlp_speculator.py +++ b/vllm/transformers_utils/configs/mlp_speculator.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from typing import Optional from transformers import PretrainedConfig @@ -17,7 +17,7 @@ def __init__(self, emb_dim: int = 4096, inner_dim: int = 0, n_predict: int = 3, - top_k_tokens_per_head: Optional[List[int]] = None, + top_k_tokens_per_head: Optional[list[int]] = None, n_candidates: int = 5, tie_weights: bool = False, scale_input: bool = False, @@ -34,7 +34,7 @@ def __init__(self, the inner dimension of the model. If 0, will be the emb_dim. n_predict: int the number of lookaheads for the speculator - top_k_tokens_per_head: List[int] + top_k_tokens_per_head: list[int] Number of tokens to consider from each head when forming the candidate tree. 
For each candidate branch in the tree, head n produces topk[n] diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 96356135f6b28..2d52658d3973c 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -4,11 +4,11 @@ # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py """A HuggingFace-style model configuration.""" import warnings -from typing import Any, Dict, Optional, Union +from typing import Any, Optional, Union from transformers import PretrainedConfig -attn_config_defaults: Dict = { +attn_config_defaults: dict = { 'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', @@ -20,8 +20,8 @@ 'alibi': False, 'alibi_bias_max': 8 } -ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'} -init_config_defaults: Dict = { +ffn_config_defaults: dict = {'ffn_type': 'mptmlp'} +init_config_defaults: dict = { 'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', @@ -52,15 +52,15 @@ def __init__(self, resid_pdrop: float = 0.0, emb_pdrop: float = 0.0, learned_pos_emb: bool = True, - attn_config: Dict = attn_config_defaults, - ffn_config: Dict = ffn_config_defaults, + attn_config: dict = attn_config_defaults, + ffn_config: dict = ffn_config_defaults, init_device: str = 'cpu', logit_scale: Optional[Union[float, str]] = None, no_bias: bool = False, embedding_fraction: float = 1.0, norm_type: str = 'low_precision_layernorm', use_cache: bool = False, - init_config: Dict = init_config_defaults, + init_config: dict = init_config_defaults, fc_type: str = 'torch', verbose: Optional[int] = None, **kwargs: Any): @@ -102,8 +102,8 @@ def __init__(self, self._validate_config() def _set_config_defaults( - self, config: Dict[str, Any], - config_defaults: Dict[str, Any]) -> Dict[str, Any]: + self, config: dict[str, Any], + config_defaults: dict[str, Any]) -> dict[str, Any]: for (k, v) in config_defaults.items(): if k not in config: config[k] = v diff --git a/vllm/transformers_utils/configs/olmo2.py b/vllm/transformers_utils/configs/olmo2.py index c6e446333b43d..9935f5d9573e7 100644 --- a/vllm/transformers_utils/configs/olmo2.py +++ b/vllm/transformers_utils/configs/olmo2.py @@ -62,7 +62,7 @@ class Olmo2Config(PretrainedConfig): Whether to tie weight embeddings rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): + rope_scaling (`dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py index 0d5db896b93d3..6eaf699d17bee 100644 --- a/vllm/transformers_utils/configs/solar.py +++ b/vllm/transformers_utils/configs/solar.py @@ -108,7 +108,7 @@ class SolarConfig(PretrainedConfig): Whether to tie weight embeddings rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): + rope_scaling (`dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. 
Currently supports two scaling diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index 6b2765db94e78..4c50724272634 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py -from typing import Any, Dict, Optional +from typing import Any, Optional import transformers @@ -48,8 +48,8 @@ class UltravoxConfig(transformers.PretrainedConfig): def __init__( self, - audio_config: Optional[Dict[str, Any]] = None, - text_config: Optional[Dict[str, Any]] = None, + audio_config: Optional[dict[str, Any]] = None, + text_config: Optional[dict[str, Any]] = None, audio_model_id: Optional[str] = None, text_model_id: Optional[str] = None, ignore_index: int = -100, @@ -58,8 +58,8 @@ def __init__( stack_factor: int = 8, norm_init: float = 0.4, projector_act: str = "swiglu", - text_model_lora_config: Optional[Dict[str, Any]] = None, - audio_model_lora_config: Optional[Dict[str, Any]] = None, + text_model_lora_config: Optional[dict[str, Any]] = None, + audio_model_lora_config: Optional[dict[str, Any]] = None, projector_ln_mid: bool = False, **kwargs, ): diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index 9d1d4bb92e4ab..a35bf76dc7277 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional +from typing import Optional from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams, Sequence, SequenceGroup) @@ -22,7 +22,7 @@ def get_tokenizer_for_seq(self, sequence: Sequence) -> AnyTokenizer: return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request) def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup, - prompt_logprobs: List[Optional[Dict[ + prompt_logprobs: list[Optional[dict[ int, Logprob]]], position_offset: int) -> None: """Decodes the logprobs for the prompt of a sequence group. 
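Nested parametrization such as `list[Optional[dict[int, Logprob]]]` reads the same with builtin generics. A tiny, self-contained example (plain `float` stands in for `Logprob`):

```python
from typing import Optional


def count_positions_with_logprobs(
        prompt_logprobs: list[Optional[dict[int, float]]]) -> int:
    # A list whose entries are either None or a token-id -> logprob dict.
    return sum(1 for entry in prompt_logprobs if entry)


assert count_positions_with_logprobs([None, {42: -0.25}, {}]) == 1
```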
@@ -49,7 +49,7 @@ def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup, read_offset = 0 next_iter_prefix_offset = 0 next_iter_read_offset = 0 - next_iter_tokens: List[str] = [] + next_iter_tokens: list[str] = [] prev_tokens = None for token_position_in_logprob, prompt_logprobs_for_token in enumerate( diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index a1fa27773fe5c..7373fa0ede237 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple +from typing import Optional from .tokenizer import AnyTokenizer -def _replace_none_with_empty(tokens: List[Optional[str]]): +def _replace_none_with_empty(tokens: list[Optional[str]]): for i, token in enumerate(tokens): if token is None: tokens[i] = "" @@ -13,7 +13,7 @@ def _replace_none_with_empty(tokens: List[Optional[str]]): def _convert_tokens_to_string_with_added_encoders( tokenizer: AnyTokenizer, - output_tokens: List[str], + output_tokens: list[str], skip_special_tokens: bool, spaces_between_special_tokens: bool, ) -> str: @@ -22,8 +22,8 @@ def _convert_tokens_to_string_with_added_encoders( # NOTE(woosuk): The following code is slow because it runs a for loop over # the output_tokens. In Python, running a for loop over a list can be slow # even when the loop body is very simple. - sub_texts: List[str] = [] - current_sub_text: List[str] = [] + sub_texts: list[str] = [] + current_sub_text: list[str] = [] all_special_tokens = set(tokenizer.all_special_tokens) for token in output_tokens: if skip_special_tokens and token in all_special_tokens: @@ -52,9 +52,9 @@ def _convert_tokens_to_string_with_added_encoders( def convert_prompt_ids_to_tokens( tokenizer: AnyTokenizer, - prompt_ids: List[int], + prompt_ids: list[int], skip_special_tokens: bool = False, -) -> Tuple[List[str], int, int]: +) -> tuple[list[str], int, int]: """Converts the prompt ids to tokens and returns the tokens and offsets for incremental detokenization. @@ -76,8 +76,8 @@ def convert_prompt_ids_to_tokens( def convert_ids_list_to_tokens( tokenizer: AnyTokenizer, - token_ids: List[int], -) -> List[str]: + token_ids: list[int], +) -> list[str]: """Detokenize the input ids individually. Args: @@ -98,13 +98,13 @@ def convert_ids_list_to_tokens( # under Apache 2.0 license def detokenize_incrementally( tokenizer: AnyTokenizer, - all_input_ids: List[int], - prev_tokens: Optional[List[str]], + all_input_ids: list[int], + prev_tokens: Optional[list[str]], prefix_offset: int, read_offset: int, skip_special_tokens: bool = False, spaces_between_special_tokens: bool = True, -) -> Tuple[List[str], str, int, int]: +) -> tuple[list[str], str, int, int]: """Detokenizes the input ids incrementally and returns the new tokens and the new text. diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py index d37381ea9925f..a0f216e55e136 100644 --- a/vllm/transformers_utils/processors/deepseek_vl2.py +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -24,7 +24,6 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
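The detokenizer helpers above now return `tuple[list[str], int, int]`. A simplified sketch of that return shape; the offset arithmetic here is illustrative only, not vLLM's exact logic:

```python
def convert_prompt_ids_to_tokens_sketch(
    id_to_token: dict[int, str],
    prompt_ids: list[int],
) -> tuple[list[str], int, int]:
    # Return the tail of the prompt as tokens plus the prefix/read offsets
    # used for incremental detokenization, annotated with builtin generics.
    tokens = [id_to_token.get(i, "") for i in prompt_ids[-6:]]
    read_offset = len(tokens)
    prefix_offset = max(read_offset - 4, 0)
    return tokens, prefix_offset, read_offset


tokens, prefix, read = convert_prompt_ids_to_tokens_sketch({1: "a", 2: "b"},
                                                           [1, 2])
assert (prefix, read) == (0, 2)
```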
import math -from typing import List, Tuple import torch import torchvision.transforms as T @@ -36,8 +35,8 @@ class ImageTransform: def __init__(self, - mean: Tuple[float, float, float] = (0.5, 0.5, 0.5), - std: Tuple[float, float, float] = (0.5, 0.5, 0.5), + mean: tuple[float, float, float] = (0.5, 0.5, 0.5), + std: tuple[float, float, float] = (0.5, 0.5, 0.5), normalize: bool = True): self.mean = mean self.std = std @@ -62,11 +61,11 @@ class DeepseekVLV2Processor(ProcessorMixin): def __init__( self, tokenizer: LlamaTokenizerFast, - candidate_resolutions: Tuple[Tuple[int, int]], + candidate_resolutions: tuple[tuple[int, int]], patch_size: int, downsample_ratio: int, - image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5), - image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5), + image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5), + image_std: tuple[float, float, float] = (0.5, 0.5, 0.5), normalize: bool = True, image_token: str = "", pad_token: str = "<|▁pad▁|>", @@ -170,13 +169,13 @@ def encode(self, text: str, bos: bool = True, eos: bool = False): return t - def decode(self, t: List[int], **kwargs) -> str: + def decode(self, t: list[int], **kwargs) -> str: return self.tokenizer.decode(t, **kwargs) def process_one( self, prompt: str, - images: List[Image.Image], + images: list[Image.Image], inference_mode: bool = True, **kwargs, ): @@ -184,8 +183,8 @@ def process_one( Args: prompt (str): the formatted prompt; - conversations (List[Dict]): conversations with a list of messages; - images (List[ImageType]): the list of images; + conversations (list[dict]): conversations with a list of messages; + images (list[ImageType]): the list of images; inference_mode (bool): if True, then remove the last eos token; system_prompt (str): the system prompt; **kwargs: @@ -196,7 +195,7 @@ def process_one( - target_ids (torch.LongTensor): [N + image tokens] - pixel_values (torch.FloatTensor): [n_patches, 3, H, W] - image_id (int): the id of the image token - - num_image_tokens (List[int]): the number of image tokens + - num_image_tokens (list[int]): the number of image tokens """ assert (prompt is not None and images is not None @@ -257,7 +256,7 @@ def __call__( self, *, prompt: str, - images: List[Image.Image], + images: list[Image.Image], inference_mode: bool = True, **kwargs, ): @@ -265,7 +264,7 @@ def __call__( Args: prompt (str): the formatted prompt; - images (List[ImageType]): the list of images; + images (list[ImageType]): the list of images; inference_mode (bool): if True, then remove the last eos token; **kwargs: @@ -274,7 +273,7 @@ def __call__( - input_ids (torch.LongTensor): [N + image tokens] - images (torch.FloatTensor): [n_images, 3, H, W] - image_id (int): the id of the image token - - num_image_tokens (List[int]): the number of image tokens + - num_image_tokens (list[int]): the number of image tokens """ prepare = self.process_one( @@ -288,7 +287,7 @@ def __call__( def tokenize_with_images( self, conversation: str, - images: List[Image.Image], + images: list[Image.Image], bos: bool = True, eos: bool = True, cropping: bool = True, diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py index bb5ddaf88b219..b4eb081c9b99d 100644 --- a/vllm/transformers_utils/tokenizer_base.py +++ b/vllm/transformers_utils/tokenizer_base.py @@ -2,7 +2,7 @@ import importlib from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Optional, Union if TYPE_CHECKING: 
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam @@ -12,17 +12,17 @@ class TokenizerBase(ABC): @property @abstractmethod - def all_special_tokens_extended(self) -> List[str]: + def all_special_tokens_extended(self) -> list[str]: raise NotImplementedError() @property @abstractmethod - def all_special_tokens(self) -> List[str]: + def all_special_tokens(self) -> list[str]: raise NotImplementedError() @property @abstractmethod - def all_special_ids(self) -> List[int]: + def all_special_ids(self) -> list[int]: raise NotImplementedError() @property @@ -66,7 +66,7 @@ def __len__(self) -> int: @abstractmethod def __call__( self, - text: Union[str, List[str], List[int]], + text: Union[str, list[str], list[int]], text_pair: Optional[str] = None, add_special_tokens: bool = False, truncation: bool = False, @@ -75,11 +75,11 @@ def __call__( raise NotImplementedError() @abstractmethod - def get_vocab(self) -> Dict[str, int]: + def get_vocab(self) -> dict[str, int]: raise NotImplementedError() @abstractmethod - def get_added_vocab(self) -> Dict[str, int]: + def get_added_vocab(self) -> dict[str, int]: raise NotImplementedError() @abstractmethod @@ -88,44 +88,44 @@ def encode_one( text: str, truncation: bool = False, max_length: Optional[int] = None, - ) -> List[int]: + ) -> list[int]: raise NotImplementedError() @abstractmethod def encode(self, text: str, - add_special_tokens: Optional[bool] = None) -> List[int]: + add_special_tokens: Optional[bool] = None) -> list[int]: raise NotImplementedError() @abstractmethod def apply_chat_template(self, - messages: List["ChatCompletionMessageParam"], - tools: Optional[List[Dict[str, Any]]] = None, - **kwargs) -> List[int]: + messages: list["ChatCompletionMessageParam"], + tools: Optional[list[dict[str, Any]]] = None, + **kwargs) -> list[int]: raise NotImplementedError() @abstractmethod - def convert_tokens_to_string(self, tokens: List[str]) -> str: + def convert_tokens_to_string(self, tokens: list[str]) -> str: raise NotImplementedError() @abstractmethod def decode(self, - ids: Union[List[int], int], + ids: Union[list[int], int], skip_special_tokens: bool = True) -> str: raise NotImplementedError() @abstractmethod def convert_ids_to_tokens( self, - ids: List[int], + ids: list[int], skip_special_tokens: bool = True, - ) -> List[str]: + ) -> list[str]: raise NotImplementedError() class TokenizerRegistry: # Tokenizer name -> (tokenizer module, tokenizer class) - REGISTRY: Dict[str, Tuple[str, str]] = {} + REGISTRY: dict[str, tuple[str, str]] = {} @staticmethod def register(name: str, module: str, class_name: str) -> None: diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py index c223768b16d6b..2c976b3b267d5 100644 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Type +from typing import Optional from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, TokenizerPoolConfig) @@ -35,7 +35,7 @@ def init_tokenizer_from_configs(model_config: ModelConfig, def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig], **init_kwargs) -> BaseTokenizerGroup: - tokenizer_cls: Type[BaseTokenizerGroup] + tokenizer_cls: type[BaseTokenizerGroup] if tokenizer_pool_config is None: tokenizer_cls = TokenizerGroup elif isinstance(tokenizer_pool_config.pool_type, type) and issubclass( diff --git 
a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py index fbdfa3e57e172..6cd63984dbfdc 100644 --- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import List, Optional +from typing import Optional from vllm.config import TokenizerPoolConfig from vllm.lora.request import LoRARequest @@ -35,7 +35,7 @@ def encode(self, prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> List[int]: + add_special_tokens: Optional[bool] = None) -> list[int]: """Encode a prompt using the tokenizer group.""" pass @@ -45,7 +45,7 @@ async def encode_async( prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> List[int]: + add_special_tokens: Optional[bool] = None) -> list[int]: """Encode a prompt using the tokenizer group.""" pass diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py index 30cab752ccf3c..86044de936821 100644 --- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py @@ -2,7 +2,7 @@ import asyncio import os -from typing import List, Optional +from typing import Optional try: from ray.exceptions import ActorDiedError # type: ignore @@ -115,7 +115,7 @@ def encode(self, prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> List[int]: + add_special_tokens: Optional[bool] = None) -> list[int]: """Encode a prompt using the tokenizer group. We pick an idle actor and use it to encode the prompt. @@ -166,7 +166,7 @@ async def encode_async( prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> List[int]: + add_special_tokens: Optional[bool] = None) -> list[int]: """Encode a prompt using the tokenizer group. We pick an idle actor and use it to encode the prompt. 
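The tokenizer-group `encode`/`encode_async` signatures above switch to `-> list[int]`. A toy sketch showing the same annotation style on a sync/async pair (the tokenization itself is a placeholder, not vLLM's):

```python
import asyncio
from typing import Optional


class TokenizerGroupSketch:
    """Toy tokenizer group; only the annotation style mirrors the patch."""

    def encode(self,
               prompt: str,
               add_special_tokens: Optional[bool] = None) -> list[int]:
        ids = [ord(c) for c in prompt]
        return [1, *ids] if add_special_tokens else ids

    async def encode_async(self,
                           prompt: str,
                           add_special_tokens: Optional[bool] = None
                           ) -> list[int]:
        return self.encode(prompt, add_special_tokens)


assert asyncio.run(TokenizerGroupSketch().encode_async("hi")) == [104, 105]
```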
diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py index 025971cb7e477..2eaf821de3338 100644 --- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from typing import Optional from vllm.config import TokenizerPoolConfig from vllm.lora.request import LoRARequest @@ -43,7 +43,7 @@ def get_max_input_len(self, return self.max_input_length def _raise_if_input_too_long(self, - encoded_tokens: List[int], + encoded_tokens: list[int], lora_request: Optional[LoRARequest] = None): input_length = len(encoded_tokens) if lora_request: @@ -58,7 +58,7 @@ def encode(self, prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> List[int]: + add_special_tokens: Optional[bool] = None) -> list[int]: tokenizer = self.get_lora_tokenizer(lora_request) ret = encode_tokens(tokenizer, prompt, @@ -71,7 +71,7 @@ async def encode_async( prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> List[int]: + add_special_tokens: Optional[bool] = None) -> list[int]: tokenizer = await self.get_lora_tokenizer_async(lora_request) ret = encode_tokens(tokenizer, prompt, diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 801597bd36508..dc38388c3dbee 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -4,7 +4,7 @@ import re from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast +from typing import TYPE_CHECKING, Any, Optional, Union, cast import huggingface_hub from huggingface_hub import HfApi, hf_hub_download @@ -28,7 +28,7 @@ @dataclass class Encoding: - input_ids: Union[List[int], List[List[int]]] + input_ids: Union[list[int], list[list[int]]] def maybe_serialize_tool_calls(request: "ChatCompletionRequest"): @@ -98,7 +98,7 @@ def truncate_tool_call_ids(request: "ChatCompletionRequest"): request.messages[i]["tool_call_id"] = tool_call_id -def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]: +def list_local_repo_files(repo_id: str, revision: Optional[str]) -> list[str]: repo_cache = os.path.join( huggingface_hub.constants.HF_HUB_CACHE, huggingface_hub.constants.REPO_ID_SEPARATOR.join( @@ -118,7 +118,7 @@ def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]: return [] -def find_tokenizer_file(files: List[str]): +def find_tokenizer_file(files: list[str]): file_pattern = re.compile( r"^tokenizer\.model\.v.*$|^tekken\.json$|^tokenizer\.mm\.model\.v.*$") @@ -136,14 +136,14 @@ def find_tokenizer_file(files: List[str]): def make_mistral_chat_completion_request( - messages: List["ChatCompletionMessageParam"], - tools: Optional[List[Dict[str, + messages: list["ChatCompletionMessageParam"], + tools: Optional[list[dict[str, Any]]] = None) -> "ChatCompletionRequest": - last_message = cast(Dict[str, Any], messages[-1]) + last_message = cast(dict[str, Any], messages[-1]) if last_message["role"] == "assistant": last_message["prefix"] = True - last_message = cast(Dict[str, Any], messages[-1]) + last_message = cast(dict[str, Any], messages[-1]) if last_message["role"] == "assistant": 
last_message["prefix"] = True @@ -194,7 +194,7 @@ def __init__(self, tokenizer: "PublicMistralTokenizer") -> None: raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}") self._vocab = tokenizer_.vocab() - # Convert to a Dict[str, int] to match protocol, but this is a lossy + # Convert to a dict[str, int] to match protocol, but this is a lossy # conversion. There may be multiple token ids that decode to the same # string due to partial UTF-8 byte sequences being converted to � self._vocab_dict = { @@ -252,7 +252,7 @@ def _download_mistral_tokenizer_from_hf(tokenizer_name: str, # the following attributes are set to fit VLLM's design and are used # by the guided structured output backends. @property - def all_special_tokens_extended(self) -> List[str]: + def all_special_tokens_extended(self) -> list[str]: from mistral_common.tokens.tokenizers.base import SpecialTokens # tekken defines its own extended special tokens list @@ -266,11 +266,11 @@ def all_special_tokens_extended(self) -> List[str]: ] @property - def all_special_tokens(self) -> List[str]: + def all_special_tokens(self) -> list[str]: return self.all_special_tokens_extended @property - def all_special_ids(self) -> List[int]: + def all_special_ids(self) -> list[int]: return [ self.all_special_tokens.index(t) for t in self.all_special_tokens ] @@ -308,21 +308,21 @@ def __len__(self) -> int: def __call__( self, - text: Union[str, List[str], List[int]], + text: Union[str, list[str], list[int]], text_pair: Optional[str] = None, add_special_tokens: bool = False, truncation: bool = False, max_length: Optional[int] = None, ): - input_ids: Union[List[int], List[List[int]]] - # For List[str], original prompt text + input_ids: Union[list[int], list[list[int]]] + # For list[str], original prompt text if is_list_of(text, str): - input_ids_: List[List[int]] = [] + input_ids_: list[list[int]] = [] for p in text: each_input_ids = self.encode_one(p, truncation, max_length) input_ids_.append(each_input_ids) input_ids = input_ids_ - # For List[int], apply chat template output, already tokens. + # For list[int], apply chat template output, already tokens. elif is_list_of(text, int): input_ids = text # For str, single prompt text @@ -330,12 +330,12 @@ def __call__( input_ids = self.encode_one(text, truncation, max_length) return Encoding(input_ids=input_ids) - def get_vocab(self) -> Dict[str, int]: + def get_vocab(self) -> dict[str, int]: # NB: the dictionary form of the vocabulary collapses token ids that map # to the same string but have different bytes return self._vocab_dict - def get_added_vocab(self) -> Dict[str, int]: + def get_added_vocab(self) -> dict[str, int]: # Mistral tokenizers have no added vocabulary return {} @@ -344,7 +344,7 @@ def encode_one( text: str, truncation: bool = False, max_length: Optional[int] = None, - ) -> List[int]: + ) -> list[int]: # Mistral Tokenizers should not add special tokens input_ids = self.encode(text) @@ -354,7 +354,7 @@ def encode_one( def encode(self, text: str, - add_special_tokens: Optional[bool] = None) -> List[int]: + add_special_tokens: Optional[bool] = None) -> list[int]: # `encode` should only be used for prompt completion # it should never be used for chat_completion. 
# For chat completion use `apply_chat_template` @@ -366,9 +366,9 @@ def encode(self, return self.tokenizer.encode(text, bos=True, eos=False) def apply_chat_template(self, - messages: List["ChatCompletionMessageParam"], - tools: Optional[List[Dict[str, Any]]] = None, - **kwargs) -> List[int]: + messages: list["ChatCompletionMessageParam"], + tools: Optional[list[dict[str, Any]]] = None, + **kwargs) -> list[int]: request = make_mistral_chat_completion_request(messages, tools) encoded = self.mistral.encode_chat_completion(request) @@ -376,7 +376,7 @@ def apply_chat_template(self, # encode-decode to get clean prompt return encoded.tokens - def convert_tokens_to_string(self, tokens: List[str]) -> str: + def convert_tokens_to_string(self, tokens: list[str]) -> str: from mistral_common.tokens.tokenizers.base import SpecialTokens if self.is_tekken: tokens = [ @@ -409,7 +409,7 @@ def _token_to_id(t: str): # make sure certain special tokens like Tool calls are # not decoded special_tokens = {SpecialTokens.tool_calls} - regular_tokens: List[str] = [] + regular_tokens: list[str] = [] decoded_list = [] for token in tokens: @@ -434,7 +434,7 @@ def _token_to_id(t: str): # See: guided_decoding/outlines_logits_processors.py::_adapt_tokenizer # for more. def decode(self, - ids: Union[List[int], int], + ids: Union[list[int], int], skip_special_tokens: bool = True) -> str: assert ( skip_special_tokens @@ -446,9 +446,9 @@ def decode(self, def convert_ids_to_tokens( self, - ids: List[int], + ids: list[int], skip_special_tokens: bool = True, - ) -> List[str]: + ) -> list[str]: from mistral_common.tokens.tokenizers.base import SpecialTokens # TODO(Patrick) - potentially allow special tokens to not be skipped diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index 87e446f894384..a8a14e5ad073e 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -2,7 +2,7 @@ from os import PathLike from pathlib import Path -from typing import List, Optional, Union +from typing import Optional, Union def is_s3(model_or_path: str) -> bool: @@ -26,7 +26,7 @@ def modelscope_list_repo_files( repo_id: str, revision: Optional[str] = None, token: Union[str, bool, None] = None, -) -> List[str]: +) -> list[str]: """List files in a modelscope repo.""" from modelscope.hub.api import HubApi api = HubApi() diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index fbbb21c89370a..d8795e1e0557b 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -9,7 +9,7 @@ from enum import Enum from pathlib import Path from threading import Thread -from typing import Any, Dict, Optional, Union +from typing import Any, Optional, Union from uuid import uuid4 import cpuinfo @@ -27,7 +27,7 @@ _USAGE_STATS_ENABLED = None _USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER -_GLOBAL_RUNTIME_DATA: Dict[str, Union[str, int, bool]] = {} +_GLOBAL_RUNTIME_DATA: dict[str, Union[str, int, bool]] = {} _USAGE_ENV_VARS_TO_COLLECT = [ "VLLM_USE_MODELSCOPE", @@ -150,7 +150,7 @@ def __init__(self) -> None: def report_usage(self, model_architecture: str, usage_context: UsageContext, - extra_kvs: Optional[Dict[str, Any]] = None) -> None: + extra_kvs: Optional[dict[str, Any]] = None) -> None: t = Thread(target=self._report_usage_worker, args=(model_architecture, usage_context, extra_kvs or {}), daemon=True) @@ -158,13 +158,13 @@ def report_usage(self, def _report_usage_worker(self, model_architecture: str, usage_context: UsageContext, - extra_kvs: Dict[str, Any]) -> None: + extra_kvs: 
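In the `vllm/utils.py` hunk, the ABCs (`AsyncGenerator`, `Awaitable`, `Generator`, `Iterator`, ...) move to `collections.abc` while non-ABC names stay in `typing`. A small sketch of an async generator annotated in the same style as `merge_async_iterators`:

```python
import asyncio
from collections.abc import AsyncGenerator, Iterable
from typing import TypeVar  # names without collections.abc equivalents stay here

T = TypeVar("T")


async def enumerate_async(
        it: Iterable[T]) -> AsyncGenerator[tuple[int, T], None]:
    # The yielded pair is annotated with builtin tuple.
    for i, item in enumerate(it):
        yield i, item


async def main() -> list[tuple[int, str]]:
    return [pair async for pair in enumerate_async(["a", "b"])]


assert asyncio.run(main()) == [(0, "a"), (1, "b")]
```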
dict[str, Any]) -> None: self._report_usage_once(model_architecture, usage_context, extra_kvs) self._report_continous_usage() def _report_usage_once(self, model_architecture: str, usage_context: UsageContext, - extra_kvs: Dict[str, Any]) -> None: + extra_kvs: dict[str, Any]) -> None: # Platform information from vllm.platforms import current_platform if current_platform.is_cuda_alike(): @@ -227,7 +227,7 @@ def _report_continous_usage(self): self._write_to_file(data) self._send_to_server(data) - def _send_to_server(self, data: Dict[str, Any]) -> None: + def _send_to_server(self, data: dict[str, Any]) -> None: try: global_http_client = global_http_connection.get_sync_client() global_http_client.post(_USAGE_STATS_SERVER, json=data) @@ -235,7 +235,7 @@ def _send_to_server(self, data: Dict[str, Any]) -> None: # silently ignore unless we are using debug log logging.debug("Failed to send usage data to server") - def _write_to_file(self, data: Dict[str, Any]) -> None: + def _write_to_file(self, data: dict[str, Any]) -> None: os.makedirs(os.path.dirname(_USAGE_STATS_JSON_PATH), exist_ok=True) Path(_USAGE_STATS_JSON_PATH).touch(exist_ok=True) with open(_USAGE_STATS_JSON_PATH, "a") as f: diff --git a/vllm/utils.py b/vllm/utils.py index 29e60a9c9be2d..26c9e1a908371 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -28,12 +28,12 @@ import weakref from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task from collections import OrderedDict, UserDict, defaultdict -from collections.abc import Hashable, Iterable, Mapping +from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable, + Iterable, Iterator, Mapping) from dataclasses import dataclass, field from functools import cache, lru_cache, partial, wraps -from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, - Dict, Generator, Generic, Iterator, List, Literal, - NamedTuple, Optional, Tuple, Type, TypeVar, Union) +from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, + Optional, TypeVar, Union) from uuid import uuid4 import cloudpickle @@ -400,7 +400,7 @@ def _next_task(iterator: AsyncGenerator[T, None], async def merge_async_iterators( *iterators: AsyncGenerator[T, - None], ) -> AsyncGenerator[Tuple[int, T], None]: + None], ) -> AsyncGenerator[tuple[int, T], None]: """Merge multiple asynchronous iterators into a single iterator. This method handle the case where some iterators finish before others. 
@@ -433,7 +433,7 @@ async def merge_async_iterators( async def collect_from_async_generator( - iterator: AsyncGenerator[T, None]) -> List[T]: + iterator: AsyncGenerator[T, None]) -> list[T]: """Collect all items from an async generator into a list.""" items = [] async for item in iterator: @@ -560,7 +560,7 @@ def find_process_using_port(port: int) -> Optional[psutil.Process]: return None -def update_environment_variables(envs: Dict[str, str]): +def update_environment_variables(envs: dict[str, str]): for k, v in envs.items(): if k in os.environ and os.environ[k] != v: logger.warning( @@ -569,7 +569,7 @@ def update_environment_variables(envs: Dict[str, str]): os.environ[k] = v -def chunk_list(lst: List[T], chunk_size: int): +def chunk_list(lst: list[T], chunk_size: int): """Yield successive chunk_size chunks from lst.""" for i in range(0, len(lst), chunk_size): yield lst[i:i + chunk_size] @@ -642,7 +642,7 @@ def create_kv_caches_with_random_flash( model_dtype: Optional[Union[str, torch.dtype]] = None, seed: int = 0, device: Optional[str] = "cuda", -) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: +) -> tuple[list[torch.Tensor], list[torch.Tensor]]: from vllm.platforms import current_platform current_platform.seed_everything(seed) @@ -650,8 +650,8 @@ def create_kv_caches_with_random_flash( key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size) scale = head_size**-0.5 - key_caches: List[torch.Tensor] = [] - value_caches: List[torch.Tensor] = [] + key_caches: list[torch.Tensor] = [] + value_caches: list[torch.Tensor] = [] for _ in range(num_layers): key_value_cache = torch.empty(size=key_value_cache_shape, @@ -679,7 +679,7 @@ def create_kv_caches_with_random( model_dtype: Optional[Union[str, torch.dtype]] = None, seed: int = 0, device: Optional[str] = "cuda", -) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: +) -> tuple[list[torch.Tensor], list[torch.Tensor]]: if cache_dtype == "fp8" and head_size % 16: raise ValueError( @@ -693,7 +693,7 @@ def create_kv_caches_with_random( scale = head_size**-0.5 x = 16 // torch.tensor([], dtype=torch_dtype).element_size() key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) - key_caches: List[torch.Tensor] = [] + key_caches: list[torch.Tensor] = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, @@ -708,7 +708,7 @@ def create_kv_caches_with_random( key_caches.append(key_cache) value_cache_shape = (num_blocks, num_heads, head_size, block_size) - value_caches: List[torch.Tensor] = [] + value_caches: list[torch.Tensor] = [] for _ in range(num_layers): value_cache = torch.empty(size=value_cache_shape, dtype=torch_dtype, @@ -754,7 +754,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def make_ndarray_with_pad( - x: List[List[T]], + x: list[list[T]], pad: T, dtype: npt.DTypeLike, *, @@ -779,7 +779,7 @@ def make_ndarray_with_pad( def make_tensor_with_pad( - x: List[List[T]], + x: list[list[T]], pad: T, dtype: torch.dtype, *, @@ -831,7 +831,7 @@ def is_list_of( typ: Union[type[T], tuple[type[T], ...]], *, check: Literal["first", "all"] = "first", -) -> TypeIs[List[T]]: +) -> TypeIs[list[T]]: if not isinstance(value, list): return False @@ -843,8 +843,8 @@ def is_list_of( assert_never(check) -JSONTree = Union[Dict[str, "JSONTree[T]"], List["JSONTree[T]"], - Tuple["JSONTree[T]", ...], T] +JSONTree = Union[dict[str, "JSONTree[T]"], list["JSONTree[T]"], + tuple["JSONTree[T]", ...], T] """A nested JSON structure where the leaves need not be JSON-serializable.""" @@ -859,7 +859,7 
@@ def json_map_leaves(func: Callable[[T], U], value: JSONTree[T]) -> JSONTree[U]: return func(value) -def flatten_2d_lists(lists: List[List[T]]) -> List[T]: +def flatten_2d_lists(lists: list[list[T]]) -> list[T]: """Flatten a list of lists to a single list.""" return [item for sublist in lists for item in sublist] @@ -1226,7 +1226,7 @@ def check_port(self, value): return value - def _pull_args_from_config(self, args: List[str]) -> List[str]: + def _pull_args_from_config(self, args: list[str]) -> list[str]: """Method to pull arguments specified in the config file into the command-line args variable. @@ -1291,7 +1291,7 @@ def _pull_args_from_config(self, args: List[str]) -> List[str]: return args - def _load_config_file(self, file_path: str) -> List[str]: + def _load_config_file(self, file_path: str) -> list[str]: """Loads a yaml file and returns the key value pairs as a flattened list with argparse like pattern ```yaml @@ -1313,9 +1313,9 @@ def _load_config_file(self, file_path: str) -> List[str]: %s supplied", extension) # only expecting a flat dictionary of atomic types - processed_args: List[str] = [] + processed_args: list[str] = [] - config: Dict[str, Union[int, str]] = {} + config: dict[str, Union[int, str]] = {} try: with open(file_path) as config_file: config = yaml.safe_load(config_file) @@ -1399,7 +1399,7 @@ def resolve_mm_processor_kwargs( *, requires_kw_only: bool = True, allow_var_kwargs: bool = False, -) -> Dict[str, Any]: +) -> dict[str, Any]: """Applies filtering to eliminate invalid mm_processor_kwargs, i.e., those who are not explicit keywords to the given callable (of one is given; otherwise no filtering is done), then merges the kwarg dicts, @@ -1440,7 +1440,7 @@ def get_allowed_kwarg_only_overrides( *, requires_kw_only: bool = True, allow_var_kwargs: bool = False, -) -> Dict[str, Any]: +) -> dict[str, Any]: """ Given a callable which has one or more keyword only params and a dict mapping param names to values, drop values that can be not be kwarg @@ -1531,9 +1531,9 @@ def value(self): # Adapted from: https://stackoverflow.com/a/47212782/5082708 class LazyDict(Mapping[str, T], Generic[T]): - def __init__(self, factory: Dict[str, Callable[[], T]]): + def __init__(self, factory: dict[str, Callable[[], T]]): self._factory = factory - self._dict: Dict[str, T] = {} + self._dict: dict[str, T] = {} def __getitem__(self, key: str) -> T: if key not in self._dict: @@ -1552,9 +1552,9 @@ def __len__(self): return len(self._factory) -class ClassRegistry(UserDict[Type[T], _V]): +class ClassRegistry(UserDict[type[T], _V]): - def __getitem__(self, key: Type[T]) -> _V: + def __getitem__(self, key: type[T]) -> _V: for cls in key.mro(): if cls in self.data: return self.data[cls] @@ -1584,8 +1584,8 @@ def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor: def weak_ref_tensors( - tensors: Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor]] -) -> Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor]]: + tensors: Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor]] +) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor]]: """ Convenience function to create weak references to tensors, for single tensor, list of tensors or tuple of tensors. 
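The recursive `JSONTree` alias above keeps its forward references but parametrizes with builtin `dict`/`list`/`tuple`. A sketch that defines a similar alias and walks it (the traversal helper is hypothetical, not vLLM's):

```python
from typing import TypeVar, Union

T = TypeVar("T")

# Recursive aliases use forward references plus builtin dict/list/tuple.
JSONTree = Union[dict[str, "JSONTree[T]"], list["JSONTree[T]"],
                 tuple["JSONTree[T]", ...], T]


def flatten_leaves(value: "JSONTree[int]") -> list[int]:
    # Walk the tree and collect leaf values; containers use builtin types.
    if isinstance(value, dict):
        return [leaf for v in value.values() for leaf in flatten_leaves(v)]
    if isinstance(value, (list, tuple)):
        return [leaf for v in value for leaf in flatten_leaves(v)]
    return [value]


assert flatten_leaves({"a": [1, (2, 3)], "b": 4}) == [1, 2, 3, 4]
```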
@@ -1857,7 +1857,7 @@ def __getattr__(self, key: str): def direct_register_custom_op( op_name: str, op_func: Callable, - mutates_args: List[str], + mutates_args: list[str], fake_impl: Optional[Callable] = None, target_lib: Optional[Library] = None, dispatch_key: str = "CUDA", @@ -2177,8 +2177,8 @@ def get_mp_context(): def bind_kv_cache( - ctx: Dict[str, Any], - kv_cache: List[List[torch.Tensor]], # [virtual_engine][layer_index] + ctx: dict[str, Any], + kv_cache: list[list[torch.Tensor]], # [virtual_engine][layer_index] ) -> None: # Bind the kv_cache tensor to Attention modules, similar to # ctx[layer_name].kv_cache[ve]=kv_cache[ve][extract_layer_index(layer_name)] @@ -2210,8 +2210,8 @@ def bind_kv_cache( forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx] -def run_method(obj: Any, method: Union[str, bytes, Callable], args: Tuple[Any], - kwargs: Dict[str, Any]) -> Any: +def run_method(obj: Any, method: Union[str, bytes, Callable], args: tuple[Any], + kwargs: dict[str, Any]) -> Any: """ Run a method of an object with the given arguments and keyword arguments. If the method is string, it will be converted to a method using getattr. @@ -2263,7 +2263,7 @@ def import_pynvml(): return pynvml -def warn_for_unimplemented_methods(cls: Type[T]) -> Type[T]: +def warn_for_unimplemented_methods(cls: type[T]) -> type[T]: """ A replacement for `abc.ABC`. When we use `abc.ABC`, subclasses will fail to instantiate diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 1922a3bf27247..ea044b7d4255c 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer with FlashAttention.""" from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import numpy as np import torch @@ -25,7 +25,7 @@ class FlashAttentionBackend(AttentionBackend): accept_output_buffer: bool = True @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return [32, 64, 96, 128, 160, 192, 224, 256] @staticmethod @@ -33,11 +33,11 @@ def get_name() -> str: return "FLASH_ATTN_VLLM_V1" @staticmethod - def get_impl_cls() -> Type["FlashAttentionImpl"]: + def get_impl_cls() -> type["FlashAttentionImpl"]: return FlashAttentionImpl @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return FlashAttentionMetadata @staticmethod @@ -46,7 +46,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: if block_size % 16 != 0: raise ValueError("Block size must be a multiple of 16.") return (2, num_blocks, block_size, num_kv_heads, head_size) @@ -93,10 +93,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, ) -> None: @@ -316,7 +316,7 @@ def cascade_attention( max_kv_len: int, softmax_scale: float, alibi_slopes: Optional[torch.Tensor], - sliding_window: Tuple[int, int], + sliding_window: tuple[int, int], logits_soft_cap: float, block_table: torch.Tensor, common_prefix_len: int, diff --git 
a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index 37bf33f6e3e91..e6ac2935fda21 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import torch import torch_xla.experimental.custom_kernel # Required to register custom ops. @@ -19,15 +19,15 @@ def get_name() -> str: return "PALLAS_VLLM_V1" @staticmethod - def get_impl_cls() -> Type["PallasAttentionBackendImpl"]: + def get_impl_cls() -> type["PallasAttentionBackendImpl"]: return PallasAttentionBackendImpl @staticmethod - def get_metadata_cls() -> Type["PallasMetadata"]: + def get_metadata_cls() -> type["PallasMetadata"]: return PallasMetadata @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -36,7 +36,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (num_kv_heads, num_blocks, block_size, head_size) @staticmethod @@ -50,8 +50,8 @@ def swap_blocks( @torch.compile(backend="openxla") @staticmethod def copy_blocks( - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - src_to_dists: Tuple[torch.Tensor, torch.Tensor], + kv_caches: list[tuple[torch.Tensor, torch.Tensor]], + src_to_dists: tuple[torch.Tensor, torch.Tensor], ) -> None: src_indices, dst_indices = src_to_dists for k_cache, v_cache in kv_caches: @@ -98,10 +98,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: @@ -157,7 +157,7 @@ def forward( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: Tuple[torch.Tensor, torch.Tensor], + kv_cache: tuple[torch.Tensor, torch.Tensor], attn_metadata: PallasMetadata, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py index 0f3fabf05fc28..3962aeff58e94 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer with PagedAttention on rocm""" -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import torch @@ -19,7 +19,7 @@ class ROCmAttentionBackend(AttentionBackend): accept_output_buffer: bool = True @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return [32, 64, 96, 128, 160, 192, 224, 256] @staticmethod @@ -27,11 +27,11 @@ def get_name() -> str: return "ROCM_ATTN_VLLM_V1" @staticmethod - def get_impl_cls() -> Type["ROCmAttentionImpl"]: + def get_impl_cls() -> type["ROCmAttentionImpl"]: return ROCmAttentionImpl @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return FlashAttentionMetadata @staticmethod @@ -40,7 +40,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: if 
block_size % 16 != 0: raise ValueError("Block size must be a multiple of 16.") return (2, num_blocks, block_size, num_kv_heads, head_size) @@ -58,10 +58,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, ) -> None: diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 13ad14e45b32e..018379c1f43af 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import TYPE_CHECKING, Dict, List, Set, Tuple +from typing import TYPE_CHECKING from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY @@ -18,9 +18,9 @@ def __init__(self, cache_size: int): self.cache_size = cache_size self.num_free_slots = cache_size # req_id -> cached input ids - self.cached: Dict[str, Set[int]] = {} - # List of [req_id, input_id] - self.freed: List[Tuple[str, int]] = [] + self.cached: dict[str, set[int]] = {} + # list of [req_id, input_id] + self.freed: list[tuple[str, int]] = [] def has_cache(self, request: Request, input_id: int) -> bool: req_id = request.request_id @@ -37,7 +37,7 @@ def allocate(self, request: Request, input_id: int) -> None: self.cached[req_id].add(input_id) self.num_free_slots -= request.get_num_encoder_tokens(input_id) - def get_cached_input_ids(self, request: Request) -> Set[int]: + def get_cached_input_ids(self, request: Request) -> set[int]: return self.cached.get(request.request_id, set()) def free_encoder_input(self, request: Request, input_id: int) -> None: @@ -58,7 +58,7 @@ def free(self, request: Request) -> None: for input_id in input_ids: self.free_encoder_input(request, input_id) - def get_freed_ids(self) -> List[Tuple[str, int]]: + def get_freed_ids(self) -> list[tuple[str, int]]: freed = self.freed self.freed = [] return freed @@ -67,7 +67,7 @@ def get_freed_ids(self) -> List[Tuple[str, int]]: def compute_encoder_budget( model_config: "ModelConfig", scheduler_config: "SchedulerConfig", -) -> Tuple[int, int]: +) -> tuple[int, int]: """Compute the encoder cache budget based on the model and scheduler configurations. @@ -97,7 +97,7 @@ def compute_encoder_budget( def _compute_encoder_budget_multimodal( model_config: "ModelConfig", scheduler_config: "SchedulerConfig", -) -> Tuple[int, int]: +) -> tuple[int, int]: """Compute the encoder cache budget based on the model and scheduler configurations for a multimodal model. diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 017e625dcdba8..1989d6bc17d1f 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 from collections import defaultdict -from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Optional from vllm.logger import init_logger from vllm.utils import cdiv @@ -50,7 +51,7 @@ def __init__( self.num_preallocate_blocks = cdiv(num_preallocate_tokens, block_size) # A Block pool of all kv-cache blocks. 
- self.block_pool: List[KVCacheBlock] = [ + self.block_pool: list[KVCacheBlock] = [ KVCacheBlock(idx) for idx in range(num_gpu_blocks) ] # Free block queue that constructs and manipulates a doubly linked @@ -67,26 +68,26 @@ def __init__( # if there is already an identical block in the cache. This is because # we want to make sure the allocated block IDs won't change so that # block tables are append-only. - self.cached_block_hash_to_block: Dict[BlockHashType, Dict[ + self.cached_block_hash_to_block: dict[BlockHashType, dict[ int, KVCacheBlock]] = defaultdict(dict) # Mapping from request ID to blocks to track the blocks allocated # for each request, so that we can free the blocks when the request # is finished. - self.req_to_blocks: DefaultDict[str, - List[KVCacheBlock]] = defaultdict(list) + self.req_to_blocks: defaultdict[str, + list[KVCacheBlock]] = defaultdict(list) # Mapping from request ID to kv block hashes. # This is to avoid recomputing the block hashes for each call of # `get_computed_blocks` or `allocate_slots`. - self.req_to_block_hashes: DefaultDict[ - str, List[BlockHashType]] = defaultdict(list) + self.req_to_block_hashes: defaultdict[ + str, list[BlockHashType]] = defaultdict(list) # {req_id: The number of cached blocks for this given request} # This is used to track the number of cached blocks for each request. # This is only used to track the RUNNING requests, we do not track the # data for reempted ones. - self.num_cached_block: Dict[str, int] = defaultdict(int) + self.num_cached_block: dict[str, int] = defaultdict(int) self.prefix_cache_stats = PrefixCacheStats() @property @@ -110,7 +111,7 @@ def make_prefix_cache_stats(self) -> PrefixCacheStats: return stats def get_computed_blocks( - self, request: Request) -> Tuple[List[KVCacheBlock], int]: + self, request: Request) -> tuple[list[KVCacheBlock], int]: """Get the computed (cached) blocks for the request. Note that the computed blocks must be full. @@ -158,8 +159,8 @@ def allocate_slots( self, request: Request, num_tokens: int, - new_computed_blocks: Optional[List[KVCacheBlock]] = None - ) -> Optional[List[KVCacheBlock]]: + new_computed_blocks: Optional[list[KVCacheBlock]] = None + ) -> Optional[list[KVCacheBlock]]: """Add slots for a request with new tokens to append. Args: @@ -367,7 +368,7 @@ def get_num_common_prefix_blocks( break return num_common_blocks - def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: + def _get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]: """Get new blocks from the free block pool. Note that we do not check block cache in this function. @@ -382,7 +383,7 @@ def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: raise ValueError( f"Cannot get {num_blocks} free blocks from the pool") - ret: List[KVCacheBlock] = [] + ret: list[KVCacheBlock] = [] idx = 0 while idx < num_blocks: # First allocate blocks. @@ -438,7 +439,7 @@ def _get_cached_block(self, return self.cached_block_hash_to_block[block_hash][first_block_id] return None - def _touch(self, blocks: List[KVCacheBlock]) -> None: + def _touch(self, blocks: list[KVCacheBlock]) -> None: """Touch a block increases its reference count by 1, and may remove the block from the free queue. This is used when a block is hit by another request with the same prefix. @@ -457,7 +458,7 @@ def _cache_full_blocks( self, request: Request, blk_start_idx: int, - full_blocks: List[KVCacheBlock], + full_blocks: list[KVCacheBlock], prev_block: Optional[KVCacheBlock], ) -> None: """Cache a list of full blocks for prefix caching. 
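`typing.DefaultDict` disappears because `collections.defaultdict` is itself subscriptable on Python 3.9+, as the `req_to_blocks` annotations above show. A minimal sketch with a trimmed-down, hypothetical block record:

```python
from collections import defaultdict
from dataclasses import dataclass


@dataclass
class KVCacheBlockSketch:
    # Hypothetical, trimmed-down block record.
    block_id: int


# collections.defaultdict can be used directly in the annotation.
req_to_blocks: defaultdict[str, list[KVCacheBlockSketch]] = defaultdict(list)

req_to_blocks["req-0"].append(KVCacheBlockSketch(block_id=3))
assert [b.block_id for b in req_to_blocks["req-0"]] == [3]
```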
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index e3eb6b24c1950..c52461851073f 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -3,7 +3,7 @@ from collections import deque from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, List, NamedTuple, Optional, Tuple +from typing import Any, NamedTuple, Optional from vllm.config import VllmConfig from vllm.logger import init_logger @@ -25,7 +25,7 @@ class BlockHashType(NamedTuple): # Hash value of the block in an integer. hash_value: int # Token IDs in the block. - token_ids: Tuple[int, ...] + token_ids: tuple[int, ...] # Extra keys for the block. extra_keys: Optional[Any] = None @@ -45,7 +45,7 @@ def __init__(self, interval: int = 1000): self.aggregated_query_total = 0 self.aggregated_query_hit = 0 # A deque of (requests, queries, hits) for the most recent requests. - self.query_queue: deque[Tuple[int, int, int]] = deque() + self.query_queue: deque[tuple[int, int, int]] = deque() def observe(self, stats: PrefixCacheStats): """Observe the prefix caching for a set of requests. @@ -151,7 +151,7 @@ class FreeKVCacheBlockQueue: blocks: A list of KVCacheBlock objects. """ - def __init__(self, blocks: List[KVCacheBlock]) -> None: + def __init__(self, blocks: list[KVCacheBlock]) -> None: self.num_free_blocks = len(blocks) # Initialize the doubly linked list of free blocks. @@ -220,7 +220,7 @@ def append(self, block: KVCacheBlock) -> None: block.next_free_block = None self.num_free_blocks += 1 - def get_all_free_blocks(self) -> List[KVCacheBlock]: + def get_all_free_blocks(self) -> list[KVCacheBlock]: """Get all free blocks in the free list. Mainly used for testing. Returns: @@ -251,7 +251,7 @@ def need_extra_keys(request: Request) -> bool: def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, end_token_idx: int, - start_mm_idx: int) -> Tuple[List[Any], int]: + start_mm_idx: int) -> tuple[list[Any], int]: """Generate extra keys related to MultiModal request for block hash computation. For multi-modal inputs, the extra keys are (mm_hash, start_offset) that indicate a mm input contained in the @@ -266,7 +266,7 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, Returns: A tuple of extra keys and the next multi-modal index. """ - extra_keys: List[Any] = [] + extra_keys: list[Any] = [] mm_positions, mm_hashes = request.mm_positions, request.mm_hashes if not mm_positions: @@ -318,7 +318,7 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, return extra_keys, curr_mm_idx -def _gen_lora_extra_hash_keys(request: Request) -> List[int]: +def _gen_lora_extra_hash_keys(request: Request) -> list[int]: """Generate extra keys related to LoRA for block hash computation. Args: @@ -335,7 +335,7 @@ def _gen_lora_extra_hash_keys(request: Request) -> List[int]: def generate_block_hash_extra_keys( request: Request, start_token_idx: int, end_token_idx: int, - start_mm_idx: int) -> Tuple[Optional[Tuple[Any, ...]], int]: + start_mm_idx: int) -> tuple[Optional[tuple[Any, ...]], int]: """Generate extra keys for the block hash. The extra keys can come from the multi-modal inputs and request specific metadata (e.g., LoRA ID). @@ -348,12 +348,12 @@ def generate_block_hash_extra_keys( Returns: A tuple of extra keys and the next multi-modal index. 
""" - mm_extra_keys: List[Any] + mm_extra_keys: list[Any] mm_extra_keys, new_start_mm_idx = _gen_mm_extra_hash_keys( request, start_token_idx, end_token_idx, start_mm_idx) - lora_extra_keys: List[int] = _gen_lora_extra_hash_keys(request) + lora_extra_keys: list[int] = _gen_lora_extra_hash_keys(request) - extra_keys: List[Any] = lora_extra_keys + mm_extra_keys + extra_keys: list[Any] = lora_extra_keys + mm_extra_keys if not extra_keys: return None, new_start_mm_idx @@ -364,7 +364,7 @@ def generate_block_hash_extra_keys( def hash_block_tokens( parent_block_hash: Optional[int], curr_block_token_ids: Sequence[int], - extra_keys: Optional[Tuple[Any, ...]] = None) -> BlockHashType: + extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHashType: """Computes a hash value corresponding to the contents of a block and the contents of the preceding block(s). The hash value is used for prefix caching. We use LRU cache for this function to avoid recomputing @@ -397,7 +397,7 @@ def hash_block_tokens( def hash_request_tokens(block_size: int, - request: Request) -> List[BlockHashType]: + request: Request) -> list[BlockHashType]: """Computes hash values of a chain of blocks given a sequence of token IDs. The hash value is used for prefix caching. @@ -541,8 +541,8 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, def get_kv_cache_configs(vllm_config: VllmConfig, - kv_cache_specs: List[KVCacheSpec], - available_memory: int) -> List[KVCacheConfig]: + kv_cache_specs: list[KVCacheSpec], + available_memory: int) -> list[KVCacheConfig]: """ Generates the KV cache configuration for a model TODO: support hybrid models with more than one type of KV cache. diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 87c9c0cd12b7b..db14c9455a1f3 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -2,7 +2,8 @@ import time from collections import deque -from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union from vllm.config import (CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig, SpeculativeConfig) @@ -57,24 +58,24 @@ def __init__( self.block_size = self.cache_config.block_size # req_id -> Request - self.requests: Dict[str, Request] = {} + self.requests: dict[str, Request] = {} # Priority queues for requests. - self.waiting: Deque[Request] = deque() - self.running: List[Request] = [] + self.waiting: deque[Request] = deque() + self.running: list[Request] = [] # The requests that have been scheduled and are being executed # by the executor. - self.scheduled_req_ids: Set[str] = set() + self.scheduled_req_ids: set[str] = set() # The request IDs that are finished in between the previous and the # current steps. This is used to notify the workers about the finished # requests so that they can free the cached states for those requests. # This is flushed at the end of each scheduling step. - self.finished_req_ids: Set[str] = set() + self.finished_req_ids: set[str] = set() # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating # them at each scheduling step. # Request id -> CachedRequestData - self._cached_reqs_data: Dict[str, CachedRequestData] = {} + self._cached_reqs_data: dict[str, CachedRequestData] = {} # Encoder-related. # Calculate encoder cache size if applicable @@ -108,19 +109,19 @@ def schedule(self) -> "SchedulerOutput": # chunked prefills, prefix caching, speculative decoding, # and the "jump decoding" optimization in the future. 
- scheduled_new_reqs: List[Request] = [] - scheduled_resumed_reqs: List[Request] = [] - scheduled_running_reqs: List[Request] = [] - preempted_reqs: List[Request] = [] + scheduled_new_reqs: list[Request] = [] + scheduled_resumed_reqs: list[Request] = [] + scheduled_running_reqs: list[Request] = [] + preempted_reqs: list[Request] = [] - req_to_new_block_ids: Dict[str, List[int]] = {} - num_scheduled_tokens: Dict[str, int] = {} + req_to_new_block_ids: dict[str, list[int]] = {} + num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Encoder-related. - scheduled_encoder_inputs: Dict[str, List[int]] = {} + scheduled_encoder_inputs: dict[str, list[int]] = {} encoder_budget = self.max_num_encoder_input_tokens # Spec decode-related. - scheduled_spec_decode_tokens: Dict[str, List[int]] = {} + scheduled_spec_decode_tokens: dict[str, list[int]] = {} # For logging. scheduled_timestamp = time.monotonic() @@ -211,7 +212,7 @@ def schedule(self) -> "SchedulerOutput": encoder_budget = new_encoder_budget # Record the LoRAs in scheduled_running_reqs - requested_loras: Set[int] = set() + requested_loras: set[int] = set() if self.lora_config: requested_loras = set( req.lora_request.lora_int_id for req in scheduled_running_reqs @@ -378,7 +379,7 @@ def _make_cached_request_data( request: Request, num_scheduled_tokens: int, num_scheduled_spec_tokens: int, - new_block_ids: List[int], + new_block_ids: list[int], resumed_from_preemption: bool, ) -> "CachedRequestData": # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating @@ -407,7 +408,7 @@ def _try_schedule_encoder_inputs( num_computed_tokens: int, num_new_tokens: int, encoder_budget: int, - ) -> Tuple[List[int], int, int]: + ) -> tuple[list[int], int, int]: """ Determine which encoder inputs need to be scheduled in the current step, and update `num_new_tokens` and encoder token budget accordingly. @@ -427,7 +428,7 @@ def _try_schedule_encoder_inputs( if not request.has_encoder_inputs(): return [], num_new_tokens, encoder_budget - encoder_inputs_to_schedule: List[int] = [] + encoder_inputs_to_schedule: list[int] = [] mm_positions = request.mm_positions assert mm_positions is not None assert len(mm_positions) > 0 @@ -482,8 +483,8 @@ def update_from_output( prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict num_scheduled_tokens = scheduler_output.num_scheduled_tokens - new_running: List[Request] = [] - outputs: List[EngineCoreOutput] = [] + new_running: list[Request] = [] + outputs: list[EngineCoreOutput] = [] # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below # loop can be a performance bottleneck. 
We should do our best to avoid @@ -543,7 +544,7 @@ def update_from_output( stopped = False new_logprobs = None - new_token_ids: List[int] = [] + new_token_ids: list[int] = [] if request.num_computed_tokens >= request.num_tokens: for output_token_id in generated_token_ids: diff --git a/vllm/v1/core/scheduler_output.py b/vllm/v1/core/scheduler_output.py index 47413527c32f2..b6caa8b4ebf73 100644 --- a/vllm/v1/core/scheduler_output.py +++ b/vllm/v1/core/scheduler_output.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple +from typing import TYPE_CHECKING, Optional if TYPE_CHECKING: from vllm.lora.request import LoRARequest @@ -15,13 +15,13 @@ class NewRequestData: req_id: str - prompt_token_ids: List[int] + prompt_token_ids: list[int] prompt: Optional[str] - mm_inputs: List["MultiModalKwargs"] - mm_hashes: List[str] - mm_positions: List["PlaceholderRange"] + mm_inputs: list["MultiModalKwargs"] + mm_hashes: list[str] + mm_positions: list["PlaceholderRange"] sampling_params: "SamplingParams" - block_ids: List[int] + block_ids: list[int] num_computed_tokens: int lora_request: Optional["LoRARequest"] @@ -29,7 +29,7 @@ class NewRequestData: def from_request( cls, request: "Request", - block_ids: List[int], + block_ids: list[int], ) -> "NewRequestData": return cls( req_id=request.request_id, @@ -53,8 +53,8 @@ class CachedRequestData: # the request's block IDs. If True, new_block_ids will be used as the # request's block IDs instead of appending to the existing block IDs. resumed_from_preemption: bool - new_token_ids: List[int] - new_block_ids: List[int] + new_token_ids: list[int] + new_block_ids: list[int] num_computed_tokens: int @classmethod @@ -62,8 +62,8 @@ def from_request( cls, request: "Request", resumed_from_preemption: bool, - new_token_ids: List[int], - new_block_ids: List[int], + new_token_ids: list[int], + new_block_ids: list[int], ) -> "CachedRequestData": return cls( req_id=request.request_id, @@ -77,29 +77,29 @@ def from_request( @dataclass class SchedulerOutput: - # List of the requests that are scheduled for the first time. + # list of the requests that are scheduled for the first time. # We cache the request's data in each worker process, so that we don't # need to re-send it every scheduling step. - scheduled_new_reqs: List[NewRequestData] - # List of the requests that have been scheduled before. + scheduled_new_reqs: list[NewRequestData] + # list of the requests that have been scheduled before. # Since the request's data is already cached in the worker processes, # we only send the diff to minimize the communication cost. - scheduled_cached_reqs: List[CachedRequestData] + scheduled_cached_reqs: list[CachedRequestData] # req_id -> num_scheduled_tokens # Number of tokens scheduled for each request. - num_scheduled_tokens: Dict[str, int] + num_scheduled_tokens: dict[str, int] # Total number of tokens scheduled for all requests. # Equal to sum(num_scheduled_tokens.values()) total_num_scheduled_tokens: int # req_id -> spec_token_ids # If a request does not have any spec decode tokens, it will not be # included in the dictionary. - scheduled_spec_decode_tokens: Dict[str, List[int]] + scheduled_spec_decode_tokens: dict[str, list[int]] # req_id -> encoder input indices that need processing. # E.g., if a request has [0, 1], it could mean the vision encoder needs # to process that the request's 0-th and 1-th images in the current step. 
- scheduled_encoder_inputs: Dict[str, List[int]] + scheduled_encoder_inputs: dict[str, list[int]] # Number of common prefix blocks for all requests. # This can be used for cascade attention. num_common_prefix_blocks: int @@ -107,7 +107,7 @@ class SchedulerOutput: # Request IDs that are finished in between the previous and the current # steps. This is used to notify the workers about the finished requests # so that they can free the cached states for those requests. - finished_req_ids: Set[str] - # List of (req_id, encoder_input_index) tuples. + finished_req_ids: set[str] + # list of (req_id, encoder_input_index) tuples. # Used to free the encoder cache. - free_encoder_input_ids: List[Tuple[str, int]] + free_encoder_input_ids: list[tuple[str, int]] diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 32fb3c5bd62e2..cd29c2d7d57c0 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -2,7 +2,7 @@ import enum import time -from typing import Any, List, Optional, Union +from typing import Any, Optional, Union import msgspec @@ -51,10 +51,10 @@ class EngineCoreRequest( # NOTE(ywang96): original text prompt is needed when a request is added to # Detokenizer, but set to None when it is added to EngineCoreClient. prompt: Optional[str] - prompt_token_ids: List[int] - mm_inputs: Optional[List[Optional[MultiModalKwargs]]] - mm_hashes: Optional[List[str]] - mm_placeholders: Optional[List[PlaceholderRange]] + prompt_token_ids: list[int] + mm_inputs: Optional[list[Optional[MultiModalKwargs]]] + mm_hashes: Optional[list[str]] + mm_placeholders: Optional[list[PlaceholderRange]] sampling_params: SamplingParams eos_token_id: Optional[int] arrival_time: float @@ -93,14 +93,14 @@ class EngineCoreOutput( gc=False): # type: ignore[call-arg] request_id: str - new_token_ids: List[int] + new_token_ids: list[int] new_logprobs: Optional[LogprobsLists] = None new_prompt_logprobs_tensors: Optional[LogprobsTensors] = None finish_reason: Optional[FinishReason] = None stop_reason: Union[int, str, None] = None - events: Optional[List[EngineCoreEvent]] = None + events: Optional[list[EngineCoreEvent]] = None @property def finished(self) -> bool: @@ -129,7 +129,7 @@ class EngineCoreOutputs( # e.g. 
columnwise layout # [num_reqs] - outputs: List[EngineCoreOutput] = [] + outputs: list[EngineCoreOutput] = [] scheduler_stats: Optional[SchedulerStats] = None timestamp: float = 0.0 diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 0c04e14cec2f6..ab3cdc4ee295d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,7 +2,8 @@ import asyncio import os -from typing import AsyncGenerator, List, Mapping, Optional, Set, Type, Union +from collections.abc import AsyncGenerator, Mapping +from typing import Optional, Union import numpy as np @@ -39,7 +40,7 @@ class AsyncLLM(EngineClient): def __init__( self, vllm_config: VllmConfig, - executor_class: Type[Executor], + executor_class: type[Executor], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, input_registry: InputRegistry = INPUT_REGISTRY, @@ -54,7 +55,7 @@ def __init__( self.log_requests = log_requests self.log_stats = log_stats - self.stat_loggers: List[StatLoggerBase] = [] + self.stat_loggers: list[StatLoggerBase] = [] if self.log_stats: self.stat_loggers.extend([ LoggingStatLogger(), @@ -400,7 +401,7 @@ async def remove_lora(self, lora_id: int) -> bool: """Remove an already loaded LoRA adapter.""" return await self.engine_core.remove_lora_async(lora_id) - async def list_loras(self) -> Set[int]: + async def list_loras(self) -> set[int]: """List all registered adapters.""" return await self.engine_core.list_loras_async() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 041896f1c7cc5..b9bf8fac40f60 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -7,7 +7,7 @@ from concurrent.futures import Future from inspect import isclass, signature from multiprocessing.connection import Connection -from typing import Any, List, Optional, Set, Tuple, Type +from typing import Any, Optional import msgspec import psutil @@ -42,7 +42,7 @@ class EngineCore: def __init__( self, vllm_config: VllmConfig, - executor_class: Type[Executor], + executor_class: type[Executor], log_stats: bool, ): assert vllm_config.model_config.runner_type != "pooling" @@ -80,7 +80,7 @@ def __init__( # schedule and execute batches, and is required by pipeline parallelism # to eliminate pipeline bubbles. 
self.batch_queue_size = self.model_executor.max_concurrent_batches - self.batch_queue: Optional[queue.Queue[Tuple[Future[ModelRunnerOutput], + self.batch_queue: Optional[queue.Queue[tuple[Future[ModelRunnerOutput], SchedulerOutput]]] = None if self.batch_queue_size > 1: logger.info("Batch queue is enabled with size %d", @@ -88,7 +88,7 @@ def __init__( self.batch_queue = queue.Queue(self.batch_queue_size) def _initialize_kv_caches(self, - vllm_config: VllmConfig) -> Tuple[int, int]: + vllm_config: VllmConfig) -> tuple[int, int]: start = time.time() # Get all kv cache needed by the model @@ -134,7 +134,7 @@ def add_request(self, request: EngineCoreRequest): self.scheduler.add_request(req) - def abort_requests(self, request_ids: List[str]): + def abort_requests(self, request_ids: list[str]): """Abort requests from the scheduler.""" # TODO: The scheduler doesn't really need to know the @@ -228,7 +228,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.model_executor.remove_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self.model_executor.list_loras() def pin_lora(self, lora_id: int) -> bool: @@ -244,7 +244,7 @@ def __init__( output_path: str, ready_pipe: Connection, vllm_config: VllmConfig, - executor_class: Type[Executor], + executor_class: type[Executor], log_stats: bool, ): super().__init__(vllm_config, executor_class, log_stats) @@ -254,7 +254,7 @@ def __init__( # and to overlap some serialization/deserialization with the # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. - self.input_queue: queue.Queue[Tuple[EngineCoreRequestType, + self.input_queue: queue.Queue[tuple[EngineCoreRequestType, Any]] = queue.Queue() self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() threading.Thread(target=self.process_input_socket, diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 9f36e11d12d76..cdce14afe0b3f 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -10,7 +10,7 @@ from concurrent.futures import Future from dataclasses import dataclass from threading import Thread -from typing import Any, Dict, List, Optional, Set, Type, Union +from typing import Any, Optional, Union import zmq import zmq.asyncio @@ -48,7 +48,7 @@ def make_client( multiprocess_mode: bool, asyncio_mode: bool, vllm_config: VllmConfig, - executor_class: Type[Executor], + executor_class: type[Executor], log_stats: bool, ) -> "EngineCoreClient": @@ -94,7 +94,7 @@ def execute_dummy_batch(self) -> None: async def execute_dummy_batch_async(self) -> None: raise NotImplementedError - def abort_requests(self, request_ids: List[str]) -> None: + def abort_requests(self, request_ids: list[str]) -> None: raise NotImplementedError def add_lora(self, lora_request: LoRARequest) -> bool: @@ -103,7 +103,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: raise NotImplementedError def pin_lora(self, lora_id: int) -> bool: @@ -127,7 +127,7 @@ async def sleep_async(self, level: int = 1) -> None: async def wake_up_async(self) -> None: raise NotImplementedError - async def abort_requests_async(self, request_ids: List[str]) -> None: + async def abort_requests_async(self, request_ids: list[str]) -> None: raise NotImplementedError async def add_lora_async(self, lora_request: LoRARequest) -> 
bool: @@ -136,7 +136,7 @@ async def add_lora_async(self, lora_request: LoRARequest) -> bool: async def remove_lora_async(self, lora_id: int) -> bool: raise NotImplementedError - async def list_loras_async(self) -> Set[int]: + async def list_loras_async(self) -> set[int]: raise NotImplementedError async def pin_lora_async(self, lora_id: int) -> bool: @@ -162,7 +162,7 @@ def get_output(self) -> EngineCoreOutputs: def add_request(self, request: EngineCoreRequest) -> None: self.engine_core.add_request(request) - def abort_requests(self, request_ids: List[str]) -> None: + def abort_requests(self, request_ids: list[str]) -> None: if len(request_ids) > 0: self.engine_core.abort_requests(request_ids) @@ -190,7 +190,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.engine_core.remove_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self.engine_core.list_loras() def pin_lora(self, lora_id: int) -> bool: @@ -239,7 +239,7 @@ def __init__( self, asyncio_mode: bool, vllm_config: VllmConfig, - executor_class: Type[Executor], + executor_class: type[Executor], log_stats: bool, ): # The child processes will send SIGUSR1 when unrecoverable @@ -293,14 +293,14 @@ def sigusr1_handler(signum, frame): self.output_socket = resources.output_socket self.input_socket = resources.input_socket - self.utility_results: Dict[int, AnyFuture] = {} + self.utility_results: dict[int, AnyFuture] = {} def shutdown(self): self._finalizer() def _process_utility_output(output: UtilityOutput, - utility_results: Dict[int, AnyFuture]): + utility_results: dict[int, AnyFuture]): """Set the result from a utility method in the waiting future""" future = utility_results.pop(output.call_id) if output.failure_message is not None: @@ -312,7 +312,7 @@ def _process_utility_output(output: UtilityOutput, class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], + def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool): super().__init__( asyncio_mode=False, @@ -373,7 +373,7 @@ def add_request(self, request: EngineCoreRequest) -> None: request.prompt = None self._send_input(EngineCoreRequestType.ADD, request) - def abort_requests(self, request_ids: List[str]) -> None: + def abort_requests(self, request_ids: list[str]) -> None: if len(request_ids) > 0: self._send_input(EngineCoreRequestType.ABORT, request_ids) @@ -389,7 +389,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self._call_utility("remove_lora", lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self._call_utility("list_loras") def pin_lora(self, lora_id: int) -> bool: @@ -408,7 +408,7 @@ def execute_dummy_batch(self) -> None: class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], + def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool): super().__init__( asyncio_mode=True, @@ -471,7 +471,7 @@ async def add_request_async(self, request: EngineCoreRequest) -> None: request.prompt = None await self._send_input(EngineCoreRequestType.ADD, request) - async def abort_requests_async(self, request_ids: List[str]) -> None: + async def abort_requests_async(self, request_ids: list[str]) -> None: if 
len(request_ids) > 0: await self._send_input(EngineCoreRequestType.ABORT, request_ids) @@ -496,7 +496,7 @@ async def add_lora_async(self, lora_request: LoRARequest) -> bool: async def remove_lora_async(self, lora_id: int) -> bool: return await self._call_utility_async("remove_lora", lora_id) - async def list_loras_async(self) -> Set[int]: + async def list_loras_async(self) -> set[int]: return await self._call_utility_async("list_loras") async def pin_lora_async(self, lora_id: int) -> bool: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 629da06f4925b..4a1636f494956 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import List, Optional +from typing import Optional from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger @@ -17,12 +17,12 @@ class IncrementalDetokenizer: # Generation data output_text: str - tokens: List[str] - token_ids: List[int] + tokens: list[str] + token_ids: list[int] prompt_len: int # Stop strings - stop: List[str] + stop: list[str] include_stop_str_in_output: bool # Metadata for incremental detokenization @@ -41,7 +41,7 @@ class IncrementalDetokenizer: _last_output_text_offset: int = 0 @property - def output_token_ids(self) -> List[int]: + def output_token_ids(self) -> list[int]: return self.token_ids[self.prompt_len:] @classmethod @@ -84,7 +84,7 @@ def from_new_request( stop_buffer_length=stop_buffer_length, ) - def update(self, new_token_ids: List[int]) -> Optional[str]: + def update(self, new_token_ids: list[int]) -> Optional[str]: """ Update RequestState for the request_id by: 1) Detokenize the new token ids incrementally. diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index ccf52250c1d6f..2e76694a7f512 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Mapping, Optional, Set, Type, Union +from collections.abc import Mapping +from typing import Optional, Union from typing_extensions import TypeVar @@ -36,10 +37,10 @@ class LLMEngine: def __init__( self, vllm_config: VllmConfig, - executor_class: Type[Executor], + executor_class: type[Executor], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, @@ -97,7 +98,7 @@ def from_engine_args( cls, engine_args: EngineArgs, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, enable_multiprocessing: bool = False, ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" @@ -139,7 +140,7 @@ def has_unfinished_requests_dp(self, has_unfinished: bool) -> bool: def validate_outputs(cls, outputs, output_type): return outputs - def abort_request(self, request_ids: List[str]) -> None: + def abort_request(self, request_ids: list[str]) -> None: """Remove request_ids from EngineCore and Detokenizer.""" self.engine_core.abort_requests(request_ids) @@ -199,7 +200,7 @@ def _add_request( # 3) Add the request to EngineCore. 
self.engine_core.add_request(request) - def step(self) -> List[RequestOutput]: + def step(self) -> list[RequestOutput]: if self.should_execute_dummy_batch: self.should_execute_dummy_batch = False @@ -241,7 +242,7 @@ def wake_up(self): def get_tokenizer_group( self, - group_type: Type[_G] = BaseTokenizerGroup, + group_type: type[_G] = BaseTokenizerGroup, ) -> _G: tokenizer_group = self.tokenizer @@ -263,7 +264,7 @@ def remove_lora(self, lora_id: int) -> bool: """Remove an already loaded LoRA adapter.""" return self.engine_core.remove_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: """List all registered adapters.""" return self.engine_core.list_loras() diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index 4622cafa4a028..7f572163ead4f 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -2,7 +2,7 @@ import itertools from dataclasses import dataclass -from typing import Dict, List, Optional +from typing import Optional from vllm.logger import init_logger from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs @@ -151,12 +151,12 @@ def pop_prompt_logprobs(self) -> Optional[PromptLogprobs]: @staticmethod def _make_logprob_dict( - logprobs: List[float], - logprob_token_ids: List[int], - decoded_tokens: List[str], + logprobs: list[float], + logprob_token_ids: list[int], + decoded_tokens: list[str], rank: int, num_logprobs: int, - ) -> Dict[int, Logprob]: + ) -> dict[int, Logprob]: """Make a Logprob dictionary for a position. Args: @@ -168,7 +168,7 @@ def _make_logprob_dict( by the user (in addition to sampled logprob) Returns: - Dict[token id, Logprob] + dict[token id, Logprob] """ # We do not need a special case for the sampled token diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index a1d802bf818a2..0f66f68109b17 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Optional from vllm.config import ModelConfig from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE @@ -68,10 +68,10 @@ def cache_hit_ratio(self, steps): def process_inputs( self, mm_data: MultiModalDataDict, - mm_hashes: Optional[List[str]], - mm_processor_kwargs: Optional[Dict[str, Any]], - precomputed_mm_inputs: Optional[List[MultiModalKwargs]], - ) -> List[MultiModalKwargs]: + mm_hashes: Optional[list[str]], + mm_processor_kwargs: Optional[dict[str, Any]], + precomputed_mm_inputs: Optional[list[MultiModalKwargs]], + ) -> list[MultiModalKwargs]: if precomputed_mm_inputs is None: image_inputs = mm_data["image"] if not isinstance(image_inputs, list): @@ -88,7 +88,7 @@ def process_inputs( # Process each image input separately, so that later we can schedule # them in a fine-grained manner. 
# Apply caching (if enabled) and reuse precomputed inputs (if provided) - ret_inputs: List[MultiModalKwargs] = [] + ret_inputs: list[MultiModalKwargs] = [] for input_id in range(num_inputs): if self.mm_debug_cache_hit_ratio_steps is not None: self.cache_hit_ratio(self.mm_debug_cache_hit_ratio_steps) @@ -133,9 +133,9 @@ def __init__(self, model_config): def get_and_update( self, - mm_inputs: List[Optional[MultiModalKwargs]], - mm_hashes: List[str], - ) -> List[MultiModalKwargs]: + mm_inputs: list[Optional[MultiModalKwargs]], + mm_hashes: list[str], + ) -> list[MultiModalKwargs]: assert len(mm_inputs) == len(mm_hashes) if not self.use_cache: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 9ae8303df54df..22bbb8a0f5b47 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -2,7 +2,7 @@ import asyncio from dataclasses import dataclass -from typing import Dict, List, Optional, Union +from typing import Optional, Union from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind @@ -18,8 +18,8 @@ @dataclass class OutputProcessorOutput: - request_outputs: List[RequestOutput] - reqs_to_abort: List[str] + request_outputs: list[RequestOutput] + reqs_to_abort: list[str] class RequestState: @@ -30,7 +30,7 @@ def __init__( lora_name: Optional[str], output_kind: RequestOutputKind, prompt: Optional[str], - prompt_token_ids: List[int], + prompt_token_ids: list[int], logprobs_processor: LogprobsProcessor, detokenizer: IncrementalDetokenizer, arrival_time: float, @@ -90,7 +90,7 @@ def __init__( ): self.log_stats = log_stats self.tokenizer = tokenizer - self.request_states: Dict[str, RequestState] = {} + self.request_states: dict[str, RequestState] = {} self.lora_states = LoRARequestStates() def is_request_active(self, request_id: str) -> bool: @@ -104,7 +104,7 @@ def has_unfinished_requests(self) -> bool: def abort_requests( self, - request_ids: List[str], + request_ids: list[str], ) -> None: for request_id in request_ids: req_state = self.request_states.pop(request_id, None) @@ -130,7 +130,7 @@ def add_request( def process_outputs( self, - engine_core_outputs: List[EngineCoreOutput], + engine_core_outputs: list[EngineCoreOutput], engine_core_timestamp: Optional[float] = None, iteration_stats: Optional[IterationStats] = None, ) -> OutputProcessorOutput: @@ -158,8 +158,8 @@ def process_outputs( ********************************************************** """ - request_outputs: List[RequestOutput] = [] - reqs_to_abort: List[str] = [] + request_outputs: list[RequestOutput] = [] + reqs_to_abort: list[str] = [] for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id req_state = self.request_states.get(req_id) @@ -265,7 +265,7 @@ def _update_stats_from_finished(self, req_state: RequestState, @staticmethod def _make_request_output( request_state: RequestState, - new_token_ids: List[int], + new_token_ids: list[int], finish_reason: Optional[FinishReason], stop_reason: Union[int, str, None], ) -> Optional[RequestOutput]: diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py index 5d4ea111abfc9..291360771b54f 100644 --- a/vllm/v1/engine/parallel_sampling.py +++ b/vllm/v1/engine/parallel_sampling.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import AsyncGenerator, Mapping from copy import copy -from typing import (AsyncGenerator, Dict, List, Mapping, Optional, Protocol, - Tuple, Union) +from typing import Optional, 
Protocol, Union from vllm.inputs import PromptType from vllm.lora.request import LoRARequest @@ -137,7 +137,7 @@ def _get_final_request_output(self) -> RequestOutput: key=lambda x: x.index) return self.request_output - def get_child_info(self, index: int) -> Tuple[str, SamplingParams]: + def get_child_info(self, index: int) -> tuple[str, SamplingParams]: """Get child request ID and sampling params. Args: @@ -237,9 +237,9 @@ class SyncParallelSamplingManager: def __init__(self): # Parent req ID -> parent request manager - self.parent_reqs: Dict[str, ParallelSamplingRequest] = {} + self.parent_reqs: dict[str, ParallelSamplingRequest] = {} # Child req ID -> (child req index, parent req ID) - self.child_reqs: Dict[str, Tuple[int, str]] = {} + self.child_reqs: dict[str, tuple[int, str]] = {} def _register_parent_request(self, req: ParallelSamplingRequest) -> None: """Register parallel sampling parent request.""" @@ -299,8 +299,8 @@ def add_request_parallel_sampling( def step( self, - outputs: List[RequestOutput], - ) -> List[RequestOutput]: + outputs: list[RequestOutput], + ) -> list[RequestOutput]: """Build parallel sampling request outputs. Extract child request outputs, aggregate them @@ -355,7 +355,7 @@ async def generate_parallel_sampling_async( parent_req = ParallelSamplingRequest(request_id, sampling_params) # Aggregate generators for n child requests - gens: List[AsyncGenerator[RequestOutput, None]] = [] + gens: list[AsyncGenerator[RequestOutput, None]] = [] for idx in range(parent_req.n): child_req_id, child_params = parent_req.get_child_info(idx) child_gen = generate( diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 2547cebaede7c..3a3fc69e53e44 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import time -from typing import Mapping, Optional, Union +from collections.abc import Mapping +from typing import Optional, Union from vllm.config import CacheConfig, LoRAConfig, ModelConfig from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 11002ad0022df..aa6ae83c26ea7 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from concurrent.futures import Future -from typing import List, Type, Union +from typing import Union import torch import torch.distributed as dist @@ -22,8 +22,8 @@ class Executor(ExecutorBase): For methods shared by v0 and v1, define them in ExecutorBase""" @staticmethod - def get_class(vllm_config: VllmConfig) -> Type["Executor"]: - executor_class: Type[Executor] + def get_class(vllm_config: VllmConfig) -> type["Executor"]: + executor_class: type[Executor] parallel_config = vllm_config.parallel_config distributed_executor_backend = ( parallel_config.distributed_executor_backend) @@ -53,7 +53,7 @@ def get_class(vllm_config: VllmConfig) -> Type["Executor"]: return executor_class def initialize_from_config(self, - kv_cache_configs: List[KVCacheConfig]) -> None: + kv_cache_configs: list[KVCacheConfig]) -> None: """ Initialize the KV caches and begin the model execution loop of the underlying workers. @@ -69,7 +69,7 @@ def determine_available_memory(self) -> int: # in bytes # operators can be applied to all workers. 
return min(output) - def get_kv_cache_specs(self) -> List[KVCacheSpec]: + def get_kv_cache_specs(self) -> list[KVCacheSpec]: output = self.collective_rpc("get_kv_cache_spec") return output diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index d4582122fa6d4..79a36505e3901 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -10,7 +10,7 @@ from enum import Enum, auto from functools import partial from multiprocessing.process import BaseProcess -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union import cloudpickle import psutil @@ -77,7 +77,7 @@ def sigusr1_handler(signum, frame): scheduler_output_handle = self.rpc_broadcast_mq.export_handle() # Create workers - self.workers: List[WorkerProcHandle] = [] + self.workers: list[WorkerProcHandle] = [] for rank in range(self.world_size): worker = WorkerProc.make_worker_process(self.vllm_config, rank, rank, @@ -94,8 +94,8 @@ def sigusr1_handler(signum, frame): def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + args: tuple = (), + kwargs: Optional[dict] = None) -> list[Any]: start_time = time.monotonic() kwargs = kwargs or {} @@ -208,7 +208,7 @@ def __init__( self.rank = rank wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) # TODO: move `init_worker` to executor level as a collective rpc call - all_kwargs: List[Dict] = [ + all_kwargs: list[dict] = [ {} for _ in range(vllm_config.parallel_config.world_size) ] all_kwargs[rank] = { diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index eddfb5949ebe6..dfef1039fce2e 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List import torch @@ -74,7 +73,7 @@ def bytes_for_tokens(self, num_tokens: int) -> int: return cdiv(num_tokens, self.block_size) * self.page_size_bytes -KVCacheSpec = Dict[str, KVCacheSpecBase] +KVCacheSpec = dict[str, KVCacheSpecBase] @dataclass @@ -95,7 +94,7 @@ class KVCacheConfig: """The number of KV cache blocks""" num_blocks: int """layer_name -> how to initialize KV cache for that layer""" - tensors: Dict[str, KVCacheTensor] + tensors: dict[str, KVCacheTensor] """ A list of kv-cache groups. Each group includes a set of layers with the same kv-cache spec, and the total page_size of layers inside a group @@ -108,6 +107,6 @@ class KVCacheConfig: 3. (not implemented yet) A model with 2 full attention layers and 4 sliding window attention layers: three groups, (full * 2), (sw * 2), (sw * 2). """ - groups: List[List[str]] + groups: list[list[str]] """the KVCacheSpec of the model""" kv_cache_spec: KVCacheSpec diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 40dfc56616720..5a2a1c30a9d58 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -2,7 +2,7 @@ import time from abc import ABC, abstractmethod -from typing import Dict, List, Optional +from typing import Optional import numpy as np import prometheus_client @@ -35,8 +35,8 @@ def _reset(self, now): self.last_log_time = now # Tracked stats over current local logging interval. 
- self.num_prompt_tokens: List[int] = [] - self.num_generation_tokens: List[int] = [] + self.num_prompt_tokens: list[int] = [] + self.num_generation_tokens: list[int] = [] # Prefix cache metrics. TODO: Make the interval configurable. self.prefix_caching_metrics = PrefixCachingMetrics() @@ -52,7 +52,7 @@ def _track_iteration_stats(self, iteration_stats: IterationStats): self.num_generation_tokens.append( iteration_stats.num_generation_tokens) - def _get_throughput(self, tracked_stats: List[int], now: float) -> float: + def _get_throughput(self, tracked_stats: list[int], now: float) -> float: # Compute summary metrics for tracked stats return float(np.sum(tracked_stats) / (now - self.last_log_time)) @@ -147,7 +147,7 @@ def __init__(self, vllm_config: VllmConfig): documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) - self.counter_request_success: Dict[FinishReason, + self.counter_request_success: dict[FinishReason, prometheus_client.Counter] = {} counter_request_success_base = prometheus_client.Counter( name="vllm:request_success_total", @@ -338,14 +338,14 @@ def _unregister_vllm_metrics(): prometheus_client.REGISTRY.unregister(collector) -def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: +def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]: """ Builds a list of buckets with increasing powers of 10 multiplied by mantissa values until the value exceeds the specified maximum. """ exponent = 0 - buckets: List[int] = [] + buckets: list[int] = [] while True: for m in mantissa_lst: value = m * 10**exponent @@ -356,7 +356,7 @@ def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: exponent += 1 -def build_1_2_5_buckets(max_value: int) -> List[int]: +def build_1_2_5_buckets(max_value: int) -> list[int]: """ Example: >>> build_1_2_5_buckets(100) @@ -365,7 +365,7 @@ def build_1_2_5_buckets(max_value: int) -> List[int]: return build_buckets([1, 2, 5], max_value) -def build_cudagraph_buckets(vllm_config: VllmConfig) -> List[int]: +def build_cudagraph_buckets(vllm_config: VllmConfig) -> list[int]: if not vllm_config.model_config.enforce_eager: buckets = vllm_config.compilation_config.\ cudagraph_capture_sizes.copy() diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 30f460e5a6918..625edb607467b 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -2,7 +2,7 @@ import time from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Dict, List, Optional, Set +from typing import TYPE_CHECKING, Optional if TYPE_CHECKING: from vllm.outputs import RequestOutput @@ -39,8 +39,8 @@ class SchedulerStats: @dataclass class LoRAStats: - waiting_requests: Set[str] = field(default_factory=set) - running_requests: Set[str] = field(default_factory=set) + waiting_requests: set[str] = field(default_factory=set) + running_requests: set[str] = field(default_factory=set) @dataclass @@ -81,11 +81,11 @@ def __init__(self): self.num_generation_tokens = 0 self.num_prompt_tokens = 0 self.num_preempted_reqs = 0 - self.finished_requests: List[FinishedRequestStats] = [] - self.time_to_first_tokens_iter: List[float] = [] - self.time_per_output_tokens_iter: List[float] = [] - self.waiting_lora_adapters: Dict[str, int] = {} - self.running_lora_adapters: Dict[str, int] = {} + self.finished_requests: list[FinishedRequestStats] = [] + self.time_to_first_tokens_iter: list[float] = [] + self.time_per_output_tokens_iter: list[float] = [] + self.waiting_lora_adapters: dict[str, int] = 
{} + self.running_lora_adapters: dict[str, int] = {} def _time_since(self, start: float) -> float: """Calculate an interval relative to this iteration's timestamp.""" @@ -132,7 +132,7 @@ def update_from_output(self, output: "EngineCoreOutput", if num_new_generation_tokens > 0: req_stats.last_token_ts = engine_core_timestamp - def update_from_events(self, req_id: str, events: List["EngineCoreEvent"], + def update_from_events(self, req_id: str, events: list["EngineCoreEvent"], is_prefilling: bool, req_stats: RequestStateStats, lora_stats: Optional[LoRAStats]): # Avoid circular dependency @@ -185,7 +185,7 @@ class LoRARequestStates: """Per-LoRA request state stats.""" def __init__(self): - self.lora_name_to_stats: Dict[str, LoRAStats] = {} + self.lora_name_to_stats: dict[str, LoRAStats] = {} def get_stats(self, req_state: 'RequestState') -> Optional[LoRAStats]: if req_state.lora_name is None: diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 0c8eca38ade7a..ed02aa5e71111 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List, NamedTuple, Optional +from typing import NamedTuple, Optional import torch @@ -9,11 +9,11 @@ class LogprobsLists(NamedTuple): # [num_reqs, max_num_logprobs + 1] - logprob_token_ids: List[List[int]] + logprob_token_ids: list[list[int]] # [num_reqs, max_num_logprobs + 1] - logprobs: List[List[float]] + logprobs: list[list[float]] # [num_reqs] - sampled_token_ranks: List[int] + sampled_token_ranks: list[int] def slice(self, start: int, end: int): return LogprobsLists( @@ -52,23 +52,23 @@ class SamplerOutput: # ModelRunnerOutput is serialized and sent to the scheduler process. -# This is expensive for torch.Tensor so prefer to use List instead. +# This is expensive for torch.Tensor so prefer to use list instead. @dataclass class ModelRunnerOutput: # [num_reqs] - req_ids: List[str] + req_ids: list[str] # req_id -> index - req_id_to_index: Dict[str, int] + req_id_to_index: dict[str, int] # num_reqs x num_generated_tokens # num_generated_tokens is the number of tokens # generated in the current step. It can be different for # each request due to speculative/jump decoding. 
- sampled_token_ids: List[List[int]] + sampled_token_ids: list[list[int]] # num_reqs x num_spec_tokens - spec_token_ids: Optional[List[List[int]]] + spec_token_ids: Optional[list[list[int]]] # [num_reqs, max_num_logprobs + 1] # [num_reqs, max_num_logprobs + 1] @@ -79,4 +79,4 @@ class ModelRunnerOutput: # [prompt_len, num_prompt_logprobs] # [prompt_len, num_prompt_logprobs] # [prompt_len] - prompt_logprobs_dict: Dict[str, LogprobsTensors] + prompt_logprobs_dict: dict[str, LogprobsTensors] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 52d7faeeb0664..99df547348360 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import enum -from typing import TYPE_CHECKING, List, Optional, Union +from typing import TYPE_CHECKING, Optional, Union from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams @@ -20,10 +20,10 @@ def __init__( self, request_id: str, prompt: Optional[str], - prompt_token_ids: List[int], - multi_modal_inputs: Optional[List["MultiModalKwargs"]], - multi_modal_hashes: Optional[List[str]], - multi_modal_placeholders: Optional[List["PlaceholderRange"]], + prompt_token_ids: list[int], + multi_modal_inputs: Optional[list["MultiModalKwargs"]], + multi_modal_hashes: Optional[list[str]], + multi_modal_placeholders: Optional[list["PlaceholderRange"]], sampling_params: SamplingParams, eos_token_id: Optional[int], arrival_time: float, @@ -36,7 +36,7 @@ def __init__( self.lora_request = lora_request self.status = RequestStatus.WAITING - self.events: List[EngineCoreEvent] = [] + self.events: list[EngineCoreEvent] = [] self.stop_reason: Union[int, str, None] = None assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens @@ -44,15 +44,15 @@ def __init__( self.prompt = prompt self.prompt_token_ids = prompt_token_ids self.num_prompt_tokens = len(self.prompt_token_ids) - self._output_token_ids: List[int] = [] - self._all_token_ids: List[int] = self.prompt_token_ids.copy() - self.spec_token_ids: List[int] = [] + self._output_token_ids: list[int] = [] + self._all_token_ids: list[int] = self.prompt_token_ids.copy() + self.spec_token_ids: list[int] = [] self.num_computed_tokens = 0 # Multi-modal related self.mm_positions = multi_modal_placeholders or [] self.mm_inputs = multi_modal_inputs or [] - self.mm_hashes: List[str] = multi_modal_hashes or [] + self.mm_hashes: list[str] = multi_modal_hashes or [] # Sanity check assert len(self.mm_inputs) == len(self.mm_positions) @@ -89,7 +89,7 @@ def scheduled(self, timestamp: Optional[float] = None) -> None: EngineCoreEvent.new_event(EngineCoreEventType.SCHEDULED, timestamp)) - def take_events(self) -> Optional[List[EngineCoreEvent]]: + def take_events(self) -> Optional[list[EngineCoreEvent]]: if not self.events: return None events, self.events = self.events, [] @@ -97,7 +97,7 @@ def take_events(self) -> Optional[List[EngineCoreEvent]]: def append_output_token_ids( self, - token_ids: Union[int, List[int]], + token_ids: Union[int, list[int]], ) -> None: if isinstance(token_ids, int): token_ids = [token_ids] diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index b757a1dc60c74..55d9739b80073 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List, Optional, Set, Tuple +from typing import Optional import torch @@ -17,7 +17,7 @@ class SamplingMetadata: top_k: 
Optional[torch.Tensor] min_p: Optional[torch.Tensor] - generators: Dict[int, torch.Generator] + generators: dict[int, torch.Generator] # None means no logprobs, 0 means sampled token logprobs only max_num_logprobs: Optional[int] @@ -28,12 +28,12 @@ class SamplingMetadata: presence_penalties: torch.Tensor repetition_penalties: torch.Tensor - output_token_ids: List[List[int]] + output_token_ids: list[list[int]] # req_index -> (min_tokens, stop_token_ids) - min_tokens: Dict[int, Tuple[int, Set[int]]] + min_tokens: dict[int, tuple[int, set[int]]] - logit_bias: List[Optional[Dict[int, float]]] + logit_bias: list[Optional[dict[int, float]]] # `allowed_token_ids_mask` is a 2D bool tensor of shape (max batch size, # vocab size). diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py index 8d9f6529fa0bd..ed05e3f48401a 100644 --- a/vllm/v1/sample/ops/penalties.py +++ b/vllm/v1/sample/ops/penalties.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Set, Tuple - import torch from vllm.model_executor.layers.utils import apply_penalties @@ -9,13 +7,13 @@ def apply_min_token_penalties( - logits: torch.Tensor, output_token_ids: List[List[int]], - min_tokens: Dict[int, Tuple[int, Set[int]]]) -> None: + logits: torch.Tensor, output_token_ids: list[list[int]], + min_tokens: dict[int, tuple[int, set[int]]]) -> None: """ Applies minimum token penalty by setting the logits of the stop tokens to -inf. """ - min_tokens_logits_to_penalize: List[Tuple[int, int]] = [] + min_tokens_logits_to_penalize: list[tuple[int, int]] = [] for index, (min_token, stop_token_ids) in min_tokens.items(): if len(output_token_ids[index]) < min_token: for stop_token_id in stop_token_ids: @@ -30,7 +28,7 @@ def apply_all_penalties( presence_penalties: torch.Tensor, frequency_penalties: torch.Tensor, repetition_penalties: torch.Tensor, - output_token_ids: List[List[int]], + output_token_ids: list[list[int]], ) -> torch.Tensor: """ Applies presence, frequency and repetition penalties to the logits. @@ -43,7 +41,7 @@ def apply_all_penalties( repetition_penalties) -def _convert_to_tensors(output_token_ids: List[List[int]], vocab_size: int, +def _convert_to_tensors(output_token_ids: list[list[int]], vocab_size: int, device: torch.device) -> torch.Tensor: """ Convert the different list data structures to tensors. diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 78c88ad8b8305..1bb950be822c1 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, Optional +from typing import Optional import torch import torch.nn as nn @@ -54,7 +54,7 @@ def __init__(self): def forward_native( self, logits: torch.Tensor, - generators: Dict[int, torch.Generator], + generators: dict[int, torch.Generator], k: Optional[torch.Tensor], p: Optional[torch.Tensor], ) -> torch.Tensor: @@ -66,7 +66,7 @@ def forward_native( def forward_cuda( self, logits: torch.Tensor, - generators: Dict[int, torch.Generator], + generators: dict[int, torch.Generator], k: Optional[torch.Tensor], p: Optional[torch.Tensor], ) -> torch.Tensor: @@ -117,7 +117,7 @@ def apply_top_k_top_p( def random_sample( probs: torch.Tensor, - generators: Dict[int, torch.Generator], + generators: dict[int, torch.Generator], ) -> torch.Tensor: """Randomly sample from the probabilities. 
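Besides the builtin generics, several files in this patch (scheduler.py, async_llm.py, llm_engine.py, parallel_sampling.py, processor.py above) move the abstract container types such as Iterable, Mapping, Sequence, and AsyncGenerator from typing to collections.abc, where they have been subscriptable since Python 3.9. A small illustrative sketch of that pattern follows; the function names are placeholders, not vLLM APIs.

    import asyncio
    from collections.abc import AsyncGenerator, Iterable, Mapping


    def count_tokens(batches: Iterable[list[int]]) -> dict[str, int]:
        # Was: Iterable and Dict imported from typing.
        return {f"req-{i}": len(batch) for i, batch in enumerate(batches)}


    def best_metric(stats: Mapping[str, float]) -> tuple[str, float]:
        # Was: Mapping and Tuple imported from typing.
        return max(stats.items(), key=lambda kv: kv[1])


    async def stream_tokens(limit: int) -> AsyncGenerator[int, None]:
        # Was: typing.AsyncGenerator.
        for token_id in range(limit):
            yield token_id


    async def main() -> None:
        print(count_tokens([[1, 2, 3], [4]]))            # {'req-0': 3, 'req-1': 1}
        print(best_metric({"ttft": 0.2, "tpot": 0.05}))  # ('ttft', 0.2)
        print([t async for t in stream_tokens(3)])       # [0, 1, 2]


    asyncio.run(main())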
@@ -143,7 +143,7 @@ def flashinfer_sample( probs: torch.Tensor, k: Optional[torch.Tensor], p: Optional[torch.Tensor], - generators: Dict[int, torch.Generator], + generators: dict[int, torch.Generator], ) -> torch.Tensor: """Sample from the probabilities using FlashInfer. diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 2e3927345eb5f..80a4b24186ab7 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List import torch import torch.nn as nn @@ -54,7 +53,7 @@ def __init__(self): else: self.forward_method = self.forward_native - def forward(self, draft_token_ids: List[List[int]], + def forward(self, draft_token_ids: list[list[int]], target_probs: torch.Tensor, sampling_metadata: SamplingMetadata) -> SamplerOutput: if not sampling_metadata.all_greedy: @@ -66,7 +65,7 @@ def forward(self, draft_token_ids: List[List[int]], def flashinfer_sample( self, - draft_token_ids: List[List[int]], + draft_token_ids: list[list[int]], target_probs: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: @@ -119,7 +118,7 @@ def flashinfer_sample( # TODO: The following method can be optimized for better performance. def forward_native( self, - draft_token_ids: List[List[int]], + draft_token_ids: list[list[int]], target_probs: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py index 09d382638bffd..46818977dae58 100644 --- a/vllm/v1/stats/common.py +++ b/vllm/v1/stats/common.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from dataclasses import field as dataclass_field from enum import IntEnum -from typing import ClassVar, Dict, List, Optional, Set +from typing import ClassVar, Optional import msgspec from msgspec import field as msgspec_field @@ -78,7 +78,7 @@ class Type(IntEnum): ▼ FINISHED (All could go to FINISHED) """ - _VALID_TRANSITIONS: ClassVar[Dict[Type, Set[Type]]] = { + _VALID_TRANSITIONS: ClassVar[dict[Type, set[Type]]] = { Type.ARRIVED: { Type.INPUT_PROCESSED, Type.FINISHED, @@ -140,7 +140,7 @@ class Type(IntEnum): finish_reason: Optional[str] = None # Non-optional fields for each update type. - _REQUIRED_FIELDS: ClassVar[Dict[Type, List[str]]] = { + _REQUIRED_FIELDS: ClassVar[dict[Type, list[str]]] = { Type.INPUT_PROCESSED: ["num_prompt_tokens", "sampling_params"], Type.PREFILLING: ["num_computed_tokens", "num_cached_tokens"], Type.DETOKENIZED: ["num_new_tokens"], @@ -218,13 +218,13 @@ class RequestStats: # 2. the request was preempted and resumed. It is equivalent to running # a prefill of the original prefill tokens + generated output tokens # before preemption. - prefill_start_ts_s_lst: List[float] = dataclass_field(default_factory=list) + prefill_start_ts_s_lst: list[float] = dataclass_field(default_factory=list) # A list of timestamps when a token is decoded by the engine core. - decoding_ts_s_lst: List[float] = dataclass_field(default_factory=list) + decoding_ts_s_lst: list[float] = dataclass_field(default_factory=list) # A sorted list of timestamps for each output token. - output_token_ts_s_lst: List[float] = dataclass_field(default_factory=list) + output_token_ts_s_lst: list[float] = dataclass_field(default_factory=list) # First token's timestamp. first_token_ts_s: Optional[float] = None @@ -241,7 +241,7 @@ class RequestStats: # metric to measure the impact of preemption other than observation of # large P99 TPOT. 
Ideally we could quantify the impact of preemption by # measuring the number of tokens re-computed due to preemption. - preempted_ts_s_lst: List[float] = dataclass_field(default_factory=list) + preempted_ts_s_lst: list[float] = dataclass_field(default_factory=list) # Timestamp when the request was finished at the engine core. finished_ts_s: Optional[float] = None @@ -308,7 +308,7 @@ def decode_latency_s(self) -> Optional[float]: return self.e2e_latency_s - self.first_token_latency_s @property - def output_token_latency_s_lst(self) -> List[float]: + def output_token_latency_s_lst(self) -> list[float]: if len(self.output_token_ts_s_lst) == 0: return [] latency_s_lst = [] @@ -442,7 +442,7 @@ class EngineCoreStatsSnapshot( default_factory=SchedulerStats) # Per request stats updates. - requests_stats_updates: List[RequestStatsUpdate] = msgspec_field( + requests_stats_updates: list[RequestStatsUpdate] = msgspec_field( default_factory=list) # Engine core's queue stats. diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 62271255b0c05..8e1fb18cca05b 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -5,8 +5,8 @@ import weakref from collections import defaultdict from collections.abc import Sequence -from typing import (TYPE_CHECKING, Any, Callable, Dict, Generic, List, - Optional, TypeVar, Union, overload) +from typing import (TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar, + Union, overload) import torch @@ -24,7 +24,7 @@ class ConstantList(Generic[T], Sequence): - def __init__(self, x: List[T]) -> None: + def __init__(self, x: list[T]) -> None: self._x = x def append(self, item): @@ -57,10 +57,10 @@ def __getitem__(self, item: int) -> T: ... @overload - def __getitem__(self, s: slice, /) -> List[T]: + def __getitem__(self, s: slice, /) -> list[T]: ... - def __getitem__(self, item: Union[int, slice]) -> Union[T, List[T]]: + def __getitem__(self, item: Union[int, slice]) -> Union[T, list[T]]: return self._x[item] @overload @@ -71,7 +71,7 @@ def __setitem__(self, item: int, value: T): def __setitem__(self, s: slice, value: T, /): ... 
- def __setitem__(self, item: Union[int, slice], value: Union[T, List[T]]): + def __setitem__(self, item: Union[int, slice], value: Union[T, list[T]]): raise Exception("Cannot set item in a constant list") def __delitem__(self, item): @@ -99,7 +99,7 @@ def __init__( output_path: str, process_name: str, target_fn: Callable, - process_kwargs: Dict[Any, Any], + process_kwargs: dict[Any, Any], ): context = get_mp_context() reader, writer = context.Pipe(duplex=False) @@ -146,9 +146,9 @@ def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str): def bind_kv_cache( - kv_caches: Dict[str, torch.Tensor], - forward_context: Dict[str, "Attention"], - runner_kv_caches: List[torch.Tensor], + kv_caches: dict[str, torch.Tensor], + forward_context: dict[str, "Attention"], + runner_kv_caches: list[torch.Tensor], ) -> None: """ Bind the allocated KV cache to both ModelRunner and forward context so diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 669175f5d9c3d..37859b02ee7f3 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import numpy as np import torch @@ -44,7 +42,7 @@ def append_row( self, row_idx: int, start: int, - block_ids: List[int], + block_ids: list[int], ) -> None: if not block_ids: return @@ -52,7 +50,7 @@ def append_row( self.block_table_np[row_idx, start:start + num_blocks] = block_ids self.num_blocks_per_row[row_idx] = start + num_blocks - def add_row(self, row_idx: int, block_ids: List[int]) -> None: + def add_row(self, row_idx: int, block_ids: list[int]) -> None: self.append_row(row_idx, 0, block_ids) def move_row(self, src: int, tgt: int) -> None: diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index e4e6b88245d0d..f37804a60742b 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -2,7 +2,7 @@ # Datastructures defining an input batch from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, cast +from typing import TYPE_CHECKING, Optional, cast import numpy as np import torch @@ -24,16 +24,16 @@ class CachedRequestState: req_id: str - prompt_token_ids: List[int] + prompt_token_ids: list[int] prompt: Optional[str] - mm_inputs: List[MultiModalKwargs] - mm_positions: List["PlaceholderRange"] + mm_inputs: list[MultiModalKwargs] + mm_positions: list["PlaceholderRange"] sampling_params: SamplingParams generator: Optional[torch.Generator] - block_ids: List[int] + block_ids: list[int] num_computed_tokens: int - output_token_ids: List[int] + output_token_ids: list[int] mrope_positions: Optional[torch.Tensor] = None mrope_position_delta: Optional[int] = None @@ -63,8 +63,8 @@ def __init__( self.pin_memory = pin_memory self.vocab_size = vocab_size - self._req_ids: List[Optional[str]] = [] - self.req_id_to_index: Dict[str, int] = {} + self._req_ids: list[Optional[str]] = [] + self.req_id_to_index: dict[str, int] = {} # TODO(woosuk): This buffer could be too large if max_model_len is big. # Find a way to reduce the CPU memory usage. 
@@ -100,8 +100,8 @@ def __init__( device="cpu", pin_memory=pin_memory) self.temperature_cpu = self.temperature_cpu_tensor.numpy() - self.greedy_reqs: Set[str] = set() - self.random_reqs: Set[str] = set() + self.greedy_reqs: set[str] = set() + self.random_reqs: set[str] = set() self.top_p = torch.empty((max_num_reqs, ), dtype=torch.float32, @@ -111,7 +111,7 @@ def __init__( device="cpu", pin_memory=pin_memory) self.top_p_cpu = self.top_p_cpu_tensor.numpy() - self.top_p_reqs: Set[str] = set() + self.top_p_reqs: set[str] = set() self.top_k = torch.empty((max_num_reqs, ), dtype=torch.int32, @@ -121,7 +121,7 @@ def __init__( device="cpu", pin_memory=pin_memory) self.top_k_cpu = self.top_k_cpu_tensor.numpy() - self.top_k_reqs: Set[str] = set() + self.top_k_reqs: set[str] = set() self.min_p = torch.empty((max_num_reqs, ), dtype=torch.float32, @@ -131,7 +131,7 @@ def __init__( device="cpu", pin_memory=pin_memory) self.min_p_cpu = self.min_p_cpu_tensor.numpy() - self.min_p_reqs: Set[str] = set() + self.min_p_reqs: set[str] = set() # Frequency penalty related data structures self.frequency_penalties = torch.empty((max_num_reqs, ), @@ -144,7 +144,7 @@ def __init__( pin_memory=pin_memory) self.frequency_penalties_cpu = \ self.frequency_penalties_cpu_tensor.numpy() - self.frequency_penalties_reqs: Set[str] = set() + self.frequency_penalties_reqs: set[str] = set() # Presence penalty related data structures self.presence_penalties = torch.empty((max_num_reqs, ), @@ -156,7 +156,7 @@ def __init__( pin_memory=pin_memory) self.presence_penalties_cpu = self.presence_penalties_cpu_tensor.numpy( ) - self.presence_penalties_reqs: Set[str] = set() + self.presence_penalties_reqs: set[str] = set() # Repetition penalty related data structures self.repetition_penalties = torch.empty((max_num_reqs, ), @@ -169,43 +169,43 @@ def __init__( pin_memory=pin_memory) self.repetition_penalties_cpu = \ self.repetition_penalties_cpu_tensor.numpy() - self.repetition_penalties_reqs: Set[str] = set() + self.repetition_penalties_reqs: set[str] = set() # req_index -> (min_tokens, stop_token_ids) - self.min_tokens: Dict[int, Tuple[int, Set[int]]] = {} + self.min_tokens: dict[int, tuple[int, set[int]]] = {} # lora related self.request_lora_mapping = np.zeros((self.max_num_reqs, ), dtype=np.int32) - self.lora_id_to_request_ids: Dict[int, Set[str]] = {} - self.lora_id_to_lora_request: Dict[int, LoRARequest] = {} + self.lora_id_to_request_ids: dict[int, set[str]] = {} + self.lora_id_to_lora_request: dict[int, LoRARequest] = {} # req_index -> generator # NOTE(woosuk): The indices of the requests that do not have their own # generator should not be included in the dictionary. - self.generators: Dict[int, torch.Generator] = {} + self.generators: dict[int, torch.Generator] = {} - self.num_logprobs: Dict[str, int] = {} + self.num_logprobs: dict[str, int] = {} # NOTE(rob): num_prompt_logprobs only includes reqs # that are currently in the prefill phase. 
- self.num_prompt_logprobs: Dict[str, int] = {} + self.num_prompt_logprobs: dict[str, int] = {} - self.logit_bias: List[Optional[Dict[int, + self.logit_bias: list[Optional[dict[int, float]]] = [None] * max_num_reqs - self.has_allowed_token_ids: Set[str] = set() + self.has_allowed_token_ids: set[str] = set() self.allowed_token_ids_mask: Optional[torch.Tensor] = None self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None - self.req_output_token_ids: List[Optional[List[int]]] = [] + self.req_output_token_ids: list[Optional[list[int]]] = [] # This is updated each time the batch constituents change. self.sampling_metadata = self._make_sampling_metadata() @property - def req_ids(self) -> List[str]: + def req_ids(self) -> list[str]: # None elements should only be present transiently # while performing state updates to the batch. - return cast(List[str], self._req_ids) + return cast(list[str], self._req_ids) def add_request( self, @@ -356,7 +356,7 @@ def remove_request(self, req_id: str) -> Optional[int]: self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False) return req_index - def condense(self, empty_req_indices: List[int]) -> None: + def condense(self, empty_req_indices: list[int]) -> None: num_reqs = self.num_reqs if num_reqs == 0: # The batched states are empty. @@ -489,7 +489,7 @@ def _make_sampling_metadata(self) -> SamplingMetadata: frequency_penalties=self.frequency_penalties[:num_reqs], presence_penalties=self.presence_penalties[:num_reqs], repetition_penalties=self.repetition_penalties[:num_reqs], - output_token_ids=cast(List[List[int]], self.req_output_token_ids), + output_token_ids=cast(list[list[int]], self.req_output_token_ids), min_tokens=self.min_tokens, no_penalties=self.no_penalties, logit_bias=self.logit_bias[:num_reqs], @@ -516,7 +516,7 @@ def _make_prompt_token_ids_tensor(self) -> torch.Tensor: def make_lora_inputs( self, num_scheduled_tokens: np.ndarray - ) -> Tuple[Tuple[int, ...], Tuple[int, ...], Set[LoRARequest]]: + ) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]: """ Given the num_scheduled_tokens for each request in the batch, return datastructures used to activate the current LoRAs. @@ -532,7 +532,7 @@ def make_lora_inputs( prompt_lora_mapping = tuple(req_lora_mapping) token_lora_mapping = tuple( req_lora_mapping.repeat(num_scheduled_tokens)) - active_lora_requests: Set[LoRARequest] = set( + active_lora_requests: set[LoRARequest] = set( self.lora_id_to_lora_request.values()) return prompt_lora_mapping, token_lora_mapping, active_lora_requests diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4d0ae9a205a15..d7c97240c2880 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2,7 +2,7 @@ import gc import time -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Union import numpy as np import torch @@ -114,9 +114,9 @@ def __init__( # Lazy initialization # self.model: nn.Module # Set after load_model - self.kv_caches: List[torch.Tensor] = [] + self.kv_caches: list[torch.Tensor] = [] # req_id -> (input_id -> encoder_output) - self.encoder_cache: Dict[str, Dict[int, torch.Tensor]] = {} + self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} # Set up speculative decoding. self.use_spec_decode = False @@ -137,7 +137,7 @@ def __init__( ) # Request states. - self.requests: Dict[str, CachedRequestState] = {} + self.requests: dict[str, CachedRequestState] = {} # Persistent batch. 
self.input_batch = InputBatch( max_num_reqs=self.max_num_reqs, @@ -253,7 +253,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # then resubmitted with the same ID. In this case, we treat them as two # distinct requests - clearing the cached states for the first request # and handling the second as a new request. - removed_req_indices: List[int] = [] + removed_req_indices: list[int] = [] for req_id in scheduler_output.finished_req_ids: req_index = self.input_batch.remove_request(req_id) if req_index is not None: @@ -284,7 +284,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: assert req_index is not None removed_req_indices.append(req_index) - req_ids_to_add: List[str] = [] + req_ids_to_add: list[str] = [] # Add new requests to the cached states. for new_req_data in scheduler_output.scheduled_new_reqs: req_id = new_req_data.req_id @@ -427,7 +427,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: def _prepare_inputs( self, scheduler_output: "SchedulerOutput", - ) -> Tuple[FlashAttentionMetadata, torch.Tensor]: + ) -> tuple[FlashAttentionMetadata, torch.Tensor]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 num_reqs = self.input_batch.num_reqs @@ -782,8 +782,8 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): return # Batch the multi-modal inputs. - mm_inputs: List[MultiModalKwargs] = [] - req_input_ids: List[Tuple[str, int]] = [] + mm_inputs: list[MultiModalKwargs] = [] + req_input_ids: list[tuple[str, int]] = [] for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): req_state = self.requests[req_id] for input_id in encoder_input_ids: @@ -827,8 +827,8 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): def _gather_encoder_outputs( self, scheduler_output: "SchedulerOutput", - ) -> List[torch.Tensor]: - encoder_outputs: List[torch.Tensor] = [] + ) -> list[torch.Tensor]: + encoder_outputs: list[torch.Tensor] = [] for req_id in self.input_batch.req_ids: num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ req_id] @@ -1030,10 +1030,10 @@ def execute_model( def generate_draft_token_ids( self, - sampled_token_ids: List[List[int]], - ) -> List[List[int]]: + sampled_token_ids: list[list[int]], + ) -> list[list[int]]: # TODO(woosuk): Optimize. - draft_token_ids: List[List[int]] = [] + draft_token_ids: list[list[int]] = [] for i, sampled_ids in enumerate(sampled_token_ids): num_sampled_ids = len(sampled_ids) if not num_sampled_ids: @@ -1077,12 +1077,12 @@ def _get_prompt_logprobs_dict( self, hidden_states: torch.Tensor, scheduler_output: "SchedulerOutput", - ) -> Dict[str, LogprobsTensors]: + ) -> dict[str, LogprobsTensors]: num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs if not num_prompt_logprobs_dict: return {} - prompt_logprobs_dict: Dict[str, LogprobsTensors] = {} + prompt_logprobs_dict: dict[str, LogprobsTensors] = {} # Since prompt logprobs are a rare feature, prioritize simple, # maintainable loop over optimal performance. 
@@ -1283,7 +1283,7 @@ def profile_run(self) -> None: num_tokens = self.max_num_tokens min_tokens_per_req: int = num_tokens // num_reqs - num_scheduled_tokens_list: List[int] = [min_tokens_per_req] * num_reqs + num_scheduled_tokens_list: list[int] = [min_tokens_per_req] * num_reqs num_scheduled_tokens_list[-1] += num_tokens % num_reqs assert sum(num_scheduled_tokens_list) == num_tokens assert len(num_scheduled_tokens_list) == num_reqs @@ -1372,7 +1372,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: "Hybrid models with more than one KV cache type are not " "supported yet.") - kv_caches: Dict[str, torch.Tensor] = {} + kv_caches: dict[str, torch.Tensor] = {} for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items(): tensor_config = kv_cache_config.tensors[layer_name] diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index f681925f557eb..cc6268d6569ba 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -2,7 +2,7 @@ """A GPU worker class.""" import gc import os -from typing import TYPE_CHECKING, Optional, Set +from typing import TYPE_CHECKING, Optional import torch import torch.distributed @@ -243,7 +243,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.model_runner.remove_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self.model_runner.list_loras() def pin_lora(self, lora_id: int) -> bool: diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 731e758e6e74c..f34aacacf3edc 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -4,7 +4,6 @@ """ from contextlib import contextmanager -from typing import Set, Tuple import numpy as np import torch.nn as nn @@ -57,9 +56,9 @@ def load_lora_model(self, model: nn.Module, model_config: ModelConfig, ) return self.lora_manager.create_lora_manager(model) - def _set_active_loras(self, prompt_lora_mapping: Tuple[int, ...], - token_lora_mapping: Tuple[int, ...], - lora_requests: Set[LoRARequest]) -> None: + def _set_active_loras(self, prompt_lora_mapping: tuple[int, ...], + token_lora_mapping: tuple[int, ...], + lora_requests: set[LoRARequest]) -> None: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -74,10 +73,10 @@ def _set_active_loras(self, prompt_lora_mapping: Tuple[int, ...], def set_active_loras(self, input_batch: InputBatch, num_scheduled_tokens: np.ndarray) -> None: - prompt_lora_mapping: Tuple[int, ...] # of size input_batch.num_reqs - token_lora_mapping: Tuple[int, + prompt_lora_mapping: tuple[int, ...] # of size input_batch.num_reqs + token_lora_mapping: tuple[int, ...] 
# of size np.sum(num_scheduled_tokens) - lora_requests: Set[LoRARequest] + lora_requests: set[LoRARequest] prompt_lora_mapping, token_lora_mapping, lora_requests = \ input_batch.make_lora_inputs(num_scheduled_tokens) return self._set_active_loras(prompt_lora_mapping, token_lora_mapping, @@ -105,7 +104,7 @@ def maybe_profile_with_lora(self, lora_config: LoRAConfig, num_scheduled_tokens) # Make dummy lora requests - lora_requests: Set[LoRARequest] = { + lora_requests: set[LoRARequest] = { LoRARequest(lora_name=f"warmup_{lora_id}", lora_int_id=lora_id, lora_path="/not/a/real/path") @@ -143,7 +142,7 @@ def pin_lora(self, lora_id: int) -> bool: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.pin_adapter(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.list_adapters() \ No newline at end of file diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index f7d72d26e0454..5f9d072a3f1fe 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -2,7 +2,7 @@ import enum import time from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Optional from unittest.mock import patch import numpy as np @@ -50,9 +50,9 @@ def is_prefill(self) -> bool: @dataclass class PromptDecodeInfo: - prompt_req_ids: List[str] - decode_req_ids: List[str] - prompt_scheduled_tokens: List[int] + prompt_req_ids: list[str] + decode_req_ids: list[str] + prompt_scheduled_tokens: list[int] @dataclass @@ -126,13 +126,13 @@ def __init__( ) # Request states. - self.requests: Dict[str, CachedRequestState] = {} + self.requests: dict[str, CachedRequestState] = {} # req_id -> (input_id -> encoder_output) - self.encoder_cache: Dict[str, Dict[int, torch.Tensor]] = {} + self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} # KV caches for forward pass - self.kv_caches: List[Tuple[torch.Tensor, torch.Tensor]] = [] + self.kv_caches: list[tuple[torch.Tensor, torch.Tensor]] = [] # Cached torch/numpy tensors self.num_swaps = 2 @@ -204,7 +204,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: # then resubmitted with the same ID. In this case, we treat them as two # distinct requests - clearing the cached states for the first request # and handling the second as a new request. - removed_req_indices: List[int] = [] + removed_req_indices: list[int] = [] for req_id in scheduler_output.finished_req_ids: req_index = self.input_batch.remove_request(req_id) if req_index is not None: @@ -227,7 +227,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: assert req_index is not None removed_req_indices.append(req_index) - req_ids_to_add: List[str] = [] + req_ids_to_add: list[str] = [] # Add new requests to the cached states. for new_req_data in scheduler_output.scheduled_new_reqs: req_id = new_req_data.req_id @@ -488,7 +488,7 @@ def _prepare_prompt(self, req_index: int, def _prepare_decode( self, - decode_req_ids: List[str], + decode_req_ids: list[str], ) -> DecodeData: # Batch size batch_size = len(decode_req_ids) @@ -685,7 +685,7 @@ def execute_model( # Create output. 
all_req_ids = pd_info.decode_req_ids + pd_info.prompt_req_ids - prompt_logprobs_dict: Dict[str, Optional[LogprobsTensors]] = {} + prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {} for req_id in all_req_ids: prompt_logprobs_dict[req_id] = None @@ -923,7 +923,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: "Hybrid models with more than one KV cache type are not " "supported yet.") - kv_caches: Dict[str, torch.Tensor] = {} + kv_caches: dict[str, torch.Tensor] = {} for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items(): tensor_config = kv_cache_config.tensors[layer_name] @@ -960,7 +960,7 @@ def forward( self, token_ids: torch.Tensor, position_ids: torch.Tensor, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: list[tuple[torch.Tensor, torch.Tensor]], ) -> torch.Tensor: """Executes the forward pass of the model and samples the next token. diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index c236f263eddb2..a4684dfe04a03 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """A TPU worker class.""" import os -from typing import Dict, List, Optional +from typing import Optional import torch import torch.distributed @@ -103,7 +103,7 @@ def init_device(self): self.model_runner = TPUModelRunner(self.vllm_config, self.device) def determine_available_memory(self) -> int: - kv_caches: Dict[str, torch.Tensor] = {} + kv_caches: dict[str, torch.Tensor] = {} kv_cache_spec = self.model_runner.get_kv_cache_spec() for layer_name, layer_spec in kv_cache_spec.items(): if isinstance(layer_spec, FullAttentionSpec): @@ -118,7 +118,7 @@ def determine_available_memory(self) -> int: else: raise NotImplementedError - runner_kv_caches: List[torch.Tensor] = [] + runner_kv_caches: list[torch.Tensor] = [] bind_kv_cache( kv_caches, self.vllm_config.compilation_config.static_forward_context, diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 3960392cf74ef..9c6e6dc74280f 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 """CacheEngine class for managing the KV cache.""" -from typing import List import numpy as np import torch @@ -74,12 +73,12 @@ def _allocate_kv_cache( self, num_blocks: int, device: str, - ) -> List[torch.Tensor]: + ) -> list[torch.Tensor]: """Allocates KV cache on the specified device.""" kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size) pin_memory = is_pin_memory_available() if device == "cpu" else False - kv_cache: List[torch.Tensor] = [] + kv_cache: list[torch.Tensor] = [] # Align entries so they are 256 byte aligned for better performance # Primarily targets MLA as this typically only ends up having entries diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index ac7c93e48395d..85afa979f1993 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast +from typing import TYPE_CHECKING, Any, Optional, cast import torch @@ -31,7 +31,7 @@ class EncoderDecoderModelInputForCPU(ModelInputForCPUWithSamplingMetadata): encoder_input_tokens: Optional[torch.Tensor] = None encoder_input_positions: Optional[torch.Tensor] = None - def 
as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -47,7 +47,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "EncoderDecoderModelInputForCPU": return cast( @@ -57,19 +57,19 @@ def from_broadcasted_tensor_dict( class CPUEncoderDecoderModelRunner( CPUModelRunnerBase[EncoderDecoderModelInputForCPU]): - _model_input_cls: Type[EncoderDecoderModelInputForCPU] = ( + _model_input_cls: type[EncoderDecoderModelInputForCPU] = ( EncoderDecoderModelInputForCPU) - _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + _builder_cls: type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder def _list_to_int32_tensor( self, - _list: List[int], + _list: list[int], ) -> torch.Tensor: return torch.tensor(_list, dtype=torch.int32, device=self.device) def _list_to_long_tensor( self, - _list: List[int], + _list: list[int], ) -> torch.Tensor: return torch.tensor(_list, dtype=torch.long, device=self.device) @@ -80,7 +80,7 @@ def _empty_long_tensor(self) -> torch.Tensor: return self._list_to_long_tensor([]) def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, + self, tensor_dict: dict[str, Any]) -> EncoderDecoderModelInputForCPU: return EncoderDecoderModelInputForCPU.from_broadcasted_tensor_dict( tensor_dict, @@ -89,9 +89,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None + finished_requests_ids: Optional[list[str]] = None ) -> EncoderDecoderModelInputForCPU: model_input = self._prepare_model_input_tensors( seq_group_metadata_list, finished_requests_ids) @@ -120,9 +120,9 @@ def prepare_model_input( def _prepare_encoder_model_input_tensors( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], model_input: EncoderDecoderModelInputForCPU, - ) -> Tuple[AttentionMetadata, Optional[torch.Tensor], + ) -> tuple[AttentionMetadata, Optional[torch.Tensor], Optional[torch.Tensor]]: """Helper method to prepare the encoder- and cross-attn-related model inputs based on a given sequence group. These additional inputs @@ -167,7 +167,7 @@ def _prepare_encoder_model_input_tensors( is_prompt = seq_group_metadata_list[0].is_prompt # Build encoder inputs - encoder_seq_lens: List[int] = [] + encoder_seq_lens: list[int] = [] if is_prompt: # Prefill phase. 
cross_block_tables = self._empty_int32_tensor().view( @@ -279,10 +279,10 @@ def _prepare_encoder_model_input_tensors( def execute_model( self, model_input: EncoderDecoderModelInputForCPU, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: if num_steps > 1: raise ValueError( "CPU worker does not support multi-step execution.") diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 8407f073040ee..bb0de9fff0ee1 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -4,8 +4,7 @@ import weakref from collections import defaultdict from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Type, - TypeVar, Union) +from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union import torch from torch import nn @@ -53,13 +52,13 @@ class ModelInputForCPU(ModelRunnerInputBase): attn_metadata: Optional["AttentionMetadata"] = None multi_modal_kwargs: Optional[BatchedTensorInputs] = None virtual_engine: Optional[int] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None + seq_lens: Optional[list[int]] = None + query_lens: Optional[list[int]] = None lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[Set[LoRARequest]] = None + lora_requests: Optional[set[LoRARequest]] = None def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: + self) -> dict[str, Union[int, torch.Tensor]]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -74,8 +73,8 @@ def as_broadcastable_tensor_dict( @classmethod def from_broadcasted_tensor_dict( - cls: Type[TModelInputForCPU], - tensor_dict: Dict[str, Any], + cls: type[TModelInputForCPU], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None ) -> TModelInputForCPU: if attn_backend is not None: @@ -92,7 +91,7 @@ class ModelInputForCPUWithSamplingMetadata(ModelInputForCPU): sampling_metadata: Optional["SamplingMetadata"] = None is_prompt: Optional[bool] = None - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -107,7 +106,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "ModelInputForCPUWithSamplingMetadata": tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) @@ -123,28 +122,28 @@ class ModelInputData: def __init__(self, use_mrope: bool): self.use_mrope = use_mrope - self.input_tokens: List[int] = [] - self.input_positions: List[int] = [] - self.token_type_ids: Optional[List[int]] = [] - self.seq_lens: List[int] = [] - self.query_lens: List[int] = [] - self.prefill_block_tables: List[List[int]] = [] - self.decode_block_tables: List[List[int]] = [] + self.input_tokens: list[int] = [] + self.input_positions: list[int] = [] + self.token_type_ids: Optional[list[int]] = [] + self.seq_lens: list[int] = [] + self.query_lens: list[int] = [] + self.prefill_block_tables: list[list[int]] = [] + self.decode_block_tables: list[list[int]] = [] self.max_decode_seq_len: int = 0 self.num_prefills: int = 0 
self.num_prefill_tokens: int = 0 self.num_decode_tokens: int = 0 - self.slot_mapping: List[int] = [] - self.multi_modal_inputs_list: List[MultiModalKwargs] = [] - self.multi_modal_placeholder_maps: Dict[ + self.slot_mapping: list[int] = [] + self.multi_modal_inputs_list: list[MultiModalKwargs] = [] + self.multi_modal_placeholder_maps: dict[ str, MultiModalPlaceholderMap] = defaultdict( MultiModalPlaceholderMap) - self.input_mrope_positions: List[List[int]] = [[] + self.input_mrope_positions: list[list[int]] = [[] for _ in range(3)] def __init__(self, runner: "CPUModelRunner", - finished_requests_ids: Optional[List[str]] = None) -> None: + finished_requests_ids: Optional[list[str]] = None) -> None: super().__init__() self.runner = runner self.chunked_prefill = (runner.scheduler_config.chunked_prefill_enabled @@ -162,8 +161,8 @@ def __init__(self, self.att_metadata_builder = attn_backend.get_builder_cls()(self) def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] + finished_requests_ids: Optional[list[str]] = None) -> None: + self.seq_group_metadata_list: list[SequenceGroupMetadata] = [] self.input_data = ModelInputForCPUBuilder.ModelInputData( self.runner.model_config.uses_mrope) self.att_metadata_builder.prepare() @@ -172,7 +171,7 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) def set_seq_group_list( - self, seq_group_metadata_list: List[SequenceGroupMetadata]): + self, seq_group_metadata_list: list[SequenceGroupMetadata]): self.seq_group_metadata_list = seq_group_metadata_list def build(self) -> ModelInputForCPU: @@ -411,7 +410,7 @@ def _compute_multi_modal_input(self, placeholder_map) def _prepare_lora_input( - self, seq_group_metadata_list: List[SequenceGroupMetadata], + self, seq_group_metadata_list: list[SequenceGroupMetadata], is_prefill: bool) -> LoRAMapping: index_mapping = [] prompt_mapping = [] @@ -433,8 +432,8 @@ class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): """ Helper class for shared methods between CPU model runners. """ - _model_input_cls: Type[TModelInputForCPU] - _builder_cls: Type[ModelInputForCPUBuilder] + _model_input_cls: type[TModelInputForCPU] + _builder_cls: type[ModelInputForCPUBuilder] builder: ModelInputForCPUBuilder def __init__( @@ -523,8 +522,8 @@ def get_model(self) -> nn.Module: def _prepare_model_input_tensors( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - finished_requests_ids: Optional[List[str]] = None + seq_group_metadata_list: list[SequenceGroupMetadata], + finished_requests_ids: Optional[list[str]] = None ) -> TModelInputForCPU: """Helper method to prepare the model input based on a given sequence group. 
Prepares metadata needed for the base model forward pass but not @@ -550,7 +549,7 @@ def remove_all_loras(self): raise RuntimeError("LoRA is not enabled.") self.lora_manager.remove_all_adapters() - def set_active_loras(self, lora_requests: Set[LoRARequest], + def set_active_loras(self, lora_requests: set[LoRARequest], lora_mapping: LoRAMapping) -> None: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -571,20 +570,20 @@ def pin_lora(self, lora_id: int) -> bool: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.pin_adapter(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.list_adapters() class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): - _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( + _model_input_cls: type[ModelInputForCPUWithSamplingMetadata] = ( ModelInputForCPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + _builder_cls: type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], ) -> ModelInputForCPUWithSamplingMetadata: return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict( # noqa: E501 tensor_dict, @@ -593,9 +592,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None + finished_requests_ids: Optional[list[str]] = None ) -> ModelInputForCPUWithSamplingMetadata: """Prepare the model input based on a given sequence group, including metadata for the sampling step. 
@@ -623,11 +622,11 @@ def prepare_model_input( def execute_model( self, model_input: ModelInputForCPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, previous_hidden_states: Optional[torch.Tensor] = None, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: if num_steps > 1: raise ValueError( "CPU worker does not support multi-step execution.") diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index 1ceb2557c6b3d..810d7373e302b 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Optional, Union import torch @@ -25,18 +25,18 @@ class ModelInputForCPUWithPoolingMetadata(ModelInputForCPU): class CPUPoolingModelRunner( CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]): - _model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = ( + _model_input_cls: type[ModelInputForCPUWithPoolingMetadata] = ( ModelInputForCPUWithPoolingMetadata) - _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + _builder_cls: type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder @torch.inference_mode() def execute_model( self, model_input: ModelInputForCPUWithPoolingMetadata, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: + ) -> Optional[Union[list[PoolerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( "CPU worker does not support multi-step execution.") @@ -72,7 +72,7 @@ def execute_model( def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: Dict[str, + tensor_dict: dict[str, Any]) -> ModelInputForCPUWithPoolingMetadata: return ModelInputForCPUWithPoolingMetadata.from_broadcasted_tensor_dict( tensor_dict, @@ -81,9 +81,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + seq_group_metadata_list: Optional[list[SequenceGroupMetadata]], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None + finished_requests_ids: Optional[list[str]] = None ) -> ModelInputForCPUWithPoolingMetadata: assert seq_group_metadata_list is not None model_input = self._prepare_model_input_tensors( @@ -99,17 +99,17 @@ def prepare_model_input( def _prepare_pooling( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - prompt_lens: List[int], + seq_group_metadata_list: list[SequenceGroupMetadata], + prompt_lens: list[int], ) -> PoolingMetadata: """Prepare PoolingMetadata for the sequence group metadata list.""" - seq_groups: List[Tuple[List[int], PoolingParams]] = [] + seq_groups: list[tuple[list[int], PoolingParams]] = [] for i, seq_group_metadata in enumerate(seq_group_metadata_list): seq_ids = list(seq_group_metadata.seq_data.keys()) pooling_params = seq_group_metadata.pooling_params seq_groups.append((seq_ids, pooling_params)) - seq_data: Dict[int, SequenceData] = {} + seq_data: dict[int, SequenceData] = {} for seq_group_metadata in seq_group_metadata_list: seq_data.update(seq_group_metadata.seq_data) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 
27b1a2dd1be8c..cb729af319d8d 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """A CPU worker class.""" -from typing import Dict, List, Optional, Set, Tuple, Type +from typing import Optional import torch import torch.distributed @@ -71,23 +71,23 @@ def __init__(self, cache_config: CacheConfig, model_config: ModelConfig, def _allocate_kv_cache( self, num_blocks: int, - ) -> List[torch.Tensor]: + ) -> list[torch.Tensor]: """Allocates KV cache on CPU.""" kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_heads, self.head_size) - kv_cache: List[torch.Tensor] = [] + kv_cache: list[torch.Tensor] = [] for _ in range(self.num_layers): kv_cache.append( torch.empty(kv_cache_shape, dtype=self.dtype, device="cpu")) return kv_cache - def swap_in(self, src_to_dst: Dict[int, int]) -> None: + def swap_in(self, src_to_dst: dict[int, int]) -> None: raise NotImplementedError("Swap is not supported in CPUCacheEngine.") - def swap_out(self, src_to_dst: Dict[int, int]) -> None: + def swap_out(self, src_to_dst: dict[int, int]) -> None: raise NotImplementedError("Swap is not supported in CPUCacheEngine.") - def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: + def copy(self, src_to_dsts: dict[int, list[int]]) -> None: self.attn_backend.copy_blocks(self.cpu_cache, src_to_dsts) @staticmethod @@ -129,7 +129,7 @@ def __init__( distributed_init_method: str, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - model_runner_cls: Optional[Type[CPUModelRunner]] = None, + model_runner_cls: Optional[type[CPUModelRunner]] = None, ) -> None: WorkerBase.__init__(self, vllm_config=vllm_config) @@ -163,7 +163,7 @@ def __init__( or (speculative_config.draft_model_config.hf_config.model_type not in ["medusa", "mlp_speculator", "eagle"]) \ else {"return_hidden_states": True} - ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner + ModelRunnerClass: type[CPUModelRunnerBase] = CPUModelRunner if self.model_config.runner_type == "pooling": ModelRunnerClass = CPUPoolingModelRunner elif self.model_config.is_encoder_decoder: @@ -178,9 +178,9 @@ def __init__( self.model_runner = model_runner_cls(self.model_runner) # Uninitialized cache engine. Will be initialized by # initialize_cache. - self.cache_engine: List[CPUCacheEngine] + self.cache_engine: list[CPUCacheEngine] # Initialize cpu_cache as pooling models don't initialize kv_caches - self.cpu_cache: Optional[List[List[torch.Tensor]]] = None + self.cpu_cache: Optional[list[list[torch.Tensor]]] = None # Torch profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace @@ -221,7 +221,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Determine the number of blocks available for the KV cache. 
This determines how many KV blocks can fit into the configured CPU @@ -276,7 +276,7 @@ def remove_lora(self, lora_id: int) -> bool: def pin_lora(self, lora_id: int) -> bool: return self.model_runner.pin_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self.model_runner.list_loras() def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None: @@ -324,7 +324,7 @@ def do_metadata_broadcast(self) -> bool: return self.parallel_config.tensor_parallel_size > 1 @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: + def kv_cache(self) -> Optional[list[list[torch.Tensor]]]: return self.cpu_cache @property diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 5f39f2fa4947c..48e9cf5f56562 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -2,7 +2,7 @@ import dataclasses import itertools -from typing import Any, Dict, List, Optional, Tuple, Type, cast +from typing import Any, Optional, cast import torch import torch.distributed @@ -44,7 +44,7 @@ class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata): encoder_input_tokens: Optional[torch.Tensor] = None encoder_input_positions: Optional[torch.Tensor] = None - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -63,7 +63,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "EncoderDecoderModelInput": return cast( @@ -72,9 +72,9 @@ def from_broadcasted_tensor_dict( class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): - _model_input_cls: Type[EncoderDecoderModelInput] = ( + _model_input_cls: type[EncoderDecoderModelInput] = ( EncoderDecoderModelInput) - _builder_cls: Type[ModelInputForGPUBuilder] = (ModelInputForGPUBuilder) + _builder_cls: type[ModelInputForGPUBuilder] = (ModelInputForGPUBuilder) def __init__( self, @@ -133,13 +133,13 @@ def raise_backend_err(): def _list_to_int32_tensor( self, - _list: List[int], + _list: list[int], ) -> torch.Tensor: return torch.tensor(_list, dtype=torch.int32, device=self.device) def _list_to_long_tensor( self, - _list: List[int], + _list: list[int], ) -> torch.Tensor: return torch.tensor(_list, dtype=torch.long, device=self.device) @@ -153,10 +153,10 @@ def _empty_long_tensor(self) -> torch.Tensor: def execute_model( self, model_input: EncoderDecoderModelInput, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[List[PoolerOutput]]: + ) -> Optional[list[PoolerOutput]]: if num_steps > 1: raise ValueError("num_steps > 1 is not supported in " "EncoderDecoderModelRunner") @@ -207,7 +207,7 @@ def execute_model( return [output] def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> EncoderDecoderModelInput: + self, tensor_dict: dict[str, Any]) -> EncoderDecoderModelInput: return EncoderDecoderModelInput.from_broadcasted_tensor_dict( tensor_dict, attn_backend=self.attn_backend, @@ -215,9 +215,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: 
list[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None + finished_requests_ids: Optional[list[str]] = None ) -> EncoderDecoderModelInput: """Prepare the model input based on a given sequence group, including metadata for the sampling step. @@ -270,7 +270,7 @@ def profile_run(self) -> None: # Profile memory usage with max_num_sequences sequences and the total # number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] + seqs: list[SequenceGroupMetadata] = [] max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( self.model_config) @@ -332,9 +332,9 @@ def profile_run(self) -> None: def _prepare_encoder_model_input_tensors( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], model_input: EncoderDecoderModelInput, - ) -> Tuple[AttentionMetadata, Optional[torch.Tensor], + ) -> tuple[AttentionMetadata, Optional[torch.Tensor], Optional[torch.Tensor]]: """Helper method to prepare the encoder- and cross-attn-related model inputs based on a given sequence group. These additional inputs @@ -379,7 +379,7 @@ def _prepare_encoder_model_input_tensors( is_prompt = seq_group_metadata_list[0].is_prompt # Build encoder inputs - encoder_seq_lens: List[int] = [] + encoder_seq_lens: list[int] = [] if is_prompt: # Prefill phase. cross_block_tables = self._empty_int32_tensor().view( diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d6eaf84e40f6b..57b916c2e2cdb 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -17,8 +17,8 @@ from array import array from dataclasses import dataclass, field from enum import IntEnum -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, - Optional, Set, Tuple, Type, TypeVar, Union) +from typing import (TYPE_CHECKING, Any, Callable, NamedTuple, Optional, + TypeVar, Union) import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc @@ -70,7 +70,7 @@ class Singleton(type): - _instances: Dict[type, object] = {} + _instances: dict[type, object] = {} def __call__(cls, *args, **kwargs): if cls not in cls._instances: @@ -80,18 +80,18 @@ def __call__(cls, *args, **kwargs): @dataclass class HPUBucketingGlobalState(metaclass=Singleton): - prompt_bs_bucket_cfg: Tuple[int, int, int] = field(init=False) - decode_bs_bucket_cfg: Tuple[int, int, int] = field(init=False) - prompt_seq_bucket_cfg: Tuple[int, int, int] = field(init=False) - decode_block_bucket_cfg: Tuple[int, int, int] = field(init=False) - prompt_buckets: List[Tuple[int, int]] = field(init=False) - decode_buckets: List[Tuple[int, int]] = field(init=False) + prompt_bs_bucket_cfg: tuple[int, int, int] = field(init=False) + decode_bs_bucket_cfg: tuple[int, int, int] = field(init=False) + prompt_seq_bucket_cfg: tuple[int, int, int] = field(init=False) + decode_block_bucket_cfg: tuple[int, int, int] = field(init=False) + prompt_buckets: list[tuple[int, int]] = field(init=False) + decode_buckets: list[tuple[int, int]] = field(init=False) def subtuple(obj: object, typename: str, - to_copy: List[str], - to_override: Optional[Dict[str, object]] = None): + to_copy: list[str], + to_override: Optional[dict[str, object]] = None): if obj is None: return None if to_override is None: @@ -123,7 +123,7 @@ def read_bucket_settings(phase: str, dim: str, **defaults): return values -def warmup_range(config: Tuple[int, int, int]): +def warmup_range(config: tuple[int, int, 
int]): """Generate a warmup range. Start from bmin and multiply by 2 until you reach bstep. @@ -225,7 +225,7 @@ def round_up(value: int, k: int): return (value + k - 1) // k * k -def find_bucket(value: int, config: Tuple[int, int, int]): +def find_bucket(value: int, config: tuple[int, int, int]): bmin, bstep, _ = config next_step = round_up(value, bstep) next_pow = next_pow2(value, bmin) @@ -406,16 +406,16 @@ def sample(self, *args, **kwargs): class PreparePromptMetadata(NamedTuple): input_tokens: torch.Tensor - input_positions: List[List[int]] + input_positions: list[list[int]] attn_metadata: Optional[AttentionMetadata] - seq_lens: List[int] - query_lens: List[int] - lora_index_mapping: List[List[int]] - lora_prompt_mapping: List[List[int]] - lora_requests: Set[LoRARequest] - multi_modal_kwargs: Optional[Dict[str, BatchedTensorInputs]] - slot_mapping: List[List[int]] - lora_ids: List[int] + seq_lens: list[int] + query_lens: list[int] + lora_index_mapping: list[list[int]] + lora_prompt_mapping: list[list[int]] + lora_requests: set[LoRARequest] + multi_modal_kwargs: Optional[dict[str, BatchedTensorInputs]] + slot_mapping: list[list[int]] + lora_ids: list[int] @classmethod def empty(cls): @@ -434,13 +434,13 @@ def empty(cls): class PrepareDecodeMetadata(NamedTuple): input_tokens: torch.Tensor - input_positions: List[List[int]] + input_positions: list[list[int]] attn_metadata: Optional[AttentionMetadata] - lora_index_mapping: List[List[int]] - lora_prompt_mapping: List[List[int]] - lora_requests: Set[LoRARequest] - slot_mapping: List[List[int]] - lora_ids: List[int] + lora_index_mapping: list[list[int]] + lora_prompt_mapping: list[list[int]] + lora_requests: set[LoRARequest] + slot_mapping: list[list[int]] + lora_ids: list[int] @classmethod def empty(cls): @@ -477,19 +477,19 @@ class ModelInputForHPU(ModelRunnerInputBase): """ input_tokens: Optional[torch.Tensor] = None input_positions: Optional[torch.Tensor] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None + seq_lens: Optional[list[int]] = None + query_lens: Optional[list[int]] = None lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[Set[LoRARequest]] = None + lora_requests: Optional[set[LoRARequest]] = None attn_metadata: Optional["AttentionMetadata"] = None - multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None + multi_modal_kwargs: Optional[dict[str, torch.Tensor]] = None real_batch_size: Optional[int] = None batch_size_padded: Optional[int] = None virtual_engine: int = 0 - lora_ids: Optional[List[int]] = None + lora_ids: Optional[list[int]] = None async_callback: Optional[Callable] = None - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -506,8 +506,8 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( - cls: Type[TModelInputForHPU], - tensor_dict: Dict[str, Any], + cls: type[TModelInputForHPU], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> TModelInputForHPU: if attn_backend is not None: @@ -526,7 +526,7 @@ class ModelInputForHPUWithSamplingMetadata(ModelInputForHPU): # used by the driver worker. 
is_prompt: Optional[bool] = None - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -543,7 +543,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "ModelInputForHPUWithSamplingMetadata": tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) @@ -558,7 +558,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): """ Helper class for shared methods between GPU model runners. """ - _model_input_cls: Type[TModelInputForHPU] + _model_input_cls: type[TModelInputForHPU] def __init__( self, @@ -754,7 +754,7 @@ def _setup_buckets(self) -> None: step=self.block_size, max=max(self.block_size, self.max_num_seqs * max_decode_seq // self.block_size)) - self.graphed_buckets: Set[Any] = set() + self.graphed_buckets: set[Any] = set() msg = ("Prompt bucket config (min, step, max_warmup) " f"bs:{self.bucketing_global_state.prompt_bs_bucket_cfg}, " @@ -768,20 +768,20 @@ def _setup_buckets(self) -> None: def _prepare_prompt( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], ) -> PreparePromptMetadata: - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - slot_mapping: List[List[int]] = [] - lora_index_mapping: List[List[int]] = [] - lora_prompt_mapping: List[List[int]] = [] - lora_requests: Set[LoRARequest] = set() - - seq_lens: List[int] = [] - context_lens: List[int] = [] - query_lens: List[int] = [] - prefix_block_tables: List[List[int]] = [] - multi_modal_kwargs_list: List[MultiModalKwargs] = [] + input_tokens: list[list[int]] = [] + input_positions: list[list[int]] = [] + slot_mapping: list[list[int]] = [] + lora_index_mapping: list[list[int]] = [] + lora_prompt_mapping: list[list[int]] = [] + lora_requests: set[LoRARequest] = set() + + seq_lens: list[int] = [] + context_lens: list[int] = [] + query_lens: list[int] = [] + prefix_block_tables: list[list[int]] = [] + multi_modal_kwargs_list: list[MultiModalKwargs] = [] if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() @@ -885,7 +885,7 @@ def _prepare_prompt( self.bucketing_global_state.prompt_seq_bucket_cfg), self.block_size) - lora_ids: List[int] = [] + lora_ids: list[int] = [] for seq_group_metadata, context_len in zip(seq_group_metadata_list, context_lens): lora_id = seq_group_metadata.lora_int_id @@ -959,20 +959,20 @@ def _prepare_prompt( def _prepare_decode( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], ) -> PrepareDecodeMetadata: - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - slot_mapping: List[List[int]] = [] - seq_lens: List[int] = [] - block_tables: List[List[int]] = [] - lora_index_mapping: List[List[int]] = [] - lora_prompt_mapping: List[List[int]] = [] - lora_requests: Set[LoRARequest] = set() + input_tokens: list[list[int]] = [] + input_positions: list[list[int]] = [] + slot_mapping: list[list[int]] = [] + seq_lens: list[int] = [] + block_tables: list[list[int]] = [] + lora_index_mapping: list[list[int]] = [] + lora_prompt_mapping: list[list[int]] = [] + lora_requests: set[LoRARequest] = set() if len(seq_group_metadata_list) == 0: return 
PrepareDecodeMetadata.empty() - lora_ids: List[int] = [] + lora_ids: list[int] = [] dummy_slots = itertools.cycle( range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size)) @@ -1051,7 +1051,7 @@ def _prepare_decode( block_bucket_size = find_bucket( block_bucket_size, self.bucketing_global_state.decode_block_bucket_cfg) - indices: List[Any] + indices: list[Any] indices = [None] * block_bucket_size for i, bid in enumerate(block_list): indices[bid] = i @@ -1113,8 +1113,8 @@ def _prepare_decode( def prepare_input_tensors( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[TModelInputForHPU, SamplingMetadata]: + seq_group_metadata_list: list[SequenceGroupMetadata], + ) -> tuple[TModelInputForHPU, SamplingMetadata]: if len(seq_group_metadata_list) == 0: return self._model_input_cls(), None @@ -1366,8 +1366,8 @@ def warmup_scenario(self, # that will have unique loras, an therefore the max amount of memory # consumption create dummy lora request copies from the lora request # passed in, which contains a lora from the lora warmup path. - dummy_lora_requests: List[LoRARequest] = [] - dummy_lora_requests_per_seq: List[LoRARequest] = [] + dummy_lora_requests: list[LoRARequest] = [] + dummy_lora_requests_per_seq: list[LoRARequest] = [] if self.lora_config and is_lora_profile_run: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): @@ -1431,7 +1431,7 @@ def remove_all_loras(self): raise RuntimeError("LoRA is not enabled.") self.lora_manager.remove_all_adapters() - def set_active_loras(self, lora_requests: Set[LoRARequest], + def set_active_loras(self, lora_requests: set[LoRARequest], lora_mapping: LoRAMapping) -> None: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -1452,7 +1452,7 @@ def pin_lora(self, lora_id: int) -> bool: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.pin_adapter(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.list_adapters() @@ -1486,8 +1486,8 @@ def warmup_graphs(self, idx = 0 phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' num_candidates = len(buckets) - ordering : Union[Callable[[Any], Tuple[Any, Any]], \ - Callable[[Any], Tuple[Any, Any, Any]]] + ordering : Union[Callable[[Any], tuple[Any, Any]], \ + Callable[[Any], tuple[Any, Any, Any]]] if strategy == 'min_tokens': ordering = lambda b: (b[0] * b[1], b[1], b[0]) elif strategy == 'max_bs': @@ -1533,7 +1533,7 @@ def log_graph_warmup_summary(self, buckets, is_prompt, total_mem): logger.info(msg) @torch.inference_mode() - def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: + def warmup_model(self, kv_caches: list[torch.Tensor]) -> None: if profile := os.environ.get('VLLM_PT_PROFILE', None): phase, bs, seq_len, graph = profile.split('_') is_prompt = phase == 'prompt' @@ -1805,12 +1805,12 @@ class HPUModelRunner(HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): """ GPU model runner with sampling step. 
""" - _model_input_cls: Type[ModelInputForHPUWithSamplingMetadata] = ( + _model_input_cls: type[ModelInputForHPUWithSamplingMetadata] = ( ModelInputForHPUWithSamplingMetadata) def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], ) -> ModelInputForHPUWithSamplingMetadata: return ( ModelInputForHPUWithSamplingMetadata.from_broadcasted_tensor_dict( @@ -1821,9 +1821,9 @@ def make_model_input_from_broadcasted_tensor_dict( @torch.inference_mode() def prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None + finished_requests_ids: Optional[list[str]] = None ) -> ModelInputForHPUWithSamplingMetadata: """Prepare the model input based on a given sequence group, including metadata for the sampling step. @@ -1862,7 +1862,7 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", phase, batch_size, seq_len) - def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int], + def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: list[int], is_prompt: bool): ''' This is a helper function to create the mask for lora computations. @@ -1936,11 +1936,11 @@ def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int], def execute_model( self, model_input: ModelInputForHPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, warmup_mode=False, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: + ) -> Optional[Union[list[SamplerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( "num_steps > 1 is not supported in HPUModelRunner") diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index ccb175d88fd3c..51f297bc9745e 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -7,7 +7,7 @@ import contextlib import gc import os -from typing import List, Optional, Set, Tuple, Type +from typing import Optional import habana_frameworks.torch as htorch # noqa:F401 import torch @@ -49,7 +49,7 @@ def __init__( rank: int, distributed_init_method: str, is_driver_worker: bool = False, - model_runner_cls: Optional[Type[ModelRunnerBase]] = None, + model_runner_cls: Optional[type[ModelRunnerBase]] = None, ) -> None: WorkerBase.__init__(self, vllm_config=vllm_config) self.parallel_config.rank = rank @@ -69,9 +69,9 @@ def __init__( vllm_config=vllm_config, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. - self.cache_engine: List[HPUCacheEngine] + self.cache_engine: list[HPUCacheEngine] # Initialize gpu_cache as pooling models don't initialize kv_caches - self.hpu_cache: Optional[List[List[torch.Tensor]]] = None + self.hpu_cache: Optional[list[list[torch.Tensor]]] = None # Torch profiler. 
Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace if envs.VLLM_TORCH_PROFILER_DIR: @@ -131,7 +131,7 @@ def load_model(self): def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501 # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501 # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501 @@ -193,7 +193,7 @@ def execute_model( return output @torch.inference_mode() - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Profiles the peak memory usage of the model to determine how many KV blocks may be allocated without OOMs. @@ -305,7 +305,7 @@ def do_metadata_broadcast(self) -> bool: return self.parallel_config.tensor_parallel_size > 1 @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: + def kv_cache(self) -> Optional[list[list[torch.Tensor]]]: return self.hpu_cache @torch.inference_mode() @@ -361,7 +361,7 @@ def remove_lora(self, lora_id: int) -> bool: def pin_lora(self, lora_id: int) -> bool: return self.model_runner.pin_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self.model_runner.list_loras() def add_prompt_adapter( @@ -377,7 +377,7 @@ def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: raise NotImplementedError( "Prompt Adapter is not implemented for HPU backend.") - def list_prompt_adapters(self) -> Set[int]: + def list_prompt_adapters(self) -> set[int]: raise NotImplementedError( "Prompt Adapter is not implemented for HPU backend.") @@ -465,11 +465,11 @@ def _allocate_kv_cache( self, num_blocks: int, device: str, - ) -> List[Tuple[torch.Tensor, torch.Tensor]]: + ) -> list[tuple[torch.Tensor, torch.Tensor]]: """Allocates KV cache on the specified device.""" kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size) - kv_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] + kv_cache: list[tuple[torch.Tensor, torch.Tensor]] = [] for _ in range(self.num_attention_layers): key_cache = torch.zeros(kv_cache_shape, dtype=self.dtype, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a37a3168bbbc7..591287608b5b0 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -8,8 +8,7 @@ import weakref from contextlib import contextmanager from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, - Tuple, Type, TypeVar, Union) +from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union import numpy as np import torch @@ -86,22 +85,22 @@ class ModelInputForGPU(ModelRunnerInputBase): input_tokens: Optional[torch.Tensor] = None input_positions: Optional[torch.Tensor] = None token_types: Optional[torch.Tensor] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None + seq_lens: Optional[list[int]] = None + query_lens: Optional[list[int]] = None lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[Set[LoRARequest]] = None + lora_requests: Optional[set[LoRARequest]] = None 
attn_metadata: Optional["AttentionMetadata"] = None prompt_adapter_mapping: Optional[PromptAdapterMapping] = None - prompt_adapter_requests: Optional[Set[PromptAdapterRequest]] = None + prompt_adapter_requests: Optional[set[PromptAdapterRequest]] = None multi_modal_kwargs: Optional[BatchedTensorInputs] = None - request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None - finished_requests_ids: Optional[List[str]] = None + request_ids_to_seq_ids: Optional[dict[str, list[int]]] = None + finished_requests_ids: Optional[list[str]] = None virtual_engine: int = 0 async_callback: Optional[Callable] = None scheduler_outputs: Optional[SchedulerOutputs] = None previous_hidden_states: Optional[torch.Tensor] = None - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -119,8 +118,8 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( - cls: Type[TModelInputForGPU], - tensor_dict: Dict[str, Any], + cls: type[TModelInputForGPU], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> TModelInputForGPU: if attn_backend is not None: @@ -151,7 +150,7 @@ class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): # used by the driver worker. is_prompt: Optional[bool] = None - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -172,7 +171,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "ModelInputForGPUWithSamplingMetadata": tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) @@ -212,43 +211,43 @@ def __init__( *, # From sequence group metadata. request_id: str, - seq_ids: List[int], + seq_ids: list[int], is_prompt: bool, - block_tables: Optional[Dict[int, List[int]]], - computed_block_nums: List[int], + block_tables: Optional[dict[int, list[int]]], + computed_block_nums: list[int], n_seqs: int = 0, # Input tokens and positions. - input_tokens: Optional[List[List[int]]] = None, - input_positions: Optional[List[List[int]]] = None, - token_types: Optional[List[List[int]]] = None, - mrope_input_positions: Optional[List[List[List[int]]]] = None, + input_tokens: Optional[list[list[int]]] = None, + input_positions: Optional[list[list[int]]] = None, + token_types: Optional[list[list[int]]] = None, + mrope_input_positions: Optional[list[list[list[int]]]] = None, # The sequence length (may be capped to the sliding window). - seq_lens: Optional[List[int]] = None, + seq_lens: Optional[list[int]] = None, # The original sequence length (before applying sliding window). # This is used to compute slot mapping. - orig_seq_lens: Optional[List[int]] = None, + orig_seq_lens: Optional[list[int]] = None, # The query length. - query_lens: Optional[List[int]] = None, + query_lens: Optional[list[int]] = None, # The number of tokens that are already computed. - context_lens: Optional[List[int]] = None, + context_lens: Optional[list[int]] = None, # The current sliding window block. - curr_sliding_window_blocks: Optional[List[int]] = None, + curr_sliding_window_blocks: Optional[list[int]] = None, # LoRA inputs. 
- lora_index_mapping: Optional[List[List[int]]] = None, - lora_prompt_mapping: Optional[List[List[int]]] = None, - lora_requests: Optional[Set[LoRARequest]] = None, + lora_index_mapping: Optional[list[list[int]]] = None, + lora_prompt_mapping: Optional[list[list[int]]] = None, + lora_requests: Optional[set[LoRARequest]] = None, # Prompt adapter inputs. - prompt_adapter_index_mapping: Optional[List[int]] = None, - prompt_adapter_prompt_mapping: Optional[List[int]] = None, + prompt_adapter_index_mapping: Optional[list[int]] = None, + prompt_adapter_prompt_mapping: Optional[list[int]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, # Multi-modal inputs. multi_modal_kwargs: Optional[MultiModalKwargs] = None, - multi_modal_placeholder_maps: Optional[Dict[ + multi_modal_placeholder_maps: Optional[dict[ str, MultiModalPlaceholderMap]] = None, # Whether the prefix cache is hit (prefill only). @@ -430,7 +429,7 @@ def reset_cached_inter_data(self): def __init__(self, runner: "GPUModelRunnerBase", - finished_requests_ids: Optional[List[str]] = None): + finished_requests_ids: Optional[list[str]] = None): super().__init__() # Compute functions for each sequence in a sequence group. # WARNING: The order of the functions matters! @@ -475,7 +474,7 @@ def __init__(self, self.sliding_window_blocks * self.block_size def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: + finished_requests_ids: Optional[list[str]] = None) -> None: self.finished_requests_ids = finished_requests_ids # if the current batch is decode-only. @@ -484,7 +483,7 @@ def prepare(self, # Intermediate data (data in CPU before going to GPU) for # the current sequence group. - self.inter_data_list: List[ + self.inter_data_list: list[ ModelInputForGPUBuilder.InterDataForSeqGroup] = [] self.attn_metadata_builder.prepare() @@ -835,7 +834,7 @@ def build(self) -> ModelInputForGPU: # prefix caching and there is no decode request. return self.model_input_cls() - mrope_input_positions: Optional[List[List[int]]] = None + mrope_input_positions: Optional[list[list[int]]] = None if any(inter_data.mrope_input_positions is not None for inter_data in self.inter_data_list): mrope_input_positions = [[] for _ in range(3)] @@ -949,7 +948,7 @@ def build(self) -> ModelInputForGPU: is_prefill=not self.decode_only)) # Prompt adapter data. - prompt_adapter_requests: Set[PromptAdapterRequest] = set() + prompt_adapter_requests: set[PromptAdapterRequest] = set() prompt_adapter_mapping = None if self.enable_prompt_adapter: prompt_adapter_requests = set( @@ -998,8 +997,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): """ Helper class for shared methods between GPU model runners. """ - _model_input_cls: Type[TModelInputForGPU] - _builder_cls: Type[ModelInputForGPUBuilder] + _model_input_cls: type[TModelInputForGPU] + _builder_cls: type[ModelInputForGPUBuilder] builder: ModelInputForGPUBuilder def __init__( @@ -1029,10 +1028,10 @@ def __init__( self.max_batchsize_to_capture = \ self.vllm_config.compilation_config.max_capture_size - self.graph_runners: List[Dict[int, CUDAGraphRunner]] = [ + self.graph_runners: list[dict[int, CUDAGraphRunner]] = [ {} for _ in range(self.parallel_config.pipeline_parallel_size) ] - self.graph_memory_pool: Optional[Tuple[ + self.graph_memory_pool: Optional[tuple[ int, int]] = None # Set during graph capture. 
self.has_inner_state = model_config.has_inner_state @@ -1090,7 +1089,7 @@ def __init__( int(self.cache_config.cpu_offload_gb * 1024**3)) # Used to cache python objects - self.inter_data_cache: Dict[int, PyObjectCache] = {} + self.inter_data_cache: dict[int, PyObjectCache] = {} # Using the PythonizationCache in Pipeline-Parallel clobbers the # SequenceGroupToSample object. In Pipeline-Parallel, we have @@ -1197,8 +1196,8 @@ def get_max_block_per_batch(self) -> int: def _prepare_model_input_tensors( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - finished_requests_ids: Optional[List[str]] = None + seq_group_metadata_list: list[SequenceGroupMetadata], + finished_requests_ids: Optional[list[str]] = None ) -> TModelInputForGPU: """Helper method to prepare the model input based on a given sequence group. Prepares metadata needed for the base model forward pass but not @@ -1254,8 +1253,8 @@ def _dummy_run(self, # that will have unique loras, an therefore the max amount of memory # consumption create dummy lora request copies from the lora request # passed in, which contains a lora from the lora warmup path. - dummy_lora_requests: List[LoRARequest] = [] - dummy_lora_requests_per_seq: List[LoRARequest] = [] + dummy_lora_requests: list[LoRARequest] = [] + dummy_lora_requests_per_seq: list[LoRARequest] = [] if self.lora_config: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): @@ -1276,7 +1275,7 @@ def _dummy_run(self, # Profile memory usage with max_num_sequences sequences and the # total number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] + seqs: list[SequenceGroupMetadata] = [] # Additional GPU memory may be needed for multi-modal encoding, # which needs to be accounted for when calculating the GPU blocks # for vLLM blocker manager. 
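The hunks above apply the same mechanical rewrite used throughout this patch: the deprecated typing aliases (List, Dict, Set, Tuple, Type) become the builtin generics standardised by PEP 585, while Optional, Union and Callable stay imported from typing. A minimal, self-contained sketch of that before/after pattern on Python 3.9+ follows; the function names and shapes are illustrative only and are not taken from the vLLM sources.

    # Sketch of the PEP 585 rewrite pattern used in this patch.
    # Only annotations change; runtime behaviour is identical on Python 3.9+.
    from typing import Optional, Union  # still needed for Optional/Union

    # Before (Python 3.8 style):
    #   def pad_batch(tokens: List[List[int]],
    #                 pad_id: int) -> Tuple[List[List[int]], Dict[str, int]]: ...

    def pad_batch(tokens: list[list[int]],
                  pad_id: int) -> tuple[list[list[int]], dict[str, int]]:
        """Right-pad token lists to a common length and report simple stats."""
        width = max((len(t) for t in tokens), default=0)
        padded = [t + [pad_id] * (width - len(t)) for t in tokens]
        stats: dict[str, int] = {"batch_size": len(tokens), "padded_len": width}
        return padded, stats

    def first_token(tokens: list[int]) -> Optional[Union[int, float]]:
        # Optional/Union still come from typing on 3.9; the X | Y syntax
        # needs Python 3.10 (or `from __future__ import annotations`).
        return tokens[0] if tokens else None

    if __name__ == "__main__":
        batch, info = pad_batch([[1, 2, 3], [4]], pad_id=0)
        assert batch == [[1, 2, 3], [4, 0, 0]] and info["padded_len"] == 3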
@@ -1364,7 +1363,7 @@ def remove_all_loras(self): raise RuntimeError("LoRA is not enabled.") self.lora_manager.remove_all_adapters() - def set_active_loras(self, lora_requests: Set[LoRARequest], + def set_active_loras(self, lora_requests: set[LoRARequest], lora_mapping: LoRAMapping) -> None: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -1385,7 +1384,7 @@ def pin_lora(self, lora_id: int) -> bool: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.pin_adapter(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.list_adapters() @@ -1396,7 +1395,7 @@ def remove_all_prompt_adapters(self): self.prompt_adapter_manager.remove_all_adapters() def set_active_prompt_adapters( - self, prompt_adapter_requests: Set[PromptAdapterRequest], + self, prompt_adapter_requests: set[PromptAdapterRequest], prompt_adapter_mapping: PromptAdapterMapping) -> None: if not self.prompt_adapter_manager: raise RuntimeError("PromptAdapter is not enabled.") @@ -1419,13 +1418,13 @@ def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: raise RuntimeError("PromptAdapter is not enabled.") return self.prompt_adapter_manager.pin_adapter(prompt_adapter_id) - def list_prompt_adapters(self) -> Set[int]: + def list_prompt_adapters(self) -> set[int]: if not self.prompt_adapter_manager: raise RuntimeError("PromptAdapter is not enabled.") return self.prompt_adapter_manager.list_adapters() @torch.inference_mode() - def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: + def capture_model(self, kv_caches: list[list[torch.Tensor]]) -> None: """Cuda graph capture a model. Note that CUDA graph's performance gain is negligible if number @@ -1571,7 +1570,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: elapsed_time, cuda_graph_size / GiB_bytes) def _update_inputs_to_capture_for_enc_dec_model(self, - capture_inputs: Dict[str, + capture_inputs: dict[str, Any]): """ Updates the set of input tensors needed for CUDA graph capture in an @@ -1599,13 +1598,13 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): """ GPU model runner with sampling step. """ - _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = ( + _model_input_cls: type[ModelInputForGPUWithSamplingMetadata] = ( ModelInputForGPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder + _builder_cls: type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], ) -> ModelInputForGPUWithSamplingMetadata: model_input = \ ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( @@ -1616,9 +1615,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, + finished_requests_ids: Optional[list[str]] = None, ) -> ModelInputForGPUWithSamplingMetadata: """Prepare the model input based on a given sequence group, including metadata for the sampling step. 
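Several hunks above narrow class attributes such as _model_input_cls and _builder_cls from Type[...] to the builtin type[...]. A minimal sketch of that pattern, using illustrative stand-in classes rather than the real vLLM runner hierarchy:

    # Sketch: a generic runner base class whose subclasses pin the concrete
    # input class via a `type[...]` attribute, as done with _model_input_cls.
    from dataclasses import dataclass, field
    from typing import Generic, TypeVar

    @dataclass
    class ModelInput:
        tokens: list[int] = field(default_factory=list)

    @dataclass
    class SamplingModelInput(ModelInput):
        temperature: float = 1.0

    TInput = TypeVar("TInput", bound=ModelInput)

    class RunnerBase(Generic[TInput]):
        # Subclasses assign the concrete input class here.
        _model_input_cls: type[TInput]

        def make_empty_input(self) -> TInput:
            return self._model_input_cls()

    class SamplingRunner(RunnerBase[SamplingModelInput]):
        _model_input_cls: type[SamplingModelInput] = SamplingModelInput

    if __name__ == "__main__":
        inp = SamplingRunner().make_empty_input()
        assert isinstance(inp, SamplingModelInput) and inp.temperature == 1.0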
@@ -1655,11 +1654,11 @@ def prepare_model_input( def execute_model( self, model_input: ModelInputForGPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, **kwargs, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: + ) -> Optional[Union[list[SamplerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError("num_steps > 1 is not supported in ModelRunner") @@ -1885,8 +1884,8 @@ def __init__(self, model: nn.Module, backend_name: str, self.backend_name = backend_name self.attn_state = attn_state - self.input_buffers: Dict[str, torch.Tensor] = {} - self.output_buffers: Dict[str, torch.Tensor] = {} + self.input_buffers: dict[str, torch.Tensor] = {} + self.output_buffers: dict[str, torch.Tensor] = {} self._graph: Optional[torch.cuda.CUDAGraph] = None self._is_encoder_decoder_model = is_encoder_decoder_model @@ -1901,9 +1900,9 @@ def capture( input_ids: torch.Tensor, positions: torch.Tensor, intermediate_inputs: Optional[IntermediateTensors], - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], attn_metadata: AttentionMetadata, - memory_pool: Optional[Tuple[int, int]], + memory_pool: Optional[tuple[int, int]], stream: torch.cuda.Stream, **kwargs, ): diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 935325cb2e1c0..95ecbc1de264d 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -2,8 +2,7 @@ import dataclasses from abc import ABC, abstractmethod -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, - TypeVar) +from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar import torch import torch.nn as nn @@ -24,7 +23,7 @@ def _add_attn_metadata_broadcastable_dict( - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], attn_metadata: Optional["AttentionMetadata"]) -> None: """ Helper method to update tensor_dict with broadcastable @@ -36,8 +35,8 @@ def _add_attn_metadata_broadcastable_dict( def _init_attn_metadata_from_tensor_dict( attn_backend: "AttentionBackend", - tensor_dict: Dict[str, Any], -) -> Dict[str, Any]: + tensor_dict: dict[str, Any], +) -> dict[str, Any]: """ Helper method to initialize AttentionMetadata based on an AttentionBackend and broadcastable AttentionMetadata fields. @@ -57,7 +56,7 @@ def _init_attn_metadata_from_tensor_dict( def _init_sampling_metadata_from_tensor_dict( # type: ignore - tensor_dict: Dict[str, Any]) -> Dict[str, Any]: + tensor_dict: dict[str, Any]) -> dict[str, Any]: """ Helper method to initialize SamplingMetadata based on broadcastable SamplingMetadata fields. 
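The model_runner_base.py hunks above all touch helpers that shuttle model inputs between workers as a plain dict[str, Any]. A rough, framework-free sketch of the as_broadcastable_tensor_dict / from_broadcasted_tensor_dict round trip, assuming a much simpler input class than the real ones (which also fold in attention and sampling metadata):

    # Sketch of the broadcastable-dict pattern: a dataclass that can flatten
    # itself into a dict[str, Any] and be rebuilt from one on another worker.
    from dataclasses import dataclass
    from typing import Any, Optional

    @dataclass
    class TinyModelInput:
        input_tokens: Optional[list[int]] = None
        seq_lens: Optional[list[int]] = None
        virtual_engine: int = 0

        def as_broadcastable_tensor_dict(self) -> dict[str, Any]:
            # Drop unset fields so the broadcast payload stays small.
            tensor_dict = {
                "input_tokens": self.input_tokens,
                "seq_lens": self.seq_lens,
                "virtual_engine": self.virtual_engine,
            }
            return {k: v for k, v in tensor_dict.items() if v is not None}

        @classmethod
        def from_broadcasted_tensor_dict(
                cls, tensor_dict: dict[str, Any]) -> "TinyModelInput":
            return cls(**tensor_dict)

    if __name__ == "__main__":
        src = TinyModelInput(input_tokens=[1, 2, 3], seq_lens=[3])
        wire = src.as_broadcastable_tensor_dict()
        assert TinyModelInput.from_broadcasted_tensor_dict(wire) == src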
@@ -78,7 +77,7 @@ def _init_sampling_metadata_from_tensor_dict( # type: ignore def _add_sampling_metadata_broadcastable_dict( - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], sampling_metadata: Optional["SamplingMetadata"]) -> None: """ Helper method to update tensor_dict with broadcastable @@ -90,8 +89,8 @@ def _add_sampling_metadata_broadcastable_dict( def _init_frozen_model_input_from_tensor_dict( - frozen_model_input_cls: Type["ModelRunnerInputBase"], - tensor_dict: Dict[str, Any]) -> Dict[str, Any]: + frozen_model_input_cls: type["ModelRunnerInputBase"], + tensor_dict: dict[str, Any]) -> dict[str, Any]: """ Helper method to initialize a frozen ModelInput based on broadcastable """ @@ -109,7 +108,7 @@ def _init_frozen_model_input_from_tensor_dict( class BroadcastableModelInput(ABC): @abstractmethod - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> dict[str, Any]: """ Extract broadcastable fields. Override for fields that require some custom deserialization. @@ -119,8 +118,8 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @classmethod @abstractmethod def from_broadcasted_tensor_dict( - cls: Type[T], - tensor_dict: Dict[str, Any], + cls: type[T], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> T: """ @@ -150,7 +149,7 @@ class ModelRunnerInputBuilderBase(ABC, Generic[T]): @abstractmethod def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: + finished_requests_ids: Optional[list[str]] = None) -> None: raise NotImplementedError @abstractmethod @@ -191,12 +190,12 @@ def __init__( self.observability_config = vllm_config.observability_config # Map of request_id -> generator used for seeded random sampling - generators: Dict[str, torch.Generator] = {} + generators: dict[str, torch.Generator] = {} @abstractmethod def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], ) -> T: """ Make an instance of a ModelRunnerInputBase from the broadcasted tensor @@ -207,9 +206,9 @@ def make_model_input_from_broadcasted_tensor_dict( @abstractmethod def prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, + finished_requests_ids: Optional[list[str]] = None, ) -> T: """ Prepare the inputs to ModelRunnerBase.execute_model from an execution @@ -225,17 +224,17 @@ def get_model(self) -> nn.Module: def execute_model( self, model_input: T, - kv_caches: Optional[List[torch.Tensor]], + kv_caches: Optional[list[torch.Tensor]], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, **kwargs, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: """ Execute the model on the given input. """ raise NotImplementedError - def get_generators(self, finished_request_ids: Optional[List[str]] = None): + def get_generators(self, finished_request_ids: Optional[list[str]] = None): """ Return dict of per-request generators used for random sampling. 
""" diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 7ddf382079c62..f77411755a0ee 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -3,8 +3,7 @@ import dataclasses import functools from dataclasses import dataclass, field -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Union) +from typing import TYPE_CHECKING, Any, Callable, Optional, Union import torch @@ -37,7 +36,7 @@ MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN", "FLASHINFER"] def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ - -> List[str]: + -> list[str]: if chunked_prefill_enabled: return MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS else: @@ -146,8 +145,8 @@ class StatefulModelInput(BroadcastableModelInput): # actual frozen model input dataclass passed to _base_model_runner frozen_model_input: Optional[ModelInputForGPUWithSamplingMetadata] = None - # list of model outputs for each step, may not be all pythonized - cached_outputs: List[ModelOutput] = field(default_factory=list) + # List of model outputs for each step, may not be all pythonized + cached_outputs: list[ModelOutput] = field(default_factory=list) # used to pass sampled token ids from the last step to the current step for # TP workers. Used to append to end of outputs and used by advance_step @@ -158,13 +157,13 @@ class StatefulModelInput(BroadcastableModelInput): is_first_multi_step: bool = False base_output_proc_callback: Optional[Callable] = None # ping-pong data structures for multi-step to wait on the previous step - step_cuda_events: List[torch.cuda.Event] = field( + step_cuda_events: list[torch.cuda.Event] = field( default_factory=lambda: [torch.cuda.Event(blocking=True)] * 2) num_seqs: int = -1 num_queries: int = -1 num_single_step_prefills: int = 0 - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> dict[str, Any]: assert self.frozen_model_input is not None tensor_dict = self.frozen_model_input.as_broadcastable_tensor_dict() new_tensor_dict = { @@ -183,7 +182,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "StatefulModelInput": tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) @@ -312,7 +311,7 @@ def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs): super().__init__(*args, **kwargs) # Check attention backend support. 
- supported_attention_backends: List[str] = \ + supported_attention_backends: list[str] = \ _get_supported_attention_backends( self.scheduler_config.chunked_prefill_enabled) if self.attn_backend.get_name() not in supported_attention_backends: @@ -346,7 +345,7 @@ def _copy_stream(self): return torch.cuda.Stream() def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> StatefulModelInput: + self, tensor_dict: dict[str, Any]) -> StatefulModelInput: model_input = (StatefulModelInput.from_broadcasted_tensor_dict( tensor_dict, attn_backend=self.attn_backend, @@ -355,9 +354,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None + finished_requests_ids: Optional[list[str]] = None ) -> StatefulModelInput: frozen_model_input: ModelInputForGPUWithSamplingMetadata = \ self._base_model_runner.prepare_model_input( @@ -410,7 +409,7 @@ def _async_process_outputs(self, model_input: StatefulModelInput, def _final_process_outputs( self, model_input: StatefulModelInput, - output_proc_callback: Optional[Callable]) -> List[SamplerOutput]: + output_proc_callback: Optional[Callable]) -> list[SamplerOutput]: assert model_input.frozen_model_input is not None has_async_callback = output_proc_callback is not None @@ -461,10 +460,10 @@ def _final_process_outputs( def execute_model( self, model_input: StatefulModelInput, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: + ) -> Optional[Union[list[SamplerOutput], IntermediateTensors]]: """ Execute the model for a single step and update multi-step metadata @@ -671,7 +670,7 @@ def profile_run(self) -> None: def remove_all_loras(self): return self._base_model_runner.remove_all_loras() - def capture_model(self, kv_caches: List[List]) -> None: + def capture_model(self, kv_caches: list[list]) -> None: return self._base_model_runner.capture_model(kv_caches) @property @@ -679,8 +678,8 @@ def vocab_size(self) -> int: return self._base_model_runner.vocab_size -DeferredLogprobsReturnType = Tuple[Optional[List[Optional[PromptLogprobs]]], - Optional[List[SampleLogprobs]]] +DeferredLogprobsReturnType = tuple[Optional[list[Optional[PromptLogprobs]]], + Optional[list[SampleLogprobs]]] def deferred_pythonize_logprobs( @@ -854,7 +853,7 @@ def _pythonize_sampler_output( seq_ids = seq_group.seq_ids next_token_ids = sample_result parent_ids = [0] - seq_outputs: List[SequenceOutput] + seq_outputs: list[SequenceOutput] if cache is not None: completion_seq_group_output: CompletionSequenceGroupOutput = \ diff --git a/vllm/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py index 3871199987cee..887af34660217 100644 --- a/vllm/worker/multi_step_tpu_worker.py +++ b/vllm/worker/multi_step_tpu_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import Dict, Optional, Tuple +from typing import Optional import torch @@ -20,7 +20,7 @@ def __init__(self, *args, **kwargs): def _get_driver_input_and_broadcast( self, execute_model_req: ExecuteModelRequest - ) -> Tuple[ModelInputForTPU, WorkerInput, Dict[str, torch.Tensor]]: + ) -> tuple[ModelInputForTPU, WorkerInput, dict[str, torch.Tensor]]: assert self.is_driver_worker assert 
execute_model_req.virtual_engine == 0 @@ -71,7 +71,7 @@ def _get_driver_input_and_broadcast( def prepare_input( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[Tuple[ModelInputForTPU, WorkerInput, Dict[str, + ) -> Optional[tuple[ModelInputForTPU, WorkerInput, dict[str, torch.Tensor]]]: if self.is_driver_worker: if execute_model_req is None: diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py index 3518ab2f64fed..d3f73fea203e3 100644 --- a/vllm/worker/multi_step_worker.py +++ b/vllm/worker/multi_step_worker.py @@ -2,7 +2,7 @@ import dataclasses from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Optional import torch @@ -35,13 +35,13 @@ def __init__(self, *args, **kwargs): ) pipeline_parallel_size = self.parallel_config.pipeline_parallel_size - self.multi_step_states: List[ + self.multi_step_states: list[ Optional[MultiStepState]] = [None] * pipeline_parallel_size self.temp_output = None def _get_driver_input_and_broadcast( self, execute_model_req: ExecuteModelRequest - ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: + ) -> tuple[BroadcastableModelInput, WorkerInput, dict[str, torch.Tensor]]: """ Get the driver input and broadcast it to other workers. """ @@ -136,7 +136,7 @@ def _prepare_last_sampled_token_ids_for_tp_workers( def prepare_input( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[Tuple[StatefulModelInput, WorkerInput, Dict[str, + ) -> Optional[tuple[StatefulModelInput, WorkerInput, dict[str, torch.Tensor]]]: """ Depending on the current state of the request and multi step worker, diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index f2093fc42ad16..b4b4b3535d954 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -3,7 +3,7 @@ import os from dataclasses import dataclass from importlib.util import find_spec -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Optional, Union import torch from torch import nn @@ -39,13 +39,13 @@ class ModelInputForNeuron(ModelRunnerInputBase): multi_modal_kwargs: Optional[BatchedTensorInputs] = None def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: + self) -> dict[str, Union[int, torch.Tensor]]: raise NotImplementedError("ModelInputForNeuron cannot be broadcast.") @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "ModelInputForNeuron": assert attn_backend is None @@ -85,7 +85,7 @@ def __init__( # NEURON needs to update sampling parameters when request IDs change # across batches. This variable stores the previous batch's request IDs # to determine if an update is needed. 
- self._previous_batch_request_ids: List[str] = [] + self._previous_batch_request_ids: list[str] = [] if not self._on_device_sampling_disabled: logger.warning( @@ -120,16 +120,16 @@ def get_model(self) -> nn.Module: def _prepare_prompt( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int], + seq_group_metadata_list: list[SequenceGroupMetadata], + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, list[int], BatchedTensorInputs]: assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - input_block_ids: List[int] = [] + input_tokens: list[list[int]] = [] + input_positions: list[list[int]] = [] + input_block_ids: list[int] = [] - seq_lens: List[int] = [] - multi_modal_kwargs_list: List[MultiModalKwargs] = [] + seq_lens: list[int] = [] + multi_modal_kwargs_list: list[MultiModalKwargs] = [] for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -184,13 +184,13 @@ def _prepare_prompt( def _prepare_decode( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + seq_group_metadata_list: list[SequenceGroupMetadata], + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - input_block_ids: List[int] = [] - context_lens: List[int] = [] + input_tokens: list[list[int]] = [] + input_positions: list[list[int]] = [] + input_block_ids: list[int] = [] + context_lens: list[int] = [] for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt @@ -232,14 +232,14 @@ def _prepare_decode( return input_tokens, input_positions, input_block_ids def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> ModelInputForNeuron: + self, tensor_dict: dict[str, Any]) -> ModelInputForNeuron: return ModelInputForNeuron.from_broadcasted_tensor_dict(tensor_dict) def prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None + finished_requests_ids: Optional[list[str]] = None ) -> ModelInputForNeuron: multi_modal_kwargs = None # NOTE: We assume that all sequences in the group are all prompts or @@ -312,10 +312,10 @@ def _convert_to_neuron_top_k(self, top_k: int) -> int: def execute_model( self, model_input: ModelInputForNeuron, - kv_caches: Optional[List[torch.Tensor]] = None, + kv_caches: Optional[list[torch.Tensor]] = None, intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: if num_steps > 1: raise ValueError( "NeuronModelRunner does not support multi-step execution.") diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index df651e05a7bbc..c229c283dbf50 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """A Neuron worker class.""" -from typing import List, Optional, Tuple +from typing import Optional import torch import torch.distributed @@ -45,7 +45,7 @@ def __init__( def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[List[SamplerOutput]]: + ) -> 
Optional[list[SamplerOutput]]: assert execute_model_req is not None assert (not execute_model_req.blocks_to_swap_in and not execute_model_req.blocks_to_swap_out @@ -66,7 +66,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Determine the number of available KV blocks. Swapping is not yet supported, so always return num_cpu_blocks=0. @@ -100,7 +100,7 @@ def do_metadata_broadcast(self) -> bool: return False @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: + def kv_cache(self) -> Optional[list[list[torch.Tensor]]]: return None @torch.inference_mode() diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 5035ea20294c4..cb2857b5fc687 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from collections import defaultdict -from typing import Dict, List, NamedTuple, Optional, Tuple +from typing import NamedTuple, Optional import openvino as ov import torch @@ -27,8 +27,8 @@ class ModelInput(NamedTuple): input_tokens: torch.Tensor input_positions: torch.Tensor attn_metadata: Optional[OpenVINOAttentionMetadata] - seq_lens: List[int] - query_lens: List[int] + seq_lens: list[int] + query_lens: list[int] multi_modal_kwargs: BatchedTensorInputs @classmethod @@ -88,7 +88,7 @@ def get_model(self) -> nn.Module: def _prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], ) -> ModelInput: """Prepare the model input based on a given sequence group. @@ -100,20 +100,20 @@ def _prepare_model_input( - input_tokens[:num_prefill_tokens] contains prefill tokens. - input_tokens[num_prefill_tokens:] contains decode tokens. """ - input_tokens: List[int] = [] - input_positions: List[int] = [] - - seq_lens: List[int] = [] - past_lens: List[int] = [] - query_lens: List[int] = [] - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - multi_modal_placeholder_maps: Dict[ + input_tokens: list[int] = [] + input_positions: list[int] = [] + + seq_lens: list[int] = [] + past_lens: list[int] = [] + query_lens: list[int] = [] + multi_modal_kwargs_list: list[MultiModalKwargs] = [] + multi_modal_placeholder_maps: dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - subsequence_begins: List[int] = [] - block_indices: List[int] = [] - block_indices_begins: List[int] = [] + subsequence_begins: list[int] = [] + block_indices: list[int] = [] + block_indices_begins: list[int] = [] # initialize beginning of prefix sums subsequence_begins.append(0) @@ -297,8 +297,8 @@ def _prepare_model_input( def prepare_input_tensors( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, OpenVINOAttentionMetadata, + seq_group_metadata_list: list[SequenceGroupMetadata], + ) -> tuple[torch.Tensor, torch.Tensor, OpenVINOAttentionMetadata, SamplingMetadata, BatchedTensorInputs]: # Prepare input tensors. 
( @@ -329,8 +329,8 @@ def prepare_input_tensors( @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - kv_caches: List[Tuple["ov.Tensor", "ov.Tensor"]], + seq_group_metadata_list: list[SequenceGroupMetadata], + kv_caches: list[tuple["ov.Tensor", "ov.Tensor"]], ) -> Optional[SamplerOutput]: ( input_tokens, diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index fad91270ea2a4..fc70c9a2d8c74 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """An OpenVINO worker class.""" -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Optional import openvino as ov import torch @@ -80,13 +80,13 @@ def __init__( ) # Initialize the cache. - self.kv_cache: List[Tuple[ov.Tensor, + self.kv_cache: list[tuple[ov.Tensor, ov.Tensor]] = self._allocate_kv_cache( self.num_device_blocks, ov_core, ov_device) # Initialize the swap. - self.swap_cache: List[Tuple[ov.Tensor, + self.swap_cache: list[tuple[ov.Tensor, ov.Tensor]] = self._allocate_swap_cache( self.num_swap_blocks, ov_device) @@ -95,11 +95,11 @@ def _allocate_kv_cache( num_blocks: int, ov_core: ov.Core, ov_device: str, - ) -> List[Tuple[ov.Tensor, ov.Tensor]]: + ) -> list[tuple[ov.Tensor, ov.Tensor]]: """Allocates KV cache.""" k_block_shape = v_block_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size)[1:] - kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] = [] + kv_cache: list[tuple[ov.Tensor, ov.Tensor]] = [] if current_platform.is_openvino_cpu(): for _ in range(self.num_layers): @@ -134,11 +134,11 @@ def _allocate_swap_cache( self, num_blocks: int, ov_device: str, - ) -> List[Tuple[ov.Tensor, ov.Tensor]]: + ) -> list[tuple[ov.Tensor, ov.Tensor]]: """Allocates swap cache.""" k_block_shape = v_block_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size)[1:] - swap_cache: List[Tuple[ov.Tensor, ov.Tensor]] = [] + swap_cache: list[tuple[ov.Tensor, ov.Tensor]] = [] if num_blocks == 0: return swap_cache @@ -159,21 +159,21 @@ def _allocate_swap_cache( return swap_cache - def swap_in(self, src_to_dst: List[Tuple[int, int]]) -> None: + def swap_in(self, src_to_dst: list[tuple[int, int]]) -> None: for i in range(self.num_layers): for swap_tensor, kv_tensor in zip(self.swap_cache[i], self.kv_cache[i]): self.attn_backend.swap_blocks(swap_tensor, kv_tensor, src_to_dst) - def swap_out(self, src_to_dst: List[Tuple[int, int]]) -> None: + def swap_out(self, src_to_dst: list[tuple[int, int]]) -> None: for i in range(self.num_layers): for swap_tensor, kv_tensor in zip(self.swap_cache[i], self.kv_cache[i]): self.attn_backend.swap_blocks(kv_tensor, swap_tensor, src_to_dst) - def copy(self, src_to_dsts: List[Tuple[int, int]]) -> None: + def copy(self, src_to_dsts: list[tuple[int, int]]) -> None: if (len(src_to_dsts) > 0): self.attn_backend.copy_blocks(self.kv_cache, src_to_dsts) @@ -243,7 +243,7 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. 
self.cache_engine: OpenVINOCacheEngine - self.kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] + self.kv_cache: list[tuple[ov.Tensor, ov.Tensor]] def init_device(self) -> None: self.init_distributed_environment() @@ -253,7 +253,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Determine the number of blocks available for the KV cache. This determines how many KV blocks can fit into the configured @@ -352,15 +352,15 @@ def _init_cache_engine(self) -> None: key_cache.data[:] = 0 value_cache.data[:] = 0 - def cache_swap_in(self, src_to_dst: List[Tuple[int, int]]) -> None: + def cache_swap_in(self, src_to_dst: list[tuple[int, int]]) -> None: self.cache_engine.swap_in(src_to_dst) - def cache_swap_out(self, src_to_dst: List[Tuple[int, int]]) -> None: + def cache_swap_out(self, src_to_dst: list[tuple[int, int]]) -> None: self.cache_engine.swap_out(src_to_dst) def cache_copy( self, - blocks_to_copy: List[Tuple[int, int]], + blocks_to_copy: list[tuple[int, int]], ) -> None: self.cache_engine.copy(blocks_to_copy) # type: ignore @@ -371,7 +371,7 @@ def get_model(self) -> nn.Module: def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: if execute_model_req is None: seq_group_metadata_list = None else: @@ -384,7 +384,7 @@ def execute_model( blocks_to_copy = execute_model_req.blocks_to_copy blocks_to_swap_in = execute_model_req.blocks_to_swap_in blocks_to_swap_out = execute_model_req.blocks_to_swap_out - data: Dict[str, Any] = { + data: dict[str, Any] = { "num_seq_groups": num_seq_groups, "blocks_to_copy": execute_model_req.blocks_to_copy, "blocks_to_swap_in": execute_model_req.blocks_to_swap_in, @@ -488,7 +488,7 @@ def model_profile_run(): # Profile memory usage with max_num_sequences sequences and the # total # number of tokens equal to max_num_batched_tokens. 
- seqs: List[SequenceGroupMetadata] = [] + seqs: list[SequenceGroupMetadata] = [] for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index cbd5e2060cad5..20eb0bf31f9ca 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Optional, Union import torch @@ -30,9 +30,9 @@ class ModelInputForGPUWithPoolingMetadata(ModelInputForGPU): class PoolingModelRunner( GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]): - _model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = ( + _model_input_cls: type[ModelInputForGPUWithPoolingMetadata] = ( ModelInputForGPUWithPoolingMetadata) - _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder + _builder_cls: type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder def __init__( self, @@ -48,10 +48,10 @@ def __init__( def execute_model( self, model_input: ModelInputForGPUWithPoolingMetadata, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: + ) -> Optional[Union[list[PoolerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( "PoolingModelRunner does not support multi-step execution.") @@ -151,7 +151,7 @@ def execute_model( def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: Dict[str, + tensor_dict: dict[str, Any]) -> ModelInputForGPUWithPoolingMetadata: return ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict( tensor_dict, @@ -160,9 +160,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + seq_group_metadata_list: Optional[list[SequenceGroupMetadata]], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None + finished_requests_ids: Optional[list[str]] = None ) -> ModelInputForGPUWithPoolingMetadata: assert seq_group_metadata_list is not None model_input = self._prepare_model_input_tensors( @@ -177,17 +177,17 @@ def prepare_model_input( def _prepare_pooling( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - prompt_lens: List[int], + seq_group_metadata_list: list[SequenceGroupMetadata], + prompt_lens: list[int], ) -> PoolingMetadata: """Prepare PoolingMetadata for the sequence group metadata list.""" - seq_groups: List[Tuple[List[int], PoolingParams]] = [] + seq_groups: list[tuple[list[int], PoolingParams]] = [] for i, seq_group_metadata in enumerate(seq_group_metadata_list): seq_ids = list(seq_group_metadata.seq_data.keys()) pooling_params = seq_group_metadata.pooling_params seq_groups.append((seq_ids, pooling_params)) - seq_data: Dict[int, SequenceData] = {} + seq_data: dict[int, SequenceData] = {} for seq_group_metadata in seq_group_metadata_list: seq_data.update(seq_group_metadata.seq_data) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 53541a2579ed5..59b410b705277 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -3,8 +3,7 @@ import enum import time from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, 
Optional, Tuple, - Type, Union) +from typing import TYPE_CHECKING, Any, Callable, Optional, Union from unittest.mock import patch import numpy as np @@ -60,15 +59,15 @@ class ModelInputForTPU(ModelRunnerInputBase): t: torch.Tensor p: torch.Tensor num_samples: int - n: List[int] - seq_groups: List[List[int]] + n: list[int] + seq_groups: list[list[int]] is_first_multi_step: bool = True is_last_step: bool = True virtual_engine: int = 0 async_callback: Optional[Callable] = None def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: + self) -> dict[str, Union[int, torch.Tensor]]: tensor_dict = { "token_ids": self.token_ids, "position_ids": self.position_ids, @@ -87,8 +86,8 @@ def as_broadcastable_tensor_dict( @classmethod def from_broadcasted_tensor_dict( - cls: Type["ModelInputForTPU"], - tensor_dict: Dict[str, Any], + cls: type["ModelInputForTPU"], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "ModelInputForTPU": if attn_backend is not None: @@ -121,7 +120,7 @@ def __init__( self.model_config.is_attention_free, False, ) - self.cached_step_outputs: List[torch.Tensor] = [] + self.cached_step_outputs: list[torch.Tensor] = [] smem_size = 512 * 1024 block_table_size = 4 * self.block_tables.size @@ -167,7 +166,7 @@ def _dummy_run( self, batch_size: int, seq_len: int, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: list[tuple[torch.Tensor, torch.Tensor]], exec_mode: ExecutionMode, ) -> None: exec_mode = ExecutionMode(exec_mode) @@ -280,7 +279,7 @@ def _dummy_run( def warmup_model( self, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: list[tuple[torch.Tensor, torch.Tensor]], ) -> None: # Prefill logger.info("Compiling the model with different input shapes...") @@ -347,14 +346,14 @@ def warmup_model( def _prepare_prompt( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor]: + seq_group_metadata_list: list[SequenceGroupMetadata], + ) -> tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor]: assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - prompt_lens: List[int] = [] - context_lens: List[int] = [] - slot_mapping: List[int] = [] + input_tokens: list[int] = [] + input_positions: list[int] = [] + prompt_lens: list[int] = [] + context_lens: list[int] = [] + slot_mapping: list[int] = [] for batch_idx, seq_group_metadata in enumerate( seq_group_metadata_list): @@ -439,13 +438,13 @@ def _prepare_prompt( def _prepare_decode( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor]: + seq_group_metadata_list: list[SequenceGroupMetadata], + ) -> tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor]: assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - slot_mapping: List[List[int]] = [] - context_lens: List[int] = [] + input_tokens: list[list[int]] = [] + input_positions: list[list[int]] = [] + slot_mapping: list[list[int]] = [] + context_lens: list[int] = [] batch_idx = 0 for seq_group_metadata in seq_group_metadata_list: @@ -510,9 +509,9 @@ def _prepare_decode( def _prepare_sample( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], padded_batch_size: int, - ) -> Tuple[torch.Tensor, torch.Tensor, List[int]]: + ) -> tuple[torch.Tensor, 
torch.Tensor, list[int]]: assert len(seq_group_metadata_list) > 0 t = [] p = [] @@ -558,9 +557,9 @@ def _prepare_sample( def prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, + finished_requests_ids: Optional[list[str]] = None, ) -> ModelInputForTPU: del finished_requests_ids # Unused. assert virtual_engine == 0 @@ -586,7 +585,7 @@ def prepare_model_input( input_lens, t, p, num_samples, n, seq_groups) def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> ModelInputForTPU: + self, tensor_dict: dict[str, Any]) -> ModelInputForTPU: model_input = ModelInputForTPU.from_broadcasted_tensor_dict( tensor_dict, attn_backend=self.attn_backend) return model_input @@ -595,10 +594,10 @@ def make_model_input_from_broadcasted_tensor_dict( def execute_model( self, model_input: ModelInputForTPU, - kv_caches: Optional[List[Any]], + kv_caches: Optional[list[Any]], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: assert intermediate_tensors is None if not model_input.is_first_multi_step: if not model_input.is_last_step: @@ -781,7 +780,7 @@ def forward( t: torch.Tensor, p: torch.Tensor, num_samples: int, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: list[tuple[torch.Tensor, torch.Tensor]], ) -> torch.Tensor: """Executes the forward pass of the model and samples the next token. @@ -888,8 +887,8 @@ def _apply_top_p(logits: torch.Tensor, p: torch.Tensor) -> torch.Tensor: def _make_decode_output( - next_token_ids: List[int], - seq_groups: List[List[int]], + next_token_ids: list[int], + seq_groups: list[list[int]], ) -> SamplerOutput: zero_logprob = Logprob(0.0) sampler_outputs = [] diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 7903e81943c24..6392faf6ef92e 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import List, Optional, Tuple, Union +from typing import Optional, Union import torch import torch_xla.core.xla_model as xm @@ -96,7 +96,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: num_layers = self.model_config.get_num_layers(self.parallel_config) head_size = self.model_config.get_head_size() num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) @@ -157,8 +157,8 @@ def initialize_cache( num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() - self.cpu_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] - self.tpu_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] + self.cpu_cache: list[tuple[torch.Tensor, torch.Tensor]] = [] + self.tpu_cache: list[tuple[torch.Tensor, torch.Tensor]] = [] tpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape( num_gpu_blocks, self.block_size, num_kv_heads, head_size) cpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape( @@ -207,7 +207,7 @@ def do_metadata_broadcast(self) -> bool: return self.parallel_config.tensor_parallel_size > 1 @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: + def kv_cache(self) -> Optional[list[list[torch.Tensor]]]: # NOTE(woosuk): This 
assumes virtual_engine == 0, i.e., no pipeline # parallelism. return [self.tpu_cache] @@ -268,10 +268,10 @@ def execute_worker(self, worker_input: WorkerInput) -> None: def _make_src_to_dst( - mapping: List[Tuple[int, int]], + mapping: list[tuple[int, int]], src_device: Union[torch.device, str], dst_device: Union[torch.device, str], -) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: +) -> Optional[tuple[torch.Tensor, torch.Tensor]]: if not mapping: return None diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index ad94a6a4db7a3..d0ecb52eb3c60 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -2,7 +2,7 @@ """A GPU worker class.""" import gc import os -from typing import Dict, List, Optional, Set, Tuple, Type, Union +from typing import Optional, Union import torch import torch.distributed @@ -50,7 +50,7 @@ def __init__( rank: int, distributed_init_method: str, is_driver_worker: bool = False, - model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None, + model_runner_cls: Optional[type[GPUModelRunnerBase]] = None, ) -> None: WorkerBase.__init__(self, vllm_config) self.parallel_config.rank = rank @@ -74,7 +74,7 @@ def __init__( not in ("medusa", "mlp_speculator", "eagle", "deepseek_mtp")) \ else {"return_hidden_states": True} - ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner + ModelRunnerClass: type[GPUModelRunnerBase] = ModelRunner if model_config.runner_type == "pooling": ModelRunnerClass = PoolingModelRunner elif self.model_config.is_encoder_decoder: @@ -90,10 +90,10 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. - self.cache_engine: List[CacheEngine] + self.cache_engine: list[CacheEngine] # Initialize gpu_cache as pooling models don't initialize kv_caches - self.gpu_cache: Optional[List[List[torch.Tensor]]] = None - self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} + self.gpu_cache: Optional[list[list[torch.Tensor]]] = None + self._seq_group_metadata_cache: dict[str, SequenceGroupMetadata] = {} # Torch profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace @@ -202,7 +202,7 @@ def save_tensorized_model( tensorizer_config=tensorizer_config, ) @torch.inference_mode() - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Profiles the peak memory usage of the model to determine how many KV blocks may be allocated without OOMs. @@ -345,7 +345,7 @@ def do_metadata_broadcast(self) -> bool: return self.parallel_config.tensor_parallel_size > 1 @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: + def kv_cache(self) -> Optional[list[list[torch.Tensor]]]: return self.gpu_cache @torch.inference_mode() @@ -396,9 +396,9 @@ def execute_worker(self, worker_input: WorkerInput) -> None: def _get_cached_seq_group_metadata( self, - seq_group_metadata_list: List[Union[SequenceGroupMetadata, + seq_group_metadata_list: list[Union[SequenceGroupMetadata, SequenceGroupMetadataDelta]], - finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: + finished_request_ids: list[str]) -> list[SequenceGroupMetadata]: """Return a list of cached Sequence Group Metadata after updating its state. 
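The worker.py hunk above keeps Union imported from typing while the containers themselves switch to builtin generics, e.g. list[Union[SequenceGroupMetadata, SequenceGroupMetadataDelta]]. A small sketch of mixing the two forms on Python 3.9, with hypothetical stand-in classes in place of the vLLM sequence-group types:

    # Sketch: builtin generics compose with typing.Union on Python 3.9;
    # the `A | B` spelling would additionally require Python 3.10+.
    from dataclasses import dataclass
    from typing import Union

    @dataclass
    class Metadata:
        request_id: str
        tokens: list[int]

    @dataclass
    class MetadataDelta:
        request_id: str
        new_tokens: list[int]

    def merge(cache: dict[str, Metadata],
              updates: list[Union[Metadata, MetadataDelta]]) -> dict[str, Metadata]:
        """Apply full entries and deltas to a request-id keyed cache."""
        for item in updates:
            if isinstance(item, Metadata):
                cache[item.request_id] = item
            else:
                cache[item.request_id].tokens.extend(item.new_tokens)
        return cache

    if __name__ == "__main__":
        cache = merge({}, [Metadata("r1", [1])])
        cache = merge(cache, [MetadataDelta("r1", [2, 3])])
        assert cache["r1"].tokens == [1, 2, 3]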
@@ -439,7 +439,7 @@ def _execute_model_spmd( self, execute_model_req: ExecuteModelRequest, intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: if execute_model_req is not None: new_seq_group_metadata_list = self._get_cached_seq_group_metadata( execute_model_req.seq_group_metadata_list, @@ -460,7 +460,7 @@ def remove_lora(self, lora_id: int) -> bool: def pin_lora(self, lora_id: int) -> bool: return self.model_runner.pin_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self.model_runner.list_loras() def add_prompt_adapter( @@ -473,7 +473,7 @@ def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: return self.model_runner.pin_prompt_adapter(prompt_adapter_id) - def list_prompt_adapters(self) -> Set[int]: + def list_prompt_adapters(self) -> set[int]: return self.model_runner.list_prompt_adapters() @property diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 7cc1562a5bce5..225d2036f5911 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -4,7 +4,7 @@ import os import time from abc import abstractmethod -from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union +from typing import Any, Optional, Union import cloudpickle import torch @@ -77,7 +77,7 @@ def load_model(self) -> None: def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: raise NotImplementedError def start_worker_execution_loop(self) -> None: @@ -92,14 +92,14 @@ def start_worker_execution_loop(self) -> None: if output is None: return None - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Determine the number of available blocks for the GPU KV cache and swappable CPU KV cache. The implementation may run profiling or other heuristics to determine the size of caches. - Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks + Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks are blocks that are "active" on the device and can be appended to. num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be appended to. 
@@ -121,7 +121,7 @@ def remove_lora(self, lora_id: int) -> bool: def pin_lora(self, lora_id: int) -> bool: raise NotImplementedError - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: raise NotImplementedError @property @@ -150,7 +150,7 @@ def __init__( def init_device(self) -> None: self.worker.init_device() - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: return self.worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, @@ -167,7 +167,7 @@ def get_model(self) -> nn.Module: def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: return self.worker.execute_model(execute_model_req) def get_cache_block_size_bytes(self) -> int: @@ -182,7 +182,7 @@ def remove_lora(self, lora_id: int) -> bool: def pin_lora(self, lora_id: int) -> bool: return self.worker.pin_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self.worker.list_loras() def __getattr__(self, attr): @@ -204,7 +204,7 @@ def pin_lora(self, lora_id: int) -> bool: return ValueError( f"{type(self)} does not support LoRA") # type: ignore - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: raise ValueError(f"{type(self)} does not support LoRA") @@ -223,8 +223,8 @@ class WorkerInput: @classmethod def from_broadcasted_tensor_dict( - cls: Type["WorkerInput"], - tensor_dict: Dict[str, Any], + cls: type["WorkerInput"], + tensor_dict: dict[str, Any], ) -> "WorkerInput": """ Pop fields from the given tensor_dict and populate a new instance of @@ -240,7 +240,7 @@ def from_broadcasted_tensor_dict( ) def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: + self) -> dict[str, Union[int, torch.Tensor]]: """ Extract broadcastable fields. """ @@ -282,7 +282,7 @@ def do_metadata_broadcast(self) -> bool: @property @abstractmethod - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: + def kv_cache(self) -> Optional[list[list[torch.Tensor]]]: """ Gets the list of kv caches to pass to the worker's model runner. Each element in the list is a kv cache corresponding to a particular virtual @@ -311,7 +311,7 @@ def execute_worker(self, worker_input: WorkerInput) -> None: def _get_worker_input_from_broadcast( self - ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ + ) -> Optional[tuple[BroadcastableModelInput, WorkerInput, dict[ str, torch.Tensor]]]: """ Get the worker input from the broadcasted tensor dict. """ assert self.do_metadata_broadcast @@ -331,7 +331,7 @@ def _get_worker_input_from_broadcast( def _get_driver_input_and_broadcast( self, execute_model_req: ExecuteModelRequest - ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: + ) -> tuple[BroadcastableModelInput, WorkerInput, dict[str, torch.Tensor]]: """ Get the driver input and broadcast it to other workers. """ assert self.is_driver_worker @@ -361,7 +361,7 @@ def _get_driver_input_and_broadcast( def prepare_input( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ + ) -> Optional[tuple[BroadcastableModelInput, WorkerInput, dict[ str, torch.Tensor]]]: """ Prepare the inputs to ModelRunner and workers. 
@@ -386,7 +386,7 @@ def get_model(self) -> nn.Module: def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: """Executes at least one model step on the given sequences, unless no sequences are provided.""" start_time = time.perf_counter() @@ -444,14 +444,14 @@ def execute_model( o.model_execute_time = (orig_model_execute_time + model_execute_time) - # output is List[SamplerOutput] + # output is list[SamplerOutput] return output def _execute_model_spmd( self, execute_model_req: ExecuteModelRequest, intermediate_tensors: Optional[IntermediateTensors] = None - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: """ Execute model in Single Program Multiple Data (SPMD) fashion. All workers take the same request, prepare the input and @@ -521,7 +521,7 @@ def __init__( from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - def adjust_rank(self, rank_mapping: Dict[int, int]) -> None: + def adjust_rank(self, rank_mapping: dict[int, int]) -> None: """ Adjust the rpc_rank based on the given mapping. It is only used during the initialization of the executor, @@ -530,7 +530,7 @@ def adjust_rank(self, rank_mapping: Dict[int, int]) -> None: if self.rpc_rank in rank_mapping: self.rpc_rank = rank_mapping[self.rpc_rank] - def update_environment_variables(self, envs_list: List[Dict[str, + def update_environment_variables(self, envs_list: list[dict[str, str]]) -> None: envs = envs_list[self.rpc_rank] key = 'CUDA_VISIBLE_DEVICES' @@ -540,7 +540,7 @@ def update_environment_variables(self, envs_list: List[Dict[str, del os.environ[key] update_environment_variables(envs) - def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: + def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None: """ Here we inject some common logic before initializing the worker. Arguments are passed to the worker class constructor. @@ -567,7 +567,7 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: self.worker = worker_class(**kwargs) assert self.worker is not None - def initialize_from_config(self, kv_cache_configs: List[Any]) -> None: + def initialize_from_config(self, kv_cache_configs: list[Any]) -> None: kv_cache_config = kv_cache_configs[self.rpc_rank] self.worker.initialize_from_config(kv_cache_config) # type: ignore @@ -598,8 +598,8 @@ def __getattr__(self, attr): def extract_previous_hidden_states( - data: Union[ExecuteModelRequest, Dict[str, torch.Tensor]]) -> \ - Dict[str, torch.Tensor]: + data: Union[ExecuteModelRequest, dict[str, torch.Tensor]]) -> \ + dict[str, torch.Tensor]: """If data contains previous_hidden_states, extract it. This returns a dict which can be used directly as additional kwargs in any following execute_model calls. 
This is used in draft models like EAGLE.""" diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 39957e661c474..3fc0c9c10eebd 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -5,8 +5,7 @@ import weakref from collections import defaultdict from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Type, TypeVar) +from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar import torch import torch.nn as nn @@ -54,11 +53,11 @@ class ModelInputForXPU(ModelRunnerInputBase): attn_metadata: Optional["AttentionMetadata"] = None multi_modal_kwargs: Optional[BatchedTensorInputs] = None virtual_engine: Optional[int] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None + seq_lens: Optional[list[int]] = None + query_lens: Optional[list[int]] = None async_callback: Optional[Callable] = None - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -69,8 +68,8 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( - cls: Type[TModelInputForXPU], - tensor_dict: Dict[str, Any], + cls: type[TModelInputForXPU], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> TModelInputForXPU: if attn_backend is not None: @@ -86,7 +85,7 @@ class ModelInputForXPUWithSamplingMetadata(ModelInputForXPU): """ sampling_metadata: Optional["SamplingMetadata"] = None - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -99,7 +98,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: Dict[str, Any], + tensor_dict: dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "ModelInputForXPUWithSamplingMetadata": tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) @@ -113,7 +112,7 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): def __init__(self, runner: "XPUModelRunner", - finished_requests_ids: Optional[List[str]] = None) -> None: + finished_requests_ids: Optional[list[str]] = None) -> None: super().__init__() self.runner = runner self.model_input_cls = self.runner._model_input_cls @@ -123,8 +122,8 @@ def __init__(self, self.device = self.runner.device def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] + finished_requests_ids: Optional[list[str]] = None) -> None: + self.seq_group_metadata_list: list[SequenceGroupMetadata] = [] def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) @@ -154,16 +153,16 @@ def build(self) -> ModelInputForXPU: def _prepare_prompt( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], + seq_group_metadata_list: list[SequenceGroupMetadata], + ) -> tuple[torch.Tensor, torch.Tensor, AttentionMetadata, list[int], BatchedTensorInputs]: assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - slot_mapping: List[int] = [] - 
seq_lens: List[int] = [] - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - multi_modal_placeholder_maps: Dict[ + input_tokens: list[int] = [] + input_positions: list[int] = [] + slot_mapping: list[int] = [] + seq_lens: list[int] = [] + multi_modal_kwargs_list: list[MultiModalKwargs] = [] + multi_modal_placeholder_maps: dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -282,14 +281,14 @@ def _prepare_prompt( def _prepare_decode( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: + seq_group_metadata_list: list[SequenceGroupMetadata], + ) -> tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - slot_mapping: List[int] = [] - seq_lens: List[int] = [] - block_tables: List[List[int]] = [] + input_tokens: list[int] = [] + input_positions: list[int] = [] + slot_mapping: list[int] = [] + seq_lens: list[int] = [] + block_tables: list[list[int]] = [] for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt @@ -367,9 +366,9 @@ def _prepare_decode( class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): - _model_input_cls: Type[ModelInputForXPUWithSamplingMetadata] = ( + _model_input_cls: type[ModelInputForXPUWithSamplingMetadata] = ( ModelInputForXPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForXPUBuilder] = ModelInputForXPUBuilder + _builder_cls: type[ModelInputForXPUBuilder] = ModelInputForXPUBuilder def __init__( self, @@ -441,7 +440,7 @@ def profile_run(self) -> None: # Profile memory usage with max_num_sequences sequences and the total # number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] + seqs: list[SequenceGroupMetadata] = [] # Additional GPU memory may be needed for multi-modal encoding, which # needs to be accounted for when calculating the GPU blocks for # vLLM blocker manager. @@ -499,7 +498,7 @@ def profile_run(self) -> None: def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: Dict[str, + tensor_dict: dict[str, Any]) -> ModelInputForXPUWithSamplingMetadata: return ( ModelInputForXPUWithSamplingMetadata.from_broadcasted_tensor_dict( @@ -509,8 +508,8 @@ def make_model_input_from_broadcasted_tensor_dict( def _prepare_model_input_tensors( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - finished_requests_ids: Optional[List[str]] = None + seq_group_metadata_list: list[SequenceGroupMetadata], + finished_requests_ids: Optional[list[str]] = None ) -> ModelInputForXPUWithSamplingMetadata: """Helper method to prepare the model input based on a given sequence group. Prepares metadata needed for the base model forward pass but not @@ -526,9 +525,9 @@ def _prepare_model_input_tensors( def prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None + finished_requests_ids: Optional[list[str]] = None ) -> ModelInputForXPUWithSamplingMetadata: """Prepare the model input based on a given sequence group, including metadata for the sampling step. 
@@ -555,10 +554,10 @@ def prepare_model_input( def execute_model( self, model_input: ModelInputForXPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: if num_steps > 1: raise ValueError( "XPUModelRunner does not support multi-step execution.") diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 3aea0d7419d02..af76419a7e3b2 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -2,7 +2,7 @@ """A XPU worker class.""" import gc import os -from typing import List, Optional, Tuple +from typing import Optional import intel_extension_for_pytorch # noqa: F401 import oneccl_bindings_for_pytorch # noqa: F401 @@ -64,8 +64,8 @@ def __init__( ) # Uninitialized cache engine. Will be initialized by # initialize_cache. - self.cache_engine: List[CacheEngine] - self.gpu_cache: Optional[List[List[torch.Tensor]]] + self.cache_engine: list[CacheEngine] + self.gpu_cache: Optional[list[list[torch.Tensor]]] def init_device(self) -> None: if self.device_config.device.type == "xpu" and current_platform.is_xpu( @@ -85,7 +85,7 @@ def init_device(self) -> None: # keep this method for `empty_cache` and `synchronize` api @torch.inference_mode() - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Profiles the peak memory usage of the model to determine how many KV blocks may be allocated without OOMs. From 836352cfea60bd53a7997f4face64c97b00ff755 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 27 Feb 2025 16:59:55 +0100 Subject: [PATCH 2/5] Make mypy happy Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/core/block/block_table.py | 4 ++-- vllm/core/block/common.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 4e7f6338d3a42..ea5e1393c23a1 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -257,7 +257,7 @@ def physical_block_ids(self) -> list[int]: list[int]: A list of physical block indices for the blocks in the BlockTable. """ - return self._blocks.ids() + return self._blocks.ids def get_unseen_token_ids(self, sequence_token_ids: list[int]) -> list[int]: """Get the number of "unseen" tokens in the sequence. @@ -340,7 +340,7 @@ def _is_allocated(self) -> bool: @property def blocks(self) -> list[Block]: - return self._blocks.list() + return self._blocks.blocks @property def _num_empty_slots(self) -> int: diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index 9e444ac1f7dd1..a08e6da736c7d 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -287,9 +287,11 @@ def reset(self): self._blocks = [] self._block_ids = [] - def list(self) -> list[Block]: + @property + def blocks(self) -> list[Block]: return self._blocks + @property def ids(self) -> list[int]: return self._block_ids From 9c2bbcac1d47c41bb39d6c7f94ef26c1028767a6 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 27 Feb 2025 18:40:01 +0100 Subject: [PATCH 3/5] Revert "Make mypy happy" This reverts commit 836352cfea60bd53a7997f4face64c97b00ff755. 
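For reference, a minimal sketch of the two accessor styles this revert toggles between (an assumed simplification of `BlockList` in `vllm/core/block/common.py`; the class and field names below other than `list()`, `ids()`, `blocks`, and `ids` are illustrative only, and the real class tracks more state than shown):

    # Minimal sketch, not the actual vLLM implementation.
    class Block:
        def __init__(self, block_id: int) -> None:
            self.block_id = block_id

    class MethodStyle:
        """Accessor form kept after this revert: callers use .list() and .ids()."""
        def __init__(self, blocks: list[Block]) -> None:
            self._blocks = blocks
            self._block_ids = [b.block_id for b in blocks]

        def list(self) -> list[Block]:
            return self._blocks

        def ids(self) -> list[int]:
            return self._block_ids

    class PropertyStyle:
        """Accessor form introduced in patch 2: callers use .blocks and .ids with no call."""
        def __init__(self, blocks: list[Block]) -> None:
            self._blocks = blocks
            self._block_ids = [b.block_id for b in blocks]

        @property
        def blocks(self) -> list[Block]:
            return self._blocks

        @property
        def ids(self) -> list[int]:
            return self._block_ids

Call sites such as `BlockTable.physical_block_ids` are what flip between `self._blocks.ids()` and `self._blocks.ids` in the patch 2 and patch 3 diffs.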
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/core/block/block_table.py | 4 ++-- vllm/core/block/common.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index ea5e1393c23a1..4e7f6338d3a42 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -257,7 +257,7 @@ def physical_block_ids(self) -> list[int]: list[int]: A list of physical block indices for the blocks in the BlockTable. """ - return self._blocks.ids + return self._blocks.ids() def get_unseen_token_ids(self, sequence_token_ids: list[int]) -> list[int]: """Get the number of "unseen" tokens in the sequence. @@ -340,7 +340,7 @@ def _is_allocated(self) -> bool: @property def blocks(self) -> list[Block]: - return self._blocks.blocks + return self._blocks.list() @property def _num_empty_slots(self) -> int: diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index a08e6da736c7d..9e444ac1f7dd1 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -287,11 +287,9 @@ def reset(self): self._blocks = [] self._block_ids = [] - @property - def blocks(self) -> list[Block]: + def list(self) -> list[Block]: return self._blocks - @property def ids(self) -> list[int]: return self._block_ids From 36577af1fcd79721c07e3e8f7fa051121bd8397b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 27 Feb 2025 18:43:36 +0100 Subject: [PATCH 4/5] Revert changes in `vllm/vllm` that aren't `v1` or `entrypoints` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- pyproject.toml | 26 +++ vllm/adapter_commons/layers.py | 5 +- vllm/adapter_commons/models.py | 10 +- vllm/adapter_commons/utils.py | 18 +- vllm/adapter_commons/worker_manager.py | 6 +- vllm/assets/video.py | 6 +- vllm/attention/backends/abstract.py | 31 ++-- vllm/attention/backends/blocksparse_attn.py | 26 +-- vllm/attention/backends/flash_attn.py | 40 ++--- vllm/attention/backends/flashinfer.py | 52 +++--- vllm/attention/backends/flashmla.py | 18 +- vllm/attention/backends/hpu_attn.py | 20 +-- vllm/attention/backends/ipex_attn.py | 30 ++-- vllm/attention/backends/mla/common.py | 45 ++--- vllm/attention/backends/openvino.py | 14 +- vllm/attention/backends/pallas.py | 20 +-- vllm/attention/backends/placeholder_attn.py | 26 +-- vllm/attention/backends/rocm_flash_attn.py | 30 ++-- vllm/attention/backends/torch_sdpa.py | 46 ++--- vllm/attention/backends/triton_mla.py | 8 +- vllm/attention/backends/utils.py | 42 ++--- vllm/attention/backends/xformers.py | 38 ++-- vllm/attention/layer.py | 6 +- vllm/attention/ops/flashmla.py | 12 +- vllm/attention/ops/hpu_paged_attn.py | 14 +- vllm/attention/ops/ipex_attn.py | 14 +- vllm/attention/ops/paged_attn.py | 10 +- vllm/attention/selector.py | 7 +- vllm/compilation/backends.py | 33 ++-- vllm/compilation/compiler_interface.py | 28 +-- vllm/compilation/decorators.py | 8 +- vllm/compilation/fix_functionalization.py | 15 +- vllm/compilation/fusion.py | 10 +- vllm/compilation/fx_utils.py | 3 +- vllm/compilation/multi_output_match.py | 8 +- vllm/compilation/pass_manager.py | 6 +- vllm/compilation/wrapper.py | 4 +- vllm/core/block/block_table.py | 48 +++--- vllm/core/block/common.py | 37 ++-- vllm/core/block/cpu_gpu_block_allocator.py | 60 +++---- vllm/core/block/interfaces.py | 64 +++---- vllm/core/block/naive_block.py | 57 +++--- vllm/core/block/prefix_caching_block.py | 78 ++++----- vllm/core/block_manager.py | 27 +-- 
vllm/core/evictor.py | 9 +- vllm/core/interfaces.py | 14 +- vllm/core/placeholder_block_space_manager.py | 12 +- vllm/core/scheduler.py | 154 ++++++++--------- vllm/device_allocator/cumem.py | 10 +- vllm/distributed/communication_op.py | 4 +- .../device_communicators/cuda_wrapper.py | 8 +- .../device_communicators/custom_all_reduce.py | 8 +- .../custom_all_reduce_utils.py | 9 +- .../device_communicators/pynccl_wrapper.py | 10 +- .../device_communicators/shm_broadcast.py | 8 +- .../kv_transfer/kv_connector/base.py | 14 +- .../kv_transfer/kv_connector/factory.py | 6 +- .../kv_connector/lmcache_connector.py | 8 +- .../kv_connector/simple_connector.py | 10 +- .../kv_transfer/kv_lookup_buffer/base.py | 6 +- .../kv_lookup_buffer/simple_buffer.py | 14 +- .../kv_transfer/kv_pipe/pynccl_pipe.py | 6 +- .../kv_transfer/kv_transfer_agent.py | 8 +- vllm/distributed/parallel_state.py | 43 ++--- vllm/distributed/utils.py | 13 +- vllm/engine/arg_utils.py | 30 ++-- vllm/engine/async_llm_engine.py | 28 +-- vllm/engine/async_timeout.py | 8 +- vllm/engine/llm_engine.py | 122 ++++++------- vllm/engine/metrics.py | 49 +++--- vllm/engine/metrics_types.py | 42 ++--- vllm/engine/multiprocessing/__init__.py | 5 +- vllm/engine/multiprocessing/client.py | 10 +- vllm/engine/multiprocessing/engine.py | 5 +- vllm/engine/output_processor/interfaces.py | 8 +- vllm/engine/output_processor/multi_step.py | 12 +- vllm/engine/output_processor/single_step.py | 8 +- vllm/engine/output_processor/stop_checker.py | 6 +- vllm/engine/output_processor/util.py | 9 +- vllm/engine/protocol.py | 5 +- vllm/executor/executor_base.py | 32 ++-- vllm/executor/mp_distributed_executor.py | 16 +- vllm/executor/msgspec_utils.py | 4 +- vllm/executor/multiproc_worker_utils.py | 7 +- vllm/executor/ray_distributed_executor.py | 30 ++-- vllm/executor/ray_utils.py | 18 +- vllm/executor/uniproc_executor.py | 8 +- vllm/inputs/data.py | 54 +++--- vllm/inputs/parse.py | 17 +- vllm/inputs/preprocess.py | 23 ++- vllm/inputs/registry.py | 5 +- vllm/lora/fully_sharded_layers.py | 22 +-- vllm/lora/layers.py | 66 +++---- vllm/lora/lora.py | 14 +- vllm/lora/models.py | 56 +++--- vllm/lora/ops/triton_ops/sgmv_expand.py | 8 +- vllm/lora/ops/triton_ops/sgmv_shrink.py | 8 +- vllm/lora/ops/triton_ops/utils.py | 11 +- vllm/lora/peft_helper.py | 4 +- vllm/lora/punica_wrapper/punica_base.py | 82 ++++----- vllm/lora/punica_wrapper/punica_cpu.py | 46 ++--- vllm/lora/punica_wrapper/punica_gpu.py | 48 +++--- vllm/lora/punica_wrapper/punica_hpu.py | 30 ++-- vllm/lora/punica_wrapper/utils.py | 20 +-- vllm/lora/utils.py | 18 +- vllm/lora/worker_manager.py | 22 +-- vllm/model_executor/custom_op.py | 4 +- .../guided_decoding/guided_fields.py | 10 +- .../guided_decoding/outlines_decoding.py | 4 +- .../outlines_logits_processors.py | 16 +- .../guided_decoding/xgrammar_decoding.py | 4 +- .../layers/fused_moe/__init__.py | 6 +- .../layers/fused_moe/fused_moe.py | 38 ++-- vllm/model_executor/layers/fused_moe/layer.py | 4 +- vllm/model_executor/layers/layernorm.py | 16 +- .../layers/mamba/mamba_mixer2.py | 6 +- vllm/model_executor/layers/pooler.py | 8 +- .../layers/quantization/__init__.py | 8 +- .../layers/quantization/aqlm.py | 14 +- .../model_executor/layers/quantization/awq.py | 14 +- .../layers/quantization/awq_marlin.py | 16 +- .../layers/quantization/base_config.py | 16 +- .../layers/quantization/bitsandbytes.py | 14 +- .../compressed_tensors/compressed_tensors.py | 38 ++-- .../compressed_tensors_moe.py | 6 +- .../schemes/compressed_tensors_24.py | 12 +- 
.../schemes/compressed_tensors_w4a16_24.py | 4 +- .../schemes/compressed_tensors_w8a16_fp8.py | 4 +- .../schemes/compressed_tensors_w8a8_fp8.py | 4 +- .../schemes/compressed_tensors_w8a8_int8.py | 6 +- .../schemes/compressed_tensors_wNa16.py | 6 +- .../compressed_tensors/triton_scaled_mm.py | 4 +- .../quantization/compressed_tensors/utils.py | 11 +- .../layers/quantization/deepspeedfp.py | 10 +- .../layers/quantization/experts_int8.py | 8 +- .../layers/quantization/fbgemm_fp8.py | 12 +- .../model_executor/layers/quantization/fp8.py | 14 +- .../layers/quantization/gguf.py | 12 +- .../layers/quantization/gptq.py | 14 +- .../layers/quantization/gptq_marlin.py | 20 +-- .../layers/quantization/gptq_marlin_24.py | 10 +- .../layers/quantization/hqq_marlin.py | 12 +- .../layers/quantization/ipex_quant.py | 10 +- .../kernels/mixed_precision/MPLinearKernel.py | 10 +- .../kernels/mixed_precision/__init__.py | 8 +- .../kernels/mixed_precision/exllama.py | 4 +- .../kernels/mixed_precision/machete.py | 4 +- .../kernels/mixed_precision/marlin.py | 4 +- .../kernels/scaled_mm/ScaledMMLinearKernel.py | 6 +- .../kernels/scaled_mm/__init__.py | 8 +- .../quantization/kernels/scaled_mm/cutlass.py | 4 +- .../quantization/kernels/scaled_mm/triton.py | 4 +- .../quantization/kernels/scaled_mm/xla.py | 4 +- .../layers/quantization/marlin.py | 10 +- .../layers/quantization/modelopt.py | 10 +- .../layers/quantization/moe_wna16.py | 16 +- .../layers/quantization/neuron_quant.py | 8 +- .../layers/quantization/ptpc_fp8.py | 6 +- .../model_executor/layers/quantization/qqq.py | 10 +- .../layers/quantization/quark/quark.py | 48 +++--- .../layers/quantization/quark/quark_moe.py | 4 +- .../quark/schemes/quark_w8a8_fp8.py | 4 +- .../quark/schemes/quark_w8a8_int8.py | 6 +- .../layers/quantization/quark/utils.py | 5 +- .../layers/quantization/schema.py | 4 +- .../layers/quantization/tpu_int8.py | 12 +- .../layers/quantization/utils/fp8_utils.py | 22 +-- .../layers/quantization/utils/gptq_utils.py | 4 +- .../quantization/utils/machete_utils.py | 8 +- .../layers/quantization/utils/marlin_utils.py | 12 +- .../quantization/utils/marlin_utils_test.py | 6 +- .../utils/marlin_utils_test_24.py | 9 +- .../utils/marlin_utils_test_qqq.py | 10 +- .../layers/quantization/utils/quant_utils.py | 17 +- .../layers/quantization/utils/w8a8_utils.py | 8 +- .../layers/rejection_sampler.py | 18 +- vllm/model_executor/layers/resampler.py | 10 +- .../model_executor/layers/rotary_embedding.py | 68 ++++---- vllm/model_executor/layers/sampler.py | 65 ++++--- .../layers/spec_decode_base_sampler.py | 4 +- vllm/model_executor/layers/utils.py | 3 +- .../layers/vocab_parallel_embedding.py | 15 +- vllm/model_executor/model_loader/loader.py | 54 +++--- vllm/model_executor/model_loader/neuron.py | 6 +- vllm/model_executor/model_loader/openvino.py | 4 +- .../model_executor/model_loader/tensorizer.py | 7 +- vllm/model_executor/model_loader/utils.py | 12 +- .../model_loader/weight_utils.py | 41 +++-- vllm/model_executor/models/arctic.py | 13 +- vllm/model_executor/models/aria.py | 22 +-- vllm/model_executor/models/baichuan.py | 11 +- vllm/model_executor/models/bamba.py | 13 +- vllm/model_executor/models/bart.py | 7 +- vllm/model_executor/models/bert.py | 13 +- vllm/model_executor/models/blip.py | 9 +- vllm/model_executor/models/blip2.py | 10 +- vllm/model_executor/models/bloom.py | 9 +- vllm/model_executor/models/chameleon.py | 22 +-- vllm/model_executor/models/chatglm.py | 11 +- vllm/model_executor/models/clip.py | 9 +- vllm/model_executor/models/commandr.py | 11 
+- vllm/model_executor/models/dbrx.py | 9 +- vllm/model_executor/models/decilm.py | 8 +- vllm/model_executor/models/deepseek.py | 11 +- vllm/model_executor/models/deepseek_mtp.py | 9 +- vllm/model_executor/models/deepseek_v2.py | 13 +- vllm/model_executor/models/deepseek_vl2.py | 20 +-- vllm/model_executor/models/eagle.py | 5 +- vllm/model_executor/models/exaone.py | 15 +- vllm/model_executor/models/fairseq2_llama.py | 8 +- vllm/model_executor/models/falcon.py | 9 +- vllm/model_executor/models/florence2.py | 27 ++- vllm/model_executor/models/fuyu.py | 10 +- vllm/model_executor/models/gemma.py | 11 +- vllm/model_executor/models/gemma2.py | 15 +- vllm/model_executor/models/glm4v.py | 3 +- vllm/model_executor/models/gpt2.py | 9 +- vllm/model_executor/models/gpt_bigcode.py | 9 +- vllm/model_executor/models/gpt_j.py | 9 +- vllm/model_executor/models/gpt_neox.py | 9 +- vllm/model_executor/models/granite.py | 13 +- vllm/model_executor/models/granitemoe.py | 7 +- vllm/model_executor/models/grok1.py | 15 +- vllm/model_executor/models/h2ovl.py | 3 +- .../models/idefics2_vision_model.py | 9 +- vllm/model_executor/models/idefics3.py | 14 +- vllm/model_executor/models/interfaces.py | 76 ++++---- vllm/model_executor/models/interfaces_base.py | 26 +-- vllm/model_executor/models/intern_vit.py | 9 +- vllm/model_executor/models/internlm2.py | 19 +- vllm/model_executor/models/internlm2_ve.py | 4 +- vllm/model_executor/models/internvl.py | 10 +- vllm/model_executor/models/jais.py | 9 +- vllm/model_executor/models/jamba.py | 15 +- vllm/model_executor/models/llama.py | 21 ++- vllm/model_executor/models/llava.py | 11 +- vllm/model_executor/models/llava_next.py | 17 +- .../model_executor/models/llava_next_video.py | 16 +- vllm/model_executor/models/llava_onevision.py | 26 +-- vllm/model_executor/models/mamba.py | 13 +- vllm/model_executor/models/mamba2.py | 13 +- vllm/model_executor/models/mamba_cache.py | 13 +- vllm/model_executor/models/medusa.py | 25 ++- vllm/model_executor/models/minicpm.py | 17 +- vllm/model_executor/models/minicpm3.py | 4 +- vllm/model_executor/models/minicpmo.py | 24 +-- vllm/model_executor/models/minicpmv.py | 54 +++--- vllm/model_executor/models/mixtral.py | 9 +- vllm/model_executor/models/mixtral_quant.py | 9 +- vllm/model_executor/models/mllama.py | 72 ++++---- vllm/model_executor/models/mlp_speculator.py | 10 +- vllm/model_executor/models/module_mapping.py | 18 +- vllm/model_executor/models/molmo.py | 46 ++--- vllm/model_executor/models/mpt.py | 9 +- vllm/model_executor/models/nemotron.py | 15 +- vllm/model_executor/models/nvlm_d.py | 3 +- vllm/model_executor/models/olmo.py | 11 +- vllm/model_executor/models/olmo2.py | 7 +- vllm/model_executor/models/olmoe.py | 11 +- vllm/model_executor/models/opt.py | 9 +- vllm/model_executor/models/orion.py | 13 +- vllm/model_executor/models/paligemma.py | 8 +- vllm/model_executor/models/persimmon.py | 9 +- vllm/model_executor/models/phi.py | 9 +- vllm/model_executor/models/phi3_small.py | 13 +- vllm/model_executor/models/phi3v.py | 14 +- vllm/model_executor/models/phimoe.py | 9 +- vllm/model_executor/models/pixtral.py | 31 ++-- .../models/prithvi_geospatial_mae.py | 9 +- vllm/model_executor/models/qwen.py | 13 +- vllm/model_executor/models/qwen2.py | 19 +- vllm/model_executor/models/qwen2_5_vl.py | 20 +-- vllm/model_executor/models/qwen2_audio.py | 8 +- vllm/model_executor/models/qwen2_moe.py | 11 +- vllm/model_executor/models/qwen2_rm.py | 7 +- vllm/model_executor/models/qwen2_vl.py | 22 +-- vllm/model_executor/models/qwen_vl.py | 8 +- 
vllm/model_executor/models/registry.py | 58 +++---- vllm/model_executor/models/roberta.py | 11 +- vllm/model_executor/models/siglip.py | 11 +- vllm/model_executor/models/solar.py | 13 +- vllm/model_executor/models/stablelm.py | 11 +- vllm/model_executor/models/starcoder2.py | 9 +- vllm/model_executor/models/telechat2.py | 12 +- vllm/model_executor/models/transformers.py | 3 +- vllm/model_executor/models/ultravox.py | 8 +- vllm/model_executor/models/utils.py | 48 +++--- vllm/model_executor/models/whisper.py | 24 +-- vllm/model_executor/pooling_metadata.py | 8 +- vllm/model_executor/sampling_metadata.py | 88 +++++----- vllm/model_executor/utils.py | 4 +- vllm/multimodal/base.py | 7 +- vllm/multimodal/hasher.py | 3 +- vllm/multimodal/image.py | 4 +- vllm/multimodal/registry.py | 14 +- vllm/multimodal/video.py | 4 +- vllm/platforms/cuda.py | 11 +- vllm/platforms/interface.py | 4 +- vllm/platforms/rocm.py | 6 +- vllm/plugins/__init__.py | 4 +- vllm/profiler/layerwise_profile.py | 38 ++-- vllm/profiler/utils.py | 8 +- vllm/prompt_adapter/models.py | 16 +- vllm/prompt_adapter/worker_manager.py | 16 +- vllm/spec_decode/batch_expansion.py | 43 +++-- vllm/spec_decode/draft_model_runner.py | 8 +- vllm/spec_decode/interfaces.py | 6 +- vllm/spec_decode/medusa_worker.py | 16 +- vllm/spec_decode/mlp_speculator_worker.py | 16 +- vllm/spec_decode/multi_step_worker.py | 39 +++-- vllm/spec_decode/ngram_worker.py | 14 +- vllm/spec_decode/proposer_worker_base.py | 10 +- .../spec_decode/smaller_tp_proposer_worker.py | 16 +- vllm/spec_decode/spec_decode_worker.py | 62 +++---- vllm/spec_decode/target_model_runner.py | 6 +- vllm/spec_decode/top1_proposer.py | 28 +-- vllm/spec_decode/util.py | 45 +++-- vllm/transformers_utils/config.py | 10 +- vllm/transformers_utils/configs/arctic.py | 6 +- vllm/transformers_utils/configs/cohere2.py | 6 +- .../configs/deepseek_vl2.py | 5 +- vllm/transformers_utils/configs/exaone.py | 4 +- vllm/transformers_utils/configs/jais.py | 4 +- .../configs/mlp_speculator.py | 6 +- vllm/transformers_utils/configs/mpt.py | 18 +- vllm/transformers_utils/configs/olmo2.py | 2 +- vllm/transformers_utils/configs/solar.py | 2 +- vllm/transformers_utils/configs/ultravox.py | 10 +- vllm/transformers_utils/detokenizer.py | 6 +- vllm/transformers_utils/detokenizer_utils.py | 24 +-- .../processors/deepseek_vl2.py | 29 ++-- vllm/transformers_utils/tokenizer_base.py | 34 ++-- .../tokenizer_group/__init__.py | 4 +- .../tokenizer_group/base_tokenizer_group.py | 6 +- .../tokenizer_group/ray_tokenizer_group.py | 6 +- .../tokenizer_group/tokenizer_group.py | 8 +- vllm/transformers_utils/tokenizers/mistral.py | 58 +++---- vllm/transformers_utils/utils.py | 4 +- vllm/usage/usage_lib.py | 14 +- vllm/worker/cache_engine.py | 5 +- vllm/worker/cpu_enc_dec_model_runner.py | 30 ++-- vllm/worker/cpu_model_runner.py | 77 +++++---- vllm/worker/cpu_pooling_model_runner.py | 24 +-- vllm/worker/cpu_worker.py | 26 +-- vllm/worker/enc_dec_model_runner.py | 32 ++-- vllm/worker/hpu_model_runner.py | 162 +++++++++--------- vllm/worker/hpu_worker.py | 22 +-- vllm/worker/model_runner.py | 125 +++++++------- vllm/worker/model_runner_base.py | 39 +++-- vllm/worker/multi_step_model_runner.py | 37 ++-- vllm/worker/multi_step_tpu_worker.py | 6 +- vllm/worker/multi_step_worker.py | 8 +- vllm/worker/neuron_model_runner.py | 44 ++--- vllm/worker/neuron_worker.py | 8 +- vllm/worker/openvino_model_runner.py | 38 ++-- vllm/worker/openvino_worker.py | 36 ++-- vllm/worker/pooling_model_runner.py | 24 +-- vllm/worker/tpu_model_runner.py | 65 
+++---- vllm/worker/tpu_worker.py | 14 +- vllm/worker/worker.py | 26 +-- vllm/worker/worker_base.py | 50 +++--- vllm/worker/xpu_model_runner.py | 73 ++++---- vllm/worker/xpu_worker.py | 8 +- 363 files changed, 3395 insertions(+), 3419 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2f9c3a0f12e2c..04e0c9e67eb2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,32 @@ exclude = [ [tool.ruff.lint.per-file-ignores] "vllm/version.py" = ["F401"] "vllm/_version.py" = ["ALL"] +# Python 3.8 typing. TODO: Remove these excludes after v1.0.0 +"vllm/adapter_commons/**/*.py" = ["UP006", "UP035"] +"vllm/attention/**/*.py" = ["UP006", "UP035"] +"vllm/compilation/**/*.py" = ["UP006", "UP035"] +"vllm/core/**/*.py" = ["UP006", "UP035"] +"vllm/device_allocator/**/*.py" = ["UP006", "UP035"] +"vllm/distributed/**/*.py" = ["UP006", "UP035"] +"vllm/engine/**/*.py" = ["UP006", "UP035"] +"vllm/executor/**/*.py" = ["UP006", "UP035"] +"vllm/inputs/**/*.py" = ["UP006", "UP035"] +"vllm/logging_utils/**/*.py" = ["UP006", "UP035"] +"vllm/lora/**/*.py" = ["UP006", "UP035"] +"vllm/model_executor/**/*.py" = ["UP006", "UP035"] +"vllm/multimodal/**/*.py" = ["UP006", "UP035"] +"vllm/platforms/**/*.py" = ["UP006", "UP035"] +"vllm/plugins/**/*.py" = ["UP006", "UP035"] +"vllm/profiler/**/*.py" = ["UP006", "UP035"] +"vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"] +"vllm/spec_decode/**/*.py" = ["UP006", "UP035"] +"vllm/third_party/**/*.py" = ["UP006", "UP035"] +"vllm/transformers_utils/**/*.py" = ["UP006", "UP035"] +"vllm/triton_utils/**/*.py" = ["UP006", "UP035"] +"vllm/usage/**/*.py" = ["UP006", "UP035"] +"vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"] +"vllm/assets/**/*.py" = ["UP006", "UP035"] +"vllm/worker/**/*.py" = ["UP006", "UP035"] [tool.ruff.lint] select = [ diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py index 9cc2b181fc7cc..18e0c5227d45c 100644 --- a/vllm/adapter_commons/layers.py +++ b/vllm/adapter_commons/layers.py @@ -1,14 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass +from typing import Tuple @dataclass class AdapterMapping: # Per every token in input_ids: - index_mapping: tuple[int, ...] + index_mapping: Tuple[int, ...] # Per sampled token: - prompt_mapping: tuple[int, ...] + prompt_mapping: Tuple[int, ...] def __post_init__(self): self.index_mapping = tuple(self.index_mapping) diff --git a/vllm/adapter_commons/models.py b/vllm/adapter_commons/models.py index 5d2663d56d0a1..f9a5d2fffad5e 100644 --- a/vllm/adapter_commons/models.py +++ b/vllm/adapter_commons/models.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Any, Callable, Optional, TypeVar +from typing import Any, Callable, Dict, Optional, TypeVar from torch import nn @@ -49,9 +49,9 @@ def __init__( model: the model to be adapted. """ self.model: nn.Module = model - self._registered_adapters: dict[int, Any] = {} - # dict instead of a Set for compatibility with LRUCache. - self._active_adapters: dict[int, None] = {} + self._registered_adapters: Dict[int, Any] = {} + # Dict instead of a Set for compatibility with LRUCache. 
+ self._active_adapters: Dict[int, None] = {} self.adapter_type = 'Adapter' self._last_mapping = None @@ -97,7 +97,7 @@ def get_adapter(self, adapter_id: int) -> Optional[Any]: raise NotImplementedError @abstractmethod - def list_adapters(self) -> dict[int, Any]: + def list_adapters(self) -> Dict[int, Any]: raise NotImplementedError @abstractmethod diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py index 46e9629e1f55f..c2dc5433cc656 100644 --- a/vllm/adapter_commons/utils.py +++ b/vllm/adapter_commons/utils.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, Optional, Set ## model functions -def deactivate_adapter(adapter_id: int, active_adapters: dict[int, None], +def deactivate_adapter(adapter_id: int, active_adapters: Dict[int, None], deactivate_func: Callable) -> bool: if adapter_id in active_adapters: deactivate_func(adapter_id) @@ -13,7 +13,7 @@ def deactivate_adapter(adapter_id: int, active_adapters: dict[int, None], return False -def add_adapter(adapter: Any, registered_adapters: dict[int, Any], +def add_adapter(adapter: Any, registered_adapters: Dict[int, Any], capacity: int, add_func: Callable) -> bool: if adapter.id not in registered_adapters: if len(registered_adapters) >= capacity: @@ -32,23 +32,23 @@ def set_adapter_mapping(mapping: Any, last_mapping: Any, return last_mapping -def remove_adapter(adapter_id: int, registered_adapters: dict[int, Any], +def remove_adapter(adapter_id: int, registered_adapters: Dict[int, Any], deactivate_func: Callable) -> bool: deactivate_func(adapter_id) return bool(registered_adapters.pop(adapter_id, None)) -def list_adapters(registered_adapters: dict[int, Any]) -> dict[int, Any]: +def list_adapters(registered_adapters: Dict[int, Any]) -> Dict[int, Any]: return dict(registered_adapters) def get_adapter(adapter_id: int, - registered_adapters: dict[int, Any]) -> Optional[Any]: + registered_adapters: Dict[int, Any]) -> Optional[Any]: return registered_adapters.get(adapter_id) ## worker functions -def set_active_adapters_worker(requests: set[Any], mapping: Optional[Any], +def set_active_adapters_worker(requests: Set[Any], mapping: Optional[Any], apply_adapters_func, set_adapter_mapping_func) -> None: apply_adapters_func(requests) @@ -66,7 +66,7 @@ def add_adapter_worker(adapter_request: Any, list_adapters_func, return loaded -def apply_adapters_worker(adapter_requests: set[Any], list_adapters_func, +def apply_adapters_worker(adapter_requests: Set[Any], list_adapters_func, adapter_slots: int, remove_adapter_func, add_adapter_func) -> None: models_that_exist = list_adapters_func() @@ -88,5 +88,5 @@ def apply_adapters_worker(adapter_requests: set[Any], list_adapters_func, add_adapter_func(models_map[adapter_id]) -def list_adapters_worker(adapter_manager_list_adapters_func) -> set[int]: +def list_adapters_worker(adapter_manager_list_adapters_func) -> Set[int]: return set(adapter_manager_list_adapters_func()) diff --git a/vllm/adapter_commons/worker_manager.py b/vllm/adapter_commons/worker_manager.py index 3c1d26404c990..ce24e08a5b56e 100644 --- a/vllm/adapter_commons/worker_manager.py +++ b/vllm/adapter_commons/worker_manager.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any, Optional, Set import torch @@ -17,7 +17,7 @@ def is_enabled(self) -> bool: raise NotImplementedError @abstractmethod - def set_active_adapters(self, requests: 
set[Any], + def set_active_adapters(self, requests: Set[Any], mapping: Optional[Any]) -> None: raise NotImplementedError @@ -34,5 +34,5 @@ def remove_all_adapters(self) -> None: raise NotImplementedError @abstractmethod - def list_adapters(self) -> set[int]: + def list_adapters(self) -> Set[int]: raise NotImplementedError diff --git a/vllm/assets/video.py b/vllm/assets/video.py index e45e1a65f8905..494cfc38381cf 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from functools import lru_cache -from typing import Literal +from typing import List, Literal import cv2 import numpy as np @@ -58,7 +58,7 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: def video_to_pil_images_list(path: str, - num_frames: int = -1) -> list[Image.Image]: + num_frames: int = -1) -> List[Image.Image]: frames = video_to_ndarrays(path, num_frames) return [ Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) @@ -72,7 +72,7 @@ class VideoAsset: num_frames: int = -1 @property - def pil_images(self) -> list[Image.Image]: + def pil_images(self) -> List[Image.Image]: video_path = download_video_asset(self.name) ret = video_to_pil_images_list(video_path, self.num_frames) return ret diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index d610dde0a8e61..5f0a540135402 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -3,7 +3,8 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, fields -from typing import TYPE_CHECKING, Any, Generic, Optional, Protocol, TypeVar +from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, + Protocol, Set, Tuple, Type, TypeVar) import torch @@ -44,17 +45,17 @@ def get_name() -> str: @staticmethod @abstractmethod - def get_impl_cls() -> type["AttentionImpl"]: + def get_impl_cls() -> Type["AttentionImpl"]: raise NotImplementedError @staticmethod @abstractmethod - def get_metadata_cls() -> type["AttentionMetadata"]: + def get_metadata_cls() -> Type["AttentionMetadata"]: raise NotImplementedError @staticmethod @abstractmethod - def get_state_cls() -> type["AttentionState"]: + def get_state_cls() -> Type["AttentionState"]: raise NotImplementedError @classmethod @@ -63,7 +64,7 @@ def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata": @staticmethod @abstractmethod - def get_builder_cls() -> type["AttentionMetadataBuilder"]: + def get_builder_cls() -> Type["AttentionMetadataBuilder"]: raise NotImplementedError @staticmethod @@ -73,7 +74,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: raise NotImplementedError @staticmethod @@ -88,7 +89,7 @@ def swap_blocks( @staticmethod @abstractmethod def copy_blocks( - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: raise NotImplementedError @@ -121,7 +122,7 @@ class AttentionMetadata: # N.B. These aren't really related to attention and don't belong on this # type -- this is just a temporary solution to make them available to # `model_executable`. - multi_modal_placeholder_index_maps: Optional[dict[ + multi_modal_placeholder_index_maps: Optional[Dict[ str, MultiModalPlaceholderMap.IndexMap]] # Enable/disable KV scales calculation. 
This is so that we can disable the @@ -143,8 +144,8 @@ def decode_metadata(self) -> Optional["AttentionMetadata"]: pass def asdict_zerocopy(self, - skip_fields: Optional[set[str]] = None - ) -> dict[str, Any]: + skip_fields: Optional[Set[str]] = None + ) -> Dict[str, Any]: """Similar to dataclasses.asdict, but avoids deepcopying.""" if skip_fields is None: skip_fields = set() @@ -190,14 +191,14 @@ def graph_capture_get_metadata_for_batch( def get_graph_input_buffers( self, attn_metadata: T, - is_encoder_decoder_model: bool = False) -> dict[str, Any]: + is_encoder_decoder_model: bool = False) -> Dict[str, Any]: """Get attention-specific input buffers for CUDA graph capture.""" ... @abstractmethod def prepare_graph_input_buffers( self, - input_buffers: dict[str, Any], + input_buffers: Dict[str, Any], attn_metadata: T, is_encoder_decoder_model: bool = False) -> None: """In-place modify input buffers dict for CUDA graph replay.""" @@ -223,7 +224,7 @@ def prepare(self) -> None: raise NotImplementedError @abstractmethod - def build(self, seq_lens: list[int], query_lens: list[int], + def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int) -> T: """Build attention metadata with on-device tensors.""" raise NotImplementedError @@ -256,10 +257,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: Optional[int] = None, - alibi_slopes: Optional[list[float]] = None, + alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, kv_cache_dtype: str = "auto", - blocksparse_params: Optional[dict[str, Any]] = None, + blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 5300d158ce1c3..9765e7881ad9d 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass, field -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Tuple, Type import torch @@ -58,7 +58,7 @@ class BlocksparseParams: head_sliding_step: int = field(init=False) # range of q heads to for a TP rank - active_head_range: tuple = field(init=False) + active_head_range: Tuple = field(init=False) def __post_init__(self): assert self.block_size > 0 @@ -95,19 +95,19 @@ def get_name() -> str: return "BLOCK_SPARSE_FLASH_ATTN" @staticmethod - def get_impl_cls() -> type["BlocksparseFlashAttentionImpl"]: + def get_impl_cls() -> Type["BlocksparseFlashAttentionImpl"]: return BlocksparseFlashAttentionImpl @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: + def get_metadata_cls() -> Type["AttentionMetadata"]: return BlocksparseFlashAttentionMetadata @staticmethod - def get_builder_cls() -> type["BlocksparseFlashAttentionMetadataBuilder"]: + def get_builder_cls() -> Type["BlocksparseFlashAttentionMetadataBuilder"]: return BlocksparseFlashAttentionMetadataBuilder @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: + def get_state_cls() -> Type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -116,7 +116,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return PagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size) @@ -124,14 +124,14 @@ def get_kv_cache_shape( def 
swap_blocks( src_kv_cache: torch.Tensor, dst_kv_cache: torch.Tensor, - src_to_dst: dict[int, int], + src_to_dst: Dict[int, int], ) -> None: PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], - src_to_dists: dict[int, list[int]], + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], ) -> None: PagedAttention.copy_blocks(kv_caches, src_to_dists) @@ -148,7 +148,7 @@ class BlocksparseFlashAttentionMetadata(AttentionMetadata): """ # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[list[int]] + seq_lens: Optional[List[int]] # seq_lens stored as a tensor. seq_lens_tensor: Optional[torch.Tensor] @@ -299,10 +299,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[list[float]], + alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, + blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index a18c1d190902e..5aca10079f9be 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -3,7 +3,7 @@ from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type import torch @@ -37,7 +37,7 @@ class FlashAttentionBackend(AttentionBackend): accept_output_buffer: bool = True @staticmethod - def get_supported_head_sizes() -> list[int]: + def get_supported_head_sizes() -> List[int]: return [32, 64, 96, 128, 160, 192, 224, 256] @staticmethod @@ -45,19 +45,19 @@ def get_name() -> str: return "FLASH_ATTN" @staticmethod - def get_impl_cls() -> type["FlashAttentionImpl"]: + def get_impl_cls() -> Type["FlashAttentionImpl"]: return FlashAttentionImpl @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: + def get_metadata_cls() -> Type["AttentionMetadata"]: return FlashAttentionMetadata @staticmethod - def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]: + def get_builder_cls() -> Type["FlashAttentionMetadataBuilder"]: return FlashAttentionMetadataBuilder @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: + def get_state_cls() -> Type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -66,7 +66,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: if block_size % 16 != 0: raise ValueError("Block size must be a multiple of 16.") return (2, num_blocks, block_size, num_kv_heads, head_size) @@ -86,7 +86,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] @@ -106,7 +106,7 @@ class FlashAttentionMetadata(AttentionMetadata): """ # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[list[int]] + seq_lens: Optional[List[int]] # seq_lens stored as a tensor. 
seq_lens_tensor: Optional[torch.Tensor] @@ -163,7 +163,7 @@ class FlashAttentionMetadata(AttentionMetadata): # Begin encoder attn & enc/dec cross-attn fields... # Encoder sequence lengths representation - encoder_seq_lens: Optional[list[int]] = None + encoder_seq_lens: Optional[List[int]] = None encoder_seq_lens_tensor: Optional[torch.Tensor] = None # (batch_size + 1,). The cumulative sequence lengths of the sequences in # the batch, used to index into sequence. E.g., if the sequence length is @@ -387,12 +387,12 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.block_size = input_builder.block_size def prepare(self): - self.slot_mapping: list[int] = [] - self.prefill_seq_lens: list[int] = [] - self.context_lens: list[int] = [] - self.block_tables: list[list[int]] = [] - self.curr_seq_lens: list[int] = [] - self.multimodal_placeholder_maps: dict[ + self.slot_mapping: List[int] = [] + self.prefill_seq_lens: List[int] = [] + self.context_lens: List[int] = [] + self.block_tables: List[List[int]] = [] + self.curr_seq_lens: List[int] = [] + self.multimodal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 @@ -462,7 +462,7 @@ def _add_seq_group( def _get_graph_runner_block_tables( self, num_seqs: int, - block_tables: list[list[int]]) -> torch.Tensor: + block_tables: List[List[int]]) -> torch.Tensor: # The shape of graph_block_tables is # [max batch size, max context len // block size]. max_batch_size, max_blocks = self.runner.graph_block_tables.shape @@ -484,7 +484,7 @@ def _get_graph_runner_block_tables( return torch.from_numpy(graph_block_tables).to( device=self.runner.device, non_blocking=True) - def build(self, seq_lens: list[int], query_lens: list[int], + def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): """Build attention metadata with on-device tensors. 
@@ -606,10 +606,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[list[float]], + alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, + blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 19fe810427a11..0556c191ddea6 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -4,7 +4,7 @@ from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type from vllm.multimodal import MultiModalPlaceholderMap @@ -53,19 +53,19 @@ def get_name() -> str: return "FLASHINFER" @staticmethod - def get_impl_cls() -> type["FlashInferImpl"]: + def get_impl_cls() -> Type["FlashInferImpl"]: return FlashInferImpl @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: + def get_metadata_cls() -> Type["AttentionMetadata"]: return FlashInferMetadata @staticmethod - def get_builder_cls() -> type["FlashInferMetadataBuilder"]: + def get_builder_cls() -> Type["FlashInferMetadataBuilder"]: return FlashInferMetadataBuilder @staticmethod - def get_state_cls() -> type["FlashInferState"]: + def get_state_cls() -> Type["FlashInferState"]: return FlashInferState @staticmethod @@ -74,7 +74,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return (num_blocks, 2, block_size, num_kv_heads, head_size) @staticmethod @@ -87,13 +87,13 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: PagedAttention.copy_blocks(kv_caches, src_to_dists) @staticmethod - def get_supported_head_sizes() -> list[int]: + def get_supported_head_sizes() -> List[int]: return [64, 128, 256] @staticmethod @@ -119,14 +119,14 @@ class PerLayerParameters: def get_per_layer_parameters( - vllm_config: VllmConfig) -> dict[str, PerLayerParameters]: + vllm_config: VllmConfig) -> Dict[str, PerLayerParameters]: """ Scan all attention layers and determine some hyperparameters to use during `plan`. 
""" layers = vllm_config.compilation_config.static_forward_context - per_layer_params: dict[str, PerLayerParameters] = {} + per_layer_params: Dict[str, PerLayerParameters] = {} for key, layer in layers.items(): assert isinstance(layer, Attention) @@ -147,7 +147,7 @@ def get_per_layer_parameters( def infer_global_hyperparameters( - per_layer_params: dict[str, PerLayerParameters]) -> PerLayerParameters: + per_layer_params: Dict[str, PerLayerParameters]) -> PerLayerParameters: """ Currently, FlashInfer backend only support models in which all layers share the same values for the following hyperparameters: @@ -514,8 +514,8 @@ def begin_forward(self): q_data_type=self.q_data_type) def asdict_zerocopy(self, - skip_fields: Optional[set[str]] = None - ) -> dict[str, Any]: + skip_fields: Optional[Set[str]] = None + ) -> Dict[str, Any]: if skip_fields is None: skip_fields = set() # We need to skip the prefill/decode_wrapper field since it cannot be @@ -613,12 +613,12 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.vllm_config = get_current_vllm_config() def prepare(self): - self.slot_mapping: list[int] = [] - self.prefill_seq_lens: list[int] = [] - self.context_lens: list[int] = [] - self.block_tables: list[list[int]] = [] - self.curr_seq_lens: list[int] = [] - self.multimodal_placeholder_maps: dict[ + self.slot_mapping: List[int] = [] + self.prefill_seq_lens: List[int] = [] + self.context_lens: List[int] = [] + self.block_tables: List[List[int]] = [] + self.curr_seq_lens: List[int] = [] + self.multimodal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 @@ -635,12 +635,12 @@ def prepare(self): # [0, 5, 8, 1, 6, 7, 3, 4] # paged_kv_indptr is used to index into paged_kv_indices: # [0, 3, 6, 8] - self.paged_kv_indices: list[int] = [] + self.paged_kv_indices: List[int] = [] # 0 at the beginning of paged_kv_indptr indicates the start of the # first request’s page indices in the paged_kv_indices list. - self.paged_kv_indptr: list[int] = [0] + self.paged_kv_indptr: List[int] = [0] # paged_kv_last_page_len is the length of the last page of each request - self.paged_kv_last_page_len: list[int] = [] + self.paged_kv_last_page_len: List[int] = [] self.total_blocks = 0 self.is_profile_run: bool = False @@ -725,7 +725,7 @@ def _add_seq_group( block_table = block_tables[seq_id] self._update_paged_kv_tensors(block_table, seq_len) - def _update_paged_kv_tensors(self, block_table: list[int], seq_len: int): + def _update_paged_kv_tensors(self, block_table: List[int], seq_len: int): # Get the number of valid blocks based on sequence length. # If seq_len = 16, block_size = 16, # block_table_bound is 1 with 1 valid block. @@ -744,7 +744,7 @@ def _update_paged_kv_tensors(self, block_table: list[int], seq_len: int): last_page_len = self.block_size self.paged_kv_last_page_len.append(last_page_len) - def build(self, seq_lens: list[int], query_lens: list[int], + def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): """Build attention metadata with on-device tensors. 
@@ -901,10 +901,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[list[float]], + alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, + blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py index d119c7993fcdb..273c69b63ec63 100644 --- a/vllm/attention/backends/flashmla.py +++ b/vllm/attention/backends/flashmla.py @@ -2,7 +2,7 @@ from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type import torch @@ -27,25 +27,25 @@ def get_name() -> str: return "FLASHMLA" @staticmethod - def get_impl_cls() -> type["FlashMLAImpl"]: + def get_impl_cls() -> Type["FlashMLAImpl"]: return FlashMLAImpl @staticmethod - def get_metadata_cls() -> type["FlashMLAMetadata"]: + def get_metadata_cls() -> Type["FlashMLAMetadata"]: return FlashMLAMetadata @staticmethod - def get_builder_cls() -> type["FlashMLAMetadataBuilder"]: + def get_builder_cls() -> Type["FlashMLAMetadataBuilder"]: return FlashMLAMetadataBuilder @staticmethod - def get_state_cls() -> type["FlashMLAState"]: + def get_state_cls() -> Type["FlashMLAState"]: return FlashMLAState @dataclass class FlashMLAMetadata(MLACommonMetadata): - decode_tile_scheduler_metadata: Optional[tuple[torch.Tensor, + decode_tile_scheduler_metadata: Optional[Tuple[torch.Tensor, torch.Tensor]] = None decode_num_splits: Optional[torch.Tensor] = None @@ -79,7 +79,7 @@ def __init__(self, *args, **kwargs): self.num_q_heads = self.runner.model_config.get_num_attention_heads( self.runner.parallel_config) - def build(self, seq_lens: list[int], query_lens: list[int], + def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): m = super().build(seq_lens, query_lens, cuda_graph_pad_size, batch_size) @@ -176,10 +176,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[list[float]], + alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]], + blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, # MLA Specific Arguments diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index e5048fbef6198..9eb533685dbd2 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -6,7 +6,7 @@ import os from dataclasses import dataclass -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Tuple, Type import torch import vllm_hpu_extension.ops as ops @@ -31,15 +31,15 @@ def get_name() -> str: return "HPU_ATTN" @staticmethod - def get_impl_cls() -> type["HPUAttentionImpl"]: + def get_impl_cls() -> Type["HPUAttentionImpl"]: return HPUAttentionImpl @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: + def get_metadata_cls() -> Type["AttentionMetadata"]: return HPUAttentionMetadata @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: + def get_state_cls() -> Type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -48,7 +48,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> 
tuple[int, ...]: + ) -> Tuple[int, ...]: return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size) @@ -56,14 +56,14 @@ def get_kv_cache_shape( def swap_blocks( src_kv_cache: torch.Tensor, dst_kv_cache: torch.Tensor, - src_to_dst: dict[int, int], + src_to_dst: Dict[int, int], ) -> None: HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], - src_to_dists: dict[int, list[int]], + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], ) -> None: HPUPagedAttention.copy_blocks(kv_caches, src_to_dists) @@ -101,10 +101,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[list[float]], + alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, + blocksparse_params: Optional[Dict[str, Any]] = None, max_seq_len: int = 4096, attn_type: str = AttentionType.DECODER, ) -> None: diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index b772089ad25d7..b4879af4cf20e 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -2,7 +2,7 @@ """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Tuple, Type import torch @@ -24,15 +24,15 @@ def get_name() -> str: return "IPEX" @staticmethod - def get_impl_cls() -> type["IpexAttnBackendImpl"]: + def get_impl_cls() -> Type["IpexAttnBackendImpl"]: return IpexAttnBackendImpl @staticmethod - def get_metadata_cls() -> type["IpexAttnMetadata"]: + def get_metadata_cls() -> Type["IpexAttnMetadata"]: return IpexAttnMetadata @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: + def get_state_cls() -> Type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -41,7 +41,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return PagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size) @@ -56,7 +56,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: from vllm._ipex_ops import ipex_ops as ops @@ -73,7 +73,7 @@ class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): # or all decoding. True if all sequences are prompts. is_prompt: bool slot_mapping: torch.Tensor - seq_lens: Optional[list[int]] + seq_lens: Optional[List[int]] seqlen_q: Optional[torch.Tensor] max_seqlen: Optional[int] @@ -83,7 +83,7 @@ def __post_init__(self): # when alibi slopes is used. It is because of the limitation # from xformer API. 
# will not appear in the __repr__ and __init__ - self.attn_bias: Optional[list[torch.Tensor]] = None + self.attn_bias: Optional[List[torch.Tensor]] = None @property def prefill_metadata(self) -> Optional["IpexAttnMetadata"]: @@ -112,10 +112,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[list[float]], + alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, + blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: @@ -160,7 +160,7 @@ def split_kv_cache( kv_cache: torch.Tensor, num_kv_heads: int, head_size: int, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: x = 1 num_blocks = kv_cache.shape[1] @@ -341,8 +341,8 @@ def forward( def _make_alibi_bias( alibi_slopes: torch.Tensor, dtype: torch.dtype, - seq_lens: list[int], -) -> list[torch.Tensor]: + seq_lens: List[int], +) -> List[torch.Tensor]: attn_biases = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) @@ -366,10 +366,10 @@ def _make_alibi_bias( def _make_sliding_window_bias( - seq_lens: list[int], + seq_lens: List[int], window_size: Optional[int], dtype: torch.dtype, -) -> list[torch.Tensor]: +) -> List[torch.Tensor]: attn_biases = [] for seq_len in seq_lens: tensor = torch.full( diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 2eae3aa865998..1befcb6b45dfa 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -198,7 +198,8 @@ from contextlib import contextmanager from dataclasses import dataclass from itertools import accumulate -from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar +from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple, + Type, TypeVar) import torch from compressed_tensors.quantization import QuantizationStrategy @@ -252,15 +253,15 @@ def get_name() -> str: return "TRITON_MLA" @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: + def get_metadata_cls() -> Type["AttentionMetadata"]: return MLACommonMetadata @staticmethod - def get_builder_cls() -> type["MLACommonMetadataBuilder"]: + def get_builder_cls() -> Type["MLACommonMetadataBuilder"]: return MLACommonMetadataBuilder @staticmethod - def get_state_cls() -> type["MLACommonState"]: + def get_state_cls() -> Type["MLACommonState"]: return MLACommonState @staticmethod @@ -269,7 +270,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, # assumed to be 1 for MLA head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return (num_blocks, block_size, head_size) @staticmethod @@ -282,13 +283,13 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: ops.copy_blocks_mla(kv_caches, src_to_dists) @staticmethod - def get_supported_head_sizes() -> list[int]: + def get_supported_head_sizes() -> List[int]: return [576] @@ -474,7 +475,7 @@ class MLACommonMetadata(AttentionMetadata): # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[list[int]] + seq_lens: Optional[List[int]] # seq_lens stored as a tensor. 
seq_lens_tensor: Optional[torch.Tensor] @@ -527,8 +528,8 @@ class MLACommonMetadata(AttentionMetadata): # For chunked prefill context_chunk_cu_seq_lens: Optional[torch.Tensor] = None context_chunk_starts: Optional[torch.Tensor] = None - context_chunk_seq_tot: Optional[list[int]] = None - context_chunk_max_seq_lens: Optional[list[int]] = None + context_chunk_seq_tot: Optional[List[int]] = None + context_chunk_max_seq_lens: Optional[List[int]] = None # Set by MLAAttentionState in `begin_forward` so it doesn't get broadcasted chunked_prefill_workspace: Optional[torch.Tensor] = None @@ -748,13 +749,13 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.page_size = self.runner.block_size def prepare(self): - self.slot_mapping: list[int] = [] - self.prefill_seq_lens: list[int] = [] - self.context_lens: list[int] = [] - self.block_tables: list[list[int]] = [] - self.curr_seq_lens: list[int] = [] - self.input_positions: list[int] = [] - self.multimodal_placeholder_maps: dict[ + self.slot_mapping: List[int] = [] + self.prefill_seq_lens: List[int] = [] + self.context_lens: List[int] = [] + self.block_tables: List[List[int]] = [] + self.curr_seq_lens: List[int] = [] + self.input_positions: List[int] = [] + self.multimodal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 @@ -819,7 +820,7 @@ def _add_seq_group( def _get_graph_runner_block_tables( self, num_seqs: int, - block_tables: list[list[int]]) -> torch.Tensor: + block_tables: List[List[int]]) -> torch.Tensor: # The shape of graph_block_tables is # [max batch size, max context len // block size]. max_batch_size, max_blocks = self.runner.graph_block_tables.shape @@ -841,7 +842,7 @@ def _get_graph_runner_block_tables( return torch.from_numpy(graph_block_tables).to( device=self.runner.device, non_blocking=True) - def build(self, seq_lens: list[int], query_lens: list[int], + def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): """Build attention metadata with on-device tensors. 
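A minimal sketch of the padding that `_get_graph_runner_block_tables` above performs: per-sequence block tables are copied into a fixed [max batch size, max blocks] buffer so that a CUDA-graph capture always sees the same shape. The helper name and the zero fill are assumptions of the sketch, not code from this patch:

    import numpy as np

    def pad_block_tables(block_tables, max_batch_size, max_blocks):
        """Left-align each block table into a fixed-shape int32 array."""
        graph_block_tables = np.zeros((max_batch_size, max_blocks), dtype=np.int32)
        for i, bt in enumerate(block_tables):
            graph_block_tables[i, :len(bt)] = bt[:max_blocks]
        return graph_block_tables

    # Two sequences padded into a 4 x 6 buffer; unused rows stay zero.
    print(pad_block_tables([[3, 7, 9], [1]], max_batch_size=4, max_blocks=6))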
@@ -1005,10 +1006,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[list[float]], + alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]], + blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, # MLA Specific Arguments @@ -1101,7 +1102,7 @@ def process_weights_after_loading(self, act_dtype: torch.dtype): # # returns input_group_shape, weight_group_shape def get_scale_group_shapes_for_fp8(layer: LinearBase) -> \ - tuple[tuple[int, int], tuple[int, int]]: + Tuple[Tuple[int, int], Tuple[int, int]]: if isinstance(layer.quant_method, Fp8LinearMethod): if layer.quant_method.block_quant: weight_block_size = \ diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py index 61fcb16b7c944..9908620a32a23 100644 --- a/vllm/attention/backends/openvino.py +++ b/vllm/attention/backends/openvino.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Optional +from typing import Dict, List, Optional, Tuple, Type import openvino as ov import torch @@ -54,7 +54,7 @@ def make_metadata(*args, **kwargs) -> "AttentionMetadata": raise NotImplementedError @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: + def get_state_cls() -> Type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -67,22 +67,22 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return (2, num_blocks, num_kv_heads, block_size, head_size) @staticmethod def swap_blocks( src_tensor: ov.Tensor, dst_tensor: ov.Tensor, - src_to_dists: list[tuple[int, int]], + src_to_dists: List[Tuple[int, int]], ) -> None: for src, dst in src_to_dists: copy_cache_block(src_tensor, dst_tensor, src, dst) @staticmethod def copy_blocks( - kv_caches: list[tuple[ov.Tensor, ov.Tensor]], - src_to_dists: list[tuple[int, int]], + kv_caches: List[Tuple[ov.Tensor, ov.Tensor]], + src_to_dists: List[Tuple[int, int]], ) -> None: for src, dst in src_to_dists: for key_cache, value_cache in kv_caches: @@ -138,7 +138,7 @@ class OpenVINOAttentionMetadata: # N.B. These aren't really related to attention and don't belong on this # type -- this is just a temporary solution to make them available to # `model_executable`. - multi_modal_placeholder_index_maps: Optional[dict[ + multi_modal_placeholder_index_maps: Optional[Dict[ str, MultiModalPlaceholderMap.IndexMap]] # Enable/disable KV scales calculation. This is so that we can disable the diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 66260fc92a9b1..b61dfe63ddcaa 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Tuple, Type import torch import torch_xla.experimental.custom_kernel # Required to register custom ops. 
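The various `copy_blocks` staticmethods above share one contract: given the per-layer caches and a set of source-to-destination block pairs (the exact container differs per backend), duplicate each source block into the destination slot of every cache. A toy in-place sketch, assuming a single [2, num_blocks, entries] tensor per layer rather than the layout of any particular backend in this patch:

    import torch

    def copy_cache_blocks(kv_caches, src_to_dists):
        """Duplicate block `src` into slot `dst` of every layer's cache in place."""
        for src, dst in src_to_dists:
            for kv_cache in kv_caches:
                kv_cache[:, dst].copy_(kv_cache[:, src])

    # One toy layer: keys at index 0, values at index 1, four blocks of three slots.
    cache = torch.arange(2 * 4 * 3, dtype=torch.float32).reshape(2, 4, 3)
    copy_cache_blocks([cache], [(0, 3)])
    assert torch.equal(cache[:, 3], cache[:, 0])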
@@ -19,15 +19,15 @@ def get_name() -> str: return "PALLAS" @staticmethod - def get_impl_cls() -> type["PallasAttentionBackendImpl"]: + def get_impl_cls() -> Type["PallasAttentionBackendImpl"]: return PallasAttentionBackendImpl @staticmethod - def get_metadata_cls() -> type["PallasMetadata"]: + def get_metadata_cls() -> Type["PallasMetadata"]: return PallasMetadata @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: + def get_state_cls() -> Type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -36,7 +36,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return (num_kv_heads, num_blocks, block_size, head_size) @staticmethod @@ -50,8 +50,8 @@ def swap_blocks( @torch.compile(backend="openxla") @staticmethod def copy_blocks( - kv_caches: list[tuple[torch.Tensor, torch.Tensor]], - src_to_dists: tuple[torch.Tensor, torch.Tensor], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + src_to_dists: Tuple[torch.Tensor, torch.Tensor], ) -> None: src_indices, dst_indices = src_to_dists for k_cache, v_cache in kv_caches: @@ -98,10 +98,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[list[float]], + alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, + blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: @@ -155,7 +155,7 @@ def forward( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: tuple[torch.Tensor, torch.Tensor], + kv_cache: Tuple[torch.Tensor, torch.Tensor], attn_metadata: PallasMetadata, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 84b725473af98..f1def25c89cff 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -3,7 +3,7 @@ from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type import torch @@ -30,19 +30,19 @@ def get_name() -> str: return "NO_ATTENTION" @staticmethod - def get_impl_cls() -> type["PlaceholderAttentionImpl"]: + def get_impl_cls() -> Type["PlaceholderAttentionImpl"]: return PlaceholderAttentionImpl @staticmethod - def get_builder_cls() -> type["PlaceholderAttentionMetadataBuilder"]: + def get_builder_cls() -> Type["PlaceholderAttentionMetadataBuilder"]: return PlaceholderAttentionMetadataBuilder @staticmethod - def get_metadata_cls() -> type["PlaceholderAttentionMetadata"]: + def get_metadata_cls() -> Type["PlaceholderAttentionMetadata"]: return PlaceholderAttentionMetadata @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: + def get_state_cls() -> Type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -51,7 +51,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return (1, 1, 1, 1, 1) @staticmethod @@ -64,7 +64,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: return @@ -75,7 +75,7 @@ class PlaceholderAttentionMetadata(AttentionMetadata): """Attention metadata 
for prefill and decode batched together.""" # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[list[int]] + seq_lens: Optional[List[int]] # seq_lens stored as a tensor. seq_lens_tensor: Optional[torch.Tensor] @@ -269,10 +269,10 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.runner = input_builder.runner def prepare(self): - self.prefill_seq_lens: list[int] = [] - self.context_lens: list[int] = [] - self.curr_seq_lens: list[int] = [] - self.multimodal_placeholder_maps: dict[ + self.prefill_seq_lens: List[int] = [] + self.context_lens: List[int] = [] + self.curr_seq_lens: List[int] = [] + self.multimodal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 @@ -309,7 +309,7 @@ def _add_seq_group( self.num_decode_tokens += query_len self.curr_seq_lens.append(curr_seq_len) - def build(self, seq_lens: list[int], query_lens: list[int], + def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): """Build attention metadata with on-device tensors. diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 2c908451d151d..3f40686ee2fda 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer ROCm GPUs.""" from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type import torch @@ -35,19 +35,19 @@ def get_name() -> str: return "ROCM_FLASH" @staticmethod - def get_impl_cls() -> type["ROCmFlashAttentionImpl"]: + def get_impl_cls() -> Type["ROCmFlashAttentionImpl"]: return ROCmFlashAttentionImpl @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: + def get_metadata_cls() -> Type["AttentionMetadata"]: return ROCmFlashAttentionMetadata @staticmethod - def get_builder_cls() -> type["ROCmFlashAttentionMetadataBuilder"]: + def get_builder_cls() -> Type["ROCmFlashAttentionMetadataBuilder"]: return ROCmFlashAttentionMetadataBuilder @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: + def get_state_cls() -> Type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -56,7 +56,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return PagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size) @@ -70,7 +70,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: PagedAttention.copy_blocks(kv_caches, src_to_dists) @@ -87,7 +87,7 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): """ # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[list[int]] + seq_lens: Optional[List[int]] # seq_lens stored as a tensor. seq_lens_tensor: Optional[torch.Tensor] # Maximum sequence length among prefill batch. 0 if there are decoding @@ -133,7 +133,7 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): # Begin encoder attn & enc/dec cross-attn fields... 
# Encoder sequence lengths representation - encoder_seq_lens: Optional[list[int]] = None + encoder_seq_lens: Optional[List[int]] = None encoder_seq_lens_tensor: Optional[torch.Tensor] = None # Maximum sequence length among encoder sequences @@ -301,8 +301,8 @@ class ROCmFlashAttentionMetadataBuilder( def _make_alibi_bias(alibi_slopes: torch.Tensor, dtype: torch.dtype, - seq_lens: Optional[list[int]], - make_attn_mask: bool = True) -> list[torch.Tensor]: + seq_lens: Optional[List[int]], + make_attn_mask: bool = True) -> List[torch.Tensor]: attn_biases = [] if seq_lens: for seq_len in seq_lens: @@ -453,10 +453,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[list[float]], + alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, + blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: @@ -846,12 +846,12 @@ def _sdpa_attention( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - seq_lens: list[int], + seq_lens: List[int], num_tokens: int, num_heads: int, head_size: int, scale: float, - attn_masks: Optional[list[torch.Tensor]] = None, + attn_masks: Optional[List[torch.Tensor]] = None, ) -> torch.Tensor: start = 0 output = torch.empty((num_tokens, num_heads, head_size), diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 8e249abfa13da..25fe6ed95c5df 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -2,7 +2,7 @@ """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Tuple, Type import torch from torch.nn.functional import scaled_dot_product_attention @@ -29,19 +29,19 @@ def get_name() -> str: return "TORCH_SDPA" @staticmethod - def get_impl_cls() -> type["TorchSDPABackendImpl"]: + def get_impl_cls() -> Type["TorchSDPABackendImpl"]: return TorchSDPABackendImpl @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: + def get_metadata_cls() -> Type["AttentionMetadata"]: return TorchSDPAMetadata @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: + def get_state_cls() -> Type["CommonAttentionState"]: return CommonAttentionState @staticmethod - def get_builder_cls() -> type["TorchSDPAMetadataBuilder"]: + def get_builder_cls() -> Type["TorchSDPAMetadataBuilder"]: return TorchSDPAMetadataBuilder @staticmethod @@ -50,7 +50,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return PagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size) @@ -64,7 +64,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: PagedAttention.copy_blocks(kv_caches, src_to_dists) @@ -77,7 +77,7 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): # Currently, input sequences can only contain all prompts # or all decoding. True if all sequences are prompts. 
chunked_prefill: bool - seq_lens: Optional[list[int]] = None # For non-chunked prefill + seq_lens: Optional[List[int]] = None # For non-chunked prefill # For chunked prefill only max_query_len: Optional[int] = None @@ -88,7 +88,7 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): # Begin encoder attn & enc/dec cross-attn fields... # Encoder sequence lengths representation - encoder_seq_lens: Optional[list[int]] = None + encoder_seq_lens: Optional[List[int]] = None encoder_seq_lens_tensor: Optional[torch.Tensor] = None # Maximum sequence length among encoder sequences @@ -108,9 +108,9 @@ def __post_init__(self): # when alibi slopes is used. It is because of the limitation # from xformer API. # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[list[torch.Tensor]] = None - self.encoder_attn_bias: Optional[list[torch.Tensor]] = None - self.cross_attn_bias: Optional[list[torch.Tensor]] = None + self.attn_bias: Optional[List[torch.Tensor]] = None + self.encoder_attn_bias: Optional[List[torch.Tensor]] = None + self.cross_attn_bias: Optional[List[torch.Tensor]] = None @property def is_all_encoder_attn_metadata_set(self): @@ -180,7 +180,7 @@ def get_seq_lens( def get_attn_bias( self, attn_type: str, - ) -> Optional[list[torch.Tensor]]: + ) -> Optional[List[torch.Tensor]]: ''' Extract appropriate attention bias from attention metadata according to attention type. @@ -207,7 +207,7 @@ def get_attn_bias( def set_attn_bias( self, - attn_bias: list[torch.Tensor], + attn_bias: List[torch.Tensor], attn_type: str, ) -> None: ''' @@ -288,7 +288,7 @@ def __init__(self, input_builder: ModelInputForCPUBuilder) -> None: def prepare(self): self.input_data = self.input_builder.input_data - def build(self, seq_lens: list[int], query_lens: list[int], + def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int) -> TorchSDPAMetadata: input_data = self.input_data prefill_seq_lens = seq_lens[0:input_data.num_prefills] @@ -394,10 +394,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[list[float]], + alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, + blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: @@ -636,9 +636,9 @@ def _run_sdpa_forward( def _make_alibi_bias( alibi_slopes: torch.Tensor, dtype: torch.dtype, - seq_lens: list[int], -) -> list[torch.Tensor]: - attn_biases: list[torch.Tensor] = [] + seq_lens: List[int], +) -> List[torch.Tensor]: + attn_biases: List[torch.Tensor] = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses @@ -660,11 +660,11 @@ def _make_alibi_bias( def _make_sliding_window_bias( - seq_lens: list[int], + seq_lens: List[int], window_size: Optional[int], dtype: torch.dtype, -) -> list[torch.Tensor]: - attn_biases: list[torch.Tensor] = [] +) -> List[torch.Tensor]: + attn_biases: List[torch.Tensor] = [] for seq_len in seq_lens: tensor = torch.full( (1, seq_len, seq_len), diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py index 048ecbeee5c0d..08e8226ab04c0 100644 --- a/vllm/attention/backends/triton_mla.py +++ b/vllm/attention/backends/triton_mla.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Type import torch @@ 
-18,7 +18,7 @@ def get_name() -> str: return "TRITON_MLA" @staticmethod - def get_impl_cls() -> type["TritonMLAImpl"]: + def get_impl_cls() -> Type["TritonMLAImpl"]: return TritonMLAImpl @@ -30,10 +30,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[list[float]], + alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]], + blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, # MLA Specific Arguments diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index c967f67873f4b..baf01c9263d4f 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -3,7 +3,7 @@ from collections import defaultdict from contextlib import contextmanager from itertools import accumulate -from typing import TYPE_CHECKING, Any, TypeVar, Union +from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, TypeVar, Union import numpy as np import torch @@ -37,7 +37,7 @@ from vllm.worker.model_runner import ModelInputForGPUBuilder -def is_block_tables_empty(block_tables: Union[None, dict]): +def is_block_tables_empty(block_tables: Union[None, Dict]): """ Check if block_tables is None or a dictionary with all None values. """ @@ -58,8 +58,8 @@ def compute_slot_mapping_start_idx(is_prompt: bool, query_len: int, return start_idx -def _compute_slot_mapping_python(slot_mapping: list[int], - block_table: list[int], range_start: int, +def _compute_slot_mapping_python(slot_mapping: List[int], + block_table: List[int], range_start: int, range_end: int, block_size: int): for i in range(range_start, range_end): block_number = block_table[i // block_size] @@ -68,8 +68,8 @@ def _compute_slot_mapping_python(slot_mapping: list[int], slot_mapping.append(slot) -def _compute_slot_mapping_numpy(slot_mapping: list[int], - block_table: list[int], range_start: int, +def _compute_slot_mapping_numpy(slot_mapping: List[int], + block_table: List[int], range_start: int, range_end: int, block_size: int): block_table_array = np.array(block_table) idx = np.arange(range_start, range_end) @@ -81,10 +81,10 @@ def _compute_slot_mapping_numpy(slot_mapping: list[int], slot_mapping.extend(seq_slot_mapping_array) -def compute_slot_mapping(is_profile_run: bool, slot_mapping: list[int], +def compute_slot_mapping(is_profile_run: bool, slot_mapping: List[int], seq_id: int, seq_len: int, context_len: int, start_idx: int, block_size: int, - block_tables: dict[int, list[int]]): + block_tables: Dict[int, List[int]]): """ Compute slot mapping. 
""" @@ -125,7 +125,7 @@ def compute_slot_mapping(is_profile_run: bool, slot_mapping: list[int], class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): - _metadata_cls: type[TAttentionMetadata] + _metadata_cls: Type[TAttentionMetadata] def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.input_builder = input_builder @@ -135,12 +135,12 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.block_size = input_builder.block_size def prepare(self): - self.slot_mapping: list[int] = [] - self.prefill_seq_lens: list[int] = [] - self.context_lens: list[int] = [] - self.block_tables: list[list[int]] = [] - self.curr_seq_lens: list[int] = [] - self.multimodal_placeholder_maps: dict[ + self.slot_mapping: List[int] = [] + self.prefill_seq_lens: List[int] = [] + self.context_lens: List[int] = [] + self.block_tables: List[List[int]] = [] + self.curr_seq_lens: List[int] = [] + self.multimodal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 @@ -202,7 +202,7 @@ def _add_seq_group( seq_len, context_len, start_idx, self.block_size, inter_data.block_tables) - def build(self, seq_lens: list[int], query_lens: list[int], + def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): """Build attention metadata with on-device tensors. @@ -357,7 +357,7 @@ def graph_capture_get_metadata_for_batch( def get_graph_input_buffers( self, attn_metadata, - is_encoder_decoder_model: bool = False) -> dict[str, Any]: + is_encoder_decoder_model: bool = False) -> Dict[str, Any]: input_buffers = { "slot_mapping": attn_metadata.slot_mapping, "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, @@ -426,7 +426,7 @@ def _update_captured_metadata_for_enc_dec_model(self, batch_size: int, attn_metadata.num_encoder_tokens = 0 def _add_additonal_input_buffers_for_enc_dec_model( - self, attn_metadata, input_buffers: dict[str, Any]): + self, attn_metadata, input_buffers: Dict[str, Any]): """ Saves additional input buffers specific to the encoder-decoder model from the attention metadata. @@ -445,7 +445,7 @@ def _add_additonal_input_buffers_for_enc_dec_model( attn_metadata.decode_metadata.cross_block_tables) def _prepare_input_buffers_for_enc_dec_model(self, attn_metadata, - input_buffers: dict[str, + input_buffers: Dict[str, Any]): """ Populates input buffers with data from the encoder-decoder model's @@ -543,7 +543,7 @@ def get_seq_len_block_table_args( def get_num_prefill_decode_query_kv_tokens( attn_metadata, attn_type: str, -) -> tuple[int, int, int]: +) -> Tuple[int, int, int]: """ Calculate the number of prefill and decode tokens for query, key/value based on the attention metadata and the specified attention type. @@ -552,7 +552,7 @@ def get_num_prefill_decode_query_kv_tokens( attn_metadata (FlashAttentionMetadata): Attention Metadata object. attn_type (AttentionType): The type of attention being used. Returns: - tuple[int, int, int]: A tuple containing three integers: + Tuple[int, int, int]: A tuple containing three integers: - The number of prefill query tokens. - The number of prefill key/value tokens. - The number of decode query tokens. 
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index d60b4a1de5af8..9fa76634e1fc9 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer with xFormers and PagedAttention.""" from dataclasses import dataclass -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Tuple, Type import torch from xformers import ops as xops @@ -31,19 +31,19 @@ def get_name() -> str: return "XFORMERS" @staticmethod - def get_impl_cls() -> type["XFormersImpl"]: + def get_impl_cls() -> Type["XFormersImpl"]: return XFormersImpl @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: + def get_metadata_cls() -> Type["AttentionMetadata"]: return XFormersMetadata @staticmethod - def get_builder_cls() -> type["XFormersMetadataBuilder"]: + def get_builder_cls() -> Type["XFormersMetadataBuilder"]: return XFormersMetadataBuilder @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: + def get_state_cls() -> Type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -52,7 +52,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return PagedAttention.get_kv_cache_shape(num_blocks, block_size, num_kv_heads, head_size) @@ -60,13 +60,13 @@ def get_kv_cache_shape( def swap_blocks( src_kv_cache: torch.Tensor, dst_kv_cache: torch.Tensor, - src_to_dst: dict[int, int], + src_to_dst: Dict[int, int], ) -> None: PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: PagedAttention.copy_blocks(kv_caches, src_to_dists) @@ -107,7 +107,7 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata): # (batch_size,). The sequence length per sequence. Sequence length means # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[list[int]] = None + seq_lens: Optional[List[int]] = None # FIXME: It is for flash attn. # (batch_size + 1,). The cumulative sequence lengths of the sequences in @@ -137,7 +137,7 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata): # Begin encoder attn & enc/dec cross-attn fields... # Encoder sequence lengths representation - encoder_seq_lens: Optional[list[int]] = None + encoder_seq_lens: Optional[List[int]] = None encoder_seq_lens_tensor: Optional[torch.Tensor] = None # FIXME: It is for flash attn. # (batch_size + 1,). The cumulative sequence lengths of the sequences in @@ -162,9 +162,9 @@ def __post_init__(self): # when alibi slopes is used. It is because of the limitation # from xformer API. 
# will not appear in the __repr__ and __init__ - self.attn_bias: Optional[list[AttentionBias]] = None - self.encoder_attn_bias: Optional[list[AttentionBias]] = None - self.cross_attn_bias: Optional[list[AttentionBias]] = None + self.attn_bias: Optional[List[AttentionBias]] = None + self.encoder_attn_bias: Optional[List[AttentionBias]] = None + self.cross_attn_bias: Optional[List[AttentionBias]] = None @property def is_all_encoder_attn_metadata_set(self): @@ -320,7 +320,7 @@ def _get_attn_bias( def _set_attn_bias( attn_metadata: XFormersMetadata, - attn_bias: list[Optional[AttentionBias]], + attn_bias: List[Optional[AttentionBias]], attn_type: str, ) -> None: ''' @@ -383,10 +383,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[list[float]], + alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, + blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: @@ -763,9 +763,9 @@ def _make_alibi_bias( alibi_slopes: torch.Tensor, num_kv_heads: int, dtype: torch.dtype, - seq_lens: list[int], -) -> list[AttentionBias]: - attn_biases: list[AttentionBias] = [] + seq_lens: List[int], +) -> List[AttentionBias]: + attn_biases: List[AttentionBias] = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 04923eb6b4d0b..c45c83a0707fd 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer.""" -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch import torch.nn as nn @@ -36,10 +36,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: Optional[int] = None, - alibi_slopes: Optional[list[float]] = None, + alibi_slopes: Optional[List[float]] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - blocksparse_params: Optional[dict[str, Any]] = None, + blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, per_layer_sliding_window: Optional[int] = None, use_mla: bool = False, diff --git a/vllm/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py index 1e7bb07c7838b..18b69a6b3ddf8 100644 --- a/vllm/attention/ops/flashmla.py +++ b/vllm/attention/ops/flashmla.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/flash_mla/flash_mla_interface.py -from typing import Optional +from typing import Optional, Tuple import torch @@ -19,7 +19,7 @@ _flashmla_C_AVAILABLE = False -def is_flashmla_supported() -> tuple[bool, Optional[str]]: +def is_flashmla_supported() -> Tuple[bool, Optional[str]]: """ Return: is_supported_flag, unsupported_reason (optional). """ @@ -39,7 +39,7 @@ def get_mla_metadata( cache_seqlens: torch.Tensor, num_heads_per_head_k: int, num_heads_k: int, -) -> tuple[torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor]: """ Arguments: cache_seqlens: (batch_size), dtype torch.int32. @@ -66,7 +66,7 @@ def flash_mla_with_kvcache( num_splits: torch.Tensor, softmax_scale: Optional[float] = None, causal: bool = False, -) -> tuple[torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor]: """ Arguments: q: (batch_size, seq_len_q, num_heads_q, head_dim). 
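Several backends above carry a `_make_alibi_bias` helper built from `torch.arange`; the sketch below shows only the bias arithmetic on a (num_heads, seq_len, seq_len) tensor and leaves out the xFormers/SDPA-specific wrapping, so the output shape and the absence of a causal mask are assumptions of the sketch:

    import torch

    def make_alibi_bias(alibi_slopes: torch.Tensor, seq_len: int) -> torch.Tensor:
        """Bias[h, i, j] = slopes[h] * (j - i).

        With the conventional positive slopes, entries for keys behind the
        query (j < i) are negative, so older positions are penalized once a
        causal mask removes the j > i entries.
        """
        pos = torch.arange(seq_len)
        rel = pos[None, :] - pos[:, None]         # (seq_len, seq_len), value j - i
        return alibi_slopes[:, None, None] * rel  # broadcast over heads

    slopes = torch.tensor([0.5, 0.25])
    print(make_alibi_bias(slopes, 4)[0])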
@@ -106,10 +106,10 @@ def flash_mla_with_kvcache( # TODO: Add fake functions # # @register_fake("_flashmla_C::get_mla_metadata") -# def _get_mla_metadata_fake(....) -> tuple[torch.Tensor, torch.Tensor]: +# def _get_mla_metadata_fake(....) -> Tuple[torch.Tensor, torch.Tensor]: # return .... # # @register_fake("_flashmla_C::fwd_kvcache_mla") -# def _fwd_kvcache_mla_fake(....) -> tuple[torch.Tensor, torch.Tensor]: +# def _fwd_kvcache_mla_fake(....) -> Tuple[torch.Tensor, torch.Tensor]: # return .... # diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py index 994a4556c115c..49ea420d092cc 100644 --- a/vllm/attention/ops/hpu_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -5,7 +5,7 @@ ############################################################################### from dataclasses import dataclass -from typing import Optional +from typing import Dict, List, Optional, Tuple import torch from vllm_hpu_extension import cache_ops, ops @@ -29,7 +29,7 @@ class HPUPagedAttentionMetadata: class HPUPagedAttention: @staticmethod - def get_supported_head_sizes() -> list[int]: + def get_supported_head_sizes() -> List[int]: return [64, 80, 96, 112, 128, 256] @staticmethod @@ -38,7 +38,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return (num_blocks, block_size, num_kv_heads, head_size) @staticmethod @@ -46,7 +46,7 @@ def split_kv_cache( kv_cache: torch.Tensor, num_kv_heads: int, head_size: int, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: key_cache = kv_cache[0] value_cache = kv_cache[1] return key_cache, value_cache @@ -86,7 +86,7 @@ def forward_prefix( def swap_blocks( src_kv_cache: torch.Tensor, dst_kv_cache: torch.Tensor, - src_to_dst: dict[int, int], + src_to_dst: Dict[int, int], ) -> None: src_key_cache = src_kv_cache[0] dst_key_cache = dst_kv_cache[0] @@ -98,8 +98,8 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], - src_to_dists: dict[int, list[int]], + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] value_caches = [kv_cache[1] for kv_cache in kv_caches] diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 9e1274b4f1d50..598ceea130d97 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import Dict, List, Optional, Tuple try: import intel_extension_for_pytorch.llm.modules as ipex_modules @@ -16,7 +16,7 @@ class _PagedAttention: @staticmethod - def get_supported_head_sizes() -> list[int]: + def get_supported_head_sizes() -> List[int]: return [32, 64, 80, 96, 112, 128, 256] @staticmethod @@ -26,7 +26,7 @@ def get_kv_cache_shape( num_kv_heads: int, head_size: int, *args, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return (2, num_blocks, block_size * num_kv_heads * head_size) @staticmethod @@ -35,7 +35,7 @@ def split_kv_cache( num_kv_heads: int, head_size: int, *args, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: x = 16 // kv_cache.element_size() num_blocks = kv_cache.shape[1] @@ -117,8 +117,8 @@ def forward_decode( @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], - src_to_dists: dict[int, list[int]], + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], *args, ) -> None: key_caches = 
[kv_cache[0] for kv_cache in kv_caches] @@ -134,7 +134,7 @@ def split_kv_cache( num_kv_heads: int, head_size: int, *args, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: num_blocks = kv_cache.shape[1] key_cache = kv_cache[0] diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 2ec534d5b154f..fd703413db908 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Optional +from typing import List, Optional, Tuple import torch @@ -35,7 +35,7 @@ class PagedAttentionMetadata: class PagedAttention: @staticmethod - def get_supported_head_sizes() -> list[int]: + def get_supported_head_sizes() -> List[int]: return [32, 64, 80, 96, 112, 120, 128, 192, 256] @staticmethod @@ -44,7 +44,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> tuple[int, ...]: + ) -> Tuple[int, ...]: return (2, num_blocks, block_size * num_kv_heads * head_size) @staticmethod @@ -52,7 +52,7 @@ def split_kv_cache( kv_cache: torch.Tensor, num_kv_heads: int, head_size: int, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: x = 16 // kv_cache.element_size() num_blocks = kv_cache.shape[1] @@ -245,7 +245,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 419d3d1327598..26c6ac812a125 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import os -from collections.abc import Generator from contextlib import contextmanager from functools import cache -from typing import Optional +from typing import Generator, Optional, Type import torch @@ -87,7 +86,7 @@ def get_attn_backend( is_attention_free: bool, is_blocksparse: bool = False, use_mla: bool = False, -) -> type[AttentionBackend]: +) -> Type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" # Accessing envs.* behind an @lru_cache decorator can cause the wrong # value to be returned from the cache if the value changes between calls. 
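The comment above about reading `envs.*` behind an `@lru_cache` decorator describes a genuine staleness pitfall, which is why the selection is split between a thin `get_attn_backend` wrapper and the cached `_cached_get_attn_backend` shown next. A self-contained illustration of the pitfall and the workaround (the environment variable name is used here purely as an example):

    import os
    from functools import cache

    @cache
    def cached_backend() -> str:
        # Reads the environment once; later changes are invisible to callers.
        return os.environ.get("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")

    @cache
    def _select(name: str) -> str:
        # Cache keyed on the value itself, so a changed value is a cache miss.
        return name

    def current_backend() -> str:
        # Read the environment at call time and pass the value in as an argument.
        return _select(os.environ.get("VLLM_ATTENTION_BACKEND", "FLASH_ATTN"))

    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"
    print(cached_backend(), current_backend())  # FLASH_ATTN FLASH_ATTN
    os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS"
    print(cached_backend(), current_backend())  # FLASH_ATTN XFORMERS (stale vs fresh)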
@@ -115,7 +114,7 @@ def _cached_get_attn_backend( is_blocksparse: bool = False, use_v1: bool = False, use_mla: bool = False, -) -> type[AttentionBackend]: +) -> Type[AttentionBackend]: if is_blocksparse: logger.info("Using BlocksparseFlashAttention backend.") from vllm.attention.backends.blocksparse_attn import ( diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index edc0ffb31a233..b972f03c9685b 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -5,9 +5,8 @@ import os import pprint import time -from collections.abc import Sequence from contextlib import ExitStack -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple from unittest.mock import patch import torch @@ -43,7 +42,7 @@ class CompilerManager: """ def __init__(self, use_inductor: bool): - self.cache: dict[tuple[Optional[int], int, str], Any] = dict() + self.cache: Dict[Tuple[Optional[int], int, str], Any] = dict() cls = InductorAdaptor if use_inductor else EagerAdaptor self.compiler = cls() @@ -76,7 +75,7 @@ def save_to_file(self): def load(self, graph: fx.GraphModule, - example_inputs: list[Any], + example_inputs: List[Any], graph_index: int, runtime_shape: Optional[int] = None) -> Optional[Callable]: if (runtime_shape, graph_index, self.compiler.name) not in self.cache: @@ -160,7 +159,7 @@ class SplitItem: def split_graph(graph: fx.GraphModule, - ops: list[str]) -> tuple[fx.GraphModule, list[SplitItem]]: + ops: List[str]) -> Tuple[fx.GraphModule, List[SplitItem]]: # split graph by ops subgraph_id = 0 node_to_subgraph_id = {} @@ -226,7 +225,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): """ def __init__(self, module: torch.fx.GraphModule, - compile_submod_names: list[str], vllm_config: VllmConfig, + compile_submod_names: List[str], vllm_config: VllmConfig, graph_pool, vllm_backend: "VllmBackend"): super().__init__(module) from torch._guards import detect_fake_mode @@ -246,8 +245,8 @@ def run(self, *args): return super().run(*fake_args) def call_module(self, target: torch.fx.node.Target, - args: tuple[torch.fx.node.Argument, - ...], kwargs: dict[str, Any]) -> Any: + args: Tuple[torch.fx.node.Argument, + ...], kwargs: Dict[str, Any]) -> Any: assert isinstance(target, str) output = super().call_module(target, args, kwargs) @@ -298,12 +297,12 @@ class VllmBackend: graph: fx.GraphModule # the stiching graph module for all the piecewise graphs split_gm: fx.GraphModule - piecewise_graphs: list[SplitItem] + piecewise_graphs: List[SplitItem] returned_callable: Callable # Inductor passes to run on the graph pre-defunctionalization post_grad_passes: Sequence[Callable] - sym_tensor_indices: list[int] - input_buffers: list[torch.Tensor] + sym_tensor_indices: List[int] + input_buffers: List[torch.Tensor] compiler_manager: CompilerManager def __init__( @@ -524,14 +523,14 @@ class ConcreteSizeEntry: # for cudagraph debugging, track the input addresses # during capture, and check if they are the same during replay - input_addresses: Optional[list[int]] = None + input_addresses: Optional[List[int]] = None class PiecewiseBackend: def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, graph_pool: Any, piecewise_compile_index: int, - total_piecewise_compiles: int, sym_shape_indices: list[int], + total_piecewise_compiles: int, sym_shape_indices: List[int], compiled_graph_for_general_shape: Callable, vllm_backend: VllmBackend): """ @@ -559,9 +558,9 @@ def __init__(self, graph: fx.GraphModule, vllm_config: 
VllmConfig, self.is_last_graph = ( piecewise_compile_index == total_piecewise_compiles - 1) - self.compile_sizes: set[int] = set( + self.compile_sizes: Set[int] = set( self.compilation_config.compile_sizes) - self.cudagraph_capture_sizes: set[int] = set( + self.cudagraph_capture_sizes: Set[int] = set( self.compilation_config.cudagraph_capture_sizes ) if self.compilation_config.use_cudagraph else set() @@ -575,11 +574,11 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, # the entries for different shapes that we need to either # compile or capture cudagraph - self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {} + self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {} # to_be_compiled_sizes tracks the remaining sizes to compile, # and updates during the compilation process, so we need to copy it - self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy() + self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy() for shape in self.compile_sizes.union(self.cudagraph_capture_sizes): self.concrete_size_entries[shape] = ConcreteSizeEntry( runtime_shape=shape, diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index d9bdac365cae5..ac0544ad64037 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -3,7 +3,7 @@ import hashlib import os from contextlib import ExitStack -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, List, Optional, Tuple from unittest.mock import patch import torch @@ -43,10 +43,10 @@ def compute_hash(self, vllm_config: VllmConfig) -> str: def compile( self, graph: fx.GraphModule, - example_inputs: list[Any], - compiler_config: dict[str, Any], + example_inputs: List[Any], + compiler_config: Dict[str, Any], runtime_shape: Optional[int] = None - ) -> tuple[Optional[Callable], Optional[Any]]: + ) -> Tuple[Optional[Callable], Optional[Any]]: """ Compile the graph with the given example inputs and compiler config, with a runtime shape. 
If the `runtime_shape` is None, it means @@ -72,7 +72,7 @@ def compile( def load(self, handle: Any, graph: fx.GraphModule, - example_inputs: list[Any], + example_inputs: List[Any], graph_index: int, runtime_shape: Optional[int] = None) -> Callable: """ @@ -110,7 +110,7 @@ class AlwaysHitShapeEnv: """ def __init__(self) -> None: - self.guards: list[Any] = [] + self.guards: List[Any] = [] def evaluate_guards_expression(self, *args, **kwargs): return True @@ -129,7 +129,7 @@ class InductorAdaptor(CompilerInterface): name = "inductor" def compute_hash(self, vllm_config: VllmConfig) -> str: - factors: list[Any] = [] + factors: List[Any] = [] # summarize system state from torch._inductor.codecache import CacheBase system_factors = CacheBase.get_system() @@ -159,10 +159,10 @@ def initialize_cache(self, cache_dir: str, disable_cache: bool = False): def compile( self, graph: fx.GraphModule, - example_inputs: list[Any], - compiler_config: dict[str, Any], + example_inputs: List[Any], + compiler_config: Dict[str, Any], runtime_shape: Optional[int] = None - ) -> tuple[Optional[Callable], Optional[Any]]: + ) -> Tuple[Optional[Callable], Optional[Any]]: from torch._inductor import config current_config = config.get_config_copy() from torch._inductor.compile_fx import compile_fx @@ -273,7 +273,7 @@ def _get_shape_env() -> AlwaysHitShapeEnv: def load(self, handle: Any, graph: fx.GraphModule, - example_inputs: list[Any], + example_inputs: List[Any], graph_index: int, runtime_shape: Optional[int] = None) -> Callable: assert isinstance(handle, tuple) @@ -331,10 +331,10 @@ class EagerAdaptor(CompilerInterface): def compile( self, graph: fx.GraphModule, - example_inputs: list[Any], - compiler_config: dict[str, Any], + example_inputs: List[Any], + compiler_config: Dict[str, Any], runtime_shape: Optional[int] = None - ) -> tuple[Optional[Callable], Optional[Any]]: + ) -> Tuple[Optional[Callable], Optional[Any]]: # we don't need to compile the graph, just return the graph itself. # It does not support caching, return None for the handle. return graph, None diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index f02994c55527d..20afe6967df39 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import inspect -from typing import Callable, Optional, TypeVar, Union, overload +from typing import Callable, Dict, List, Optional, TypeVar, Union, overload from unittest.mock import patch import torch @@ -25,7 +25,7 @@ @overload def support_torch_compile( *, - dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]], + dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]], ) -> Callable[[_T], _T]: ... @@ -38,7 +38,7 @@ def support_torch_compile(cls: _T) -> _T: def support_torch_compile( cls: Optional[_T] = None, *, - dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]] = None, + dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]] = None, ) -> Union[Callable[[_T], _T], _T]: """ A decorator to add support for compiling the forward method of a class. @@ -131,7 +131,7 @@ def cls_decorator_helper(cls: _T) -> _T: def _support_torch_compile( cls: _T, - dynamic_arg_dims: dict[str, Union[int, list[int]]], + dynamic_arg_dims: Dict[str, Union[int, List[int]]], ) -> _T: """ A decorator to add support for compiling the forward method of a class. 
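The `compute_hash` implementation above collects a list of `factors` (for example, the inductor system state via `CacheBase.get_system()`) and reduces it to a cache key. A minimal sketch of that pattern, with a hypothetical helper name and hash choice rather than vLLM's actual hashing:

    import hashlib
    from typing import Any, List

    def compute_cache_key(factors: List[Any]) -> str:
        """Fold an ordered list of factors into a short, stable cache key.

        Anything that could change the compiled artifact gets appended to
        `factors`; hashing the string form means the key changes whenever
        any factor does.
        """
        return hashlib.sha256(repr(factors).encode()).hexdigest()[:10]

    print(compute_cache_key([{"cuda": "12.4"}, ["rms_norm", "silu_and_mul"]]))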
diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index b7443cf11a331..9b0e9c5d04081 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import operator -from collections.abc import Iterable -from typing import Optional, Union +from typing import Dict, Iterable, List, Optional, Tuple, Union import torch from torch._higher_order_ops.auto_functionalize import auto_functionalized @@ -28,7 +27,7 @@ def __call__(self, graph: torch.fx.Graph): self.begin() self.dump_graph(graph, "before_fix_functionalization") - self.nodes_to_remove: list[torch.fx.Node] = [] + self.nodes_to_remove: List[torch.fx.Node] = [] count = 0 for node in graph.nodes: if not is_func(node, auto_functionalized): @@ -111,8 +110,8 @@ def _remove(self, node_or_nodes: Union[torch.fx.Node, def defunctionalize(self, graph: torch.fx.Graph, node: torch.fx.Node, - mutated_args: dict[int, Union[torch.fx.Node, str]], - args: Optional[tuple[Union[torch.fx.Node, str], + mutated_args: Dict[int, Union[torch.fx.Node, str]], + args: Optional[Tuple[Union[torch.fx.Node, str], ...]] = None): """ De-functionalize a node by replacing it with a call to the original. @@ -124,7 +123,7 @@ def defunctionalize(self, self._remove(node) def replace_users_with_mutated_args(self, node: torch.fx.Node, - mutated_args: dict[int, + mutated_args: Dict[int, Union[torch.fx.Node, str]]): """ @@ -140,7 +139,7 @@ def replace_users_with_mutated_args(self, node: torch.fx.Node, user.replace_all_uses_with(arg) self._remove(user) - def getitem_users(self, node: torch.fx.Node) -> dict[int, torch.fx.Node]: + def getitem_users(self, node: torch.fx.Node) -> Dict[int, torch.fx.Node]: """ Returns the operator.getitem users of the auto-functionalized node, indexed by the index they are getting. @@ -155,7 +154,7 @@ def getitem_users(self, node: torch.fx.Node) -> dict[int, torch.fx.Node]: def insert_defunctionalized(self, graph: torch.fx.Graph, node: torch.fx.Node, - args: Optional[tuple[Union[torch.fx.Node, str], + args: Optional[Tuple[Union[torch.fx.Node, str], ...]] = None): """ Insert a new defunctionalized node into the graph before node. 
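Besides the container aliases, the import hunk above also trades `collections.abc.Iterable` for `typing.Iterable`. As annotations the two are interchangeable; the practical difference is that subscripting the `collections.abc` class (`Iterable[int]`) requires Python 3.9+, while the `typing` alias subscripts on 3.8 as well. An illustrative sketch with made-up function names:

```python
from collections.abc import Iterable as ABCIterable  # subscriptable from Python 3.9 on
from typing import Iterable as TypingIterable        # subscriptable on 3.8 as well


def total_abc(xs: "ABCIterable[int]") -> int:
    # The string annotation keeps this definition importable even where the
    # abc class cannot be subscripted at runtime.
    return sum(xs)


def total_typing(xs: TypingIterable[int]) -> int:
    return sum(xs)


assert total_abc(range(4)) == total_typing(range(4)) == 6
```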
diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index 3f77fb61dfe83..0c3d8697b2375 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, NamedTuple, Optional +from typing import Callable, Dict, List, NamedTuple, Optional, Tuple import torch import torch._inductor.pattern_matcher as pm @@ -57,7 +57,7 @@ def __str__(self): kFp8DynamicTensorSym = QuantKey(FP8_DTYPE, False, True, True) kFp8DynamicTokenSym = QuantKey(FP8_DTYPE, False, False, True) -QUANT_OPS: dict[QuantKey, OpOverload] = { +QUANT_OPS: Dict[QuantKey, OpOverload] = { kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa @@ -80,7 +80,7 @@ def __str__(self): f"{'' if self.fused_add else 'out'} residual)") -FUSED_OPS: dict[FusedRMSQuantKey, OpOverload] = { +FUSED_OPS: Dict[FusedRMSQuantKey, OpOverload] = { FusedRMSQuantKey(kFp8StaticTensorSym, False): torch.ops._C.rms_norm_static_fp8_quant.default, # noqa FusedRMSQuantKey(kFp8StaticTensorSym, True): @@ -101,7 +101,7 @@ def __init__(self, match: pm.Match, quant_op, fused_op): self.QUANT_OP = quant_op # in-place quant op self.FUSED_OP = fused_op # in-place fused quant op - def insert_fused_node(self, fused_return_mapping: dict[int, tuple[fx.Node, + def insert_fused_node(self, fused_return_mapping: Dict[int, Tuple[fx.Node, int]], **kwargs): """ @@ -548,7 +548,7 @@ def __init__(self, config: CompilationConfig.PassConfig): "FusionPass singleton instance already exists" super().__init__(config) - self.matches: list[MultiOutputMatch] = [] + self.matches: List[MultiOutputMatch] = [] self.patterns: PatternMatcherPass = PatternMatcherPass( pass_name="fusion_pass") diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py index e3ca7f24fed55..b9a8d3112e775 100644 --- a/vllm/compilation/fx_utils.py +++ b/vllm/compilation/fx_utils.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import operator -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, Optional from torch import fx from torch._higher_order_ops.auto_functionalize import auto_functionalized diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py index c26f148252863..e6f6a60b25950 100644 --- a/vllm/compilation/multi_output_match.py +++ b/vllm/compilation/multi_output_match.py @@ -3,7 +3,7 @@ import abc import operator from abc import abstractmethod -from collections.abc import Iterable +from typing import Iterable, List, Tuple from torch import fx from torch._higher_order_ops.auto_functionalize import auto_functionalized @@ -56,7 +56,7 @@ def process(self): raise NotImplementedError @property - def nodes(self) -> list[fx.Node]: + def nodes(self) -> List[fx.Node]: return self.match.nodes @property @@ -87,13 +87,13 @@ def inserting_after_match(self): return self.graph.inserting_after(last_node_in_match) def insert_getitems(self, tuple_node: fx.Node, - indices: Iterable[int]) -> tuple[fx.Node, ...]: + indices: Iterable[int]) -> Tuple[fx.Node, ...]: """ Insert operator.getitem nodes to extract elements from a tuple node. :param tuple_node: The tuple node to extract elements from. :param indices: The indices of the elements to extract. - :return: tuple of the new getitem nodes, corresponding to the indices. + :return: Tuple of the new getitem nodes, corresponding to the indices. 
""" with self.graph.inserting_after(tuple_node): return tuple( diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 7c967b59035d3..52f8c3b1ec15a 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any +from typing import Any, Dict, List import torch from torch import fx as fx @@ -43,7 +43,7 @@ class PostGradPassManager(Parent): """ def __init__(self): - self.passes: list[InductorPass] = [] + self.passes: List[InductorPass] = [] def __call__(self, graph: fx.Graph): for pass_ in self.passes: @@ -69,7 +69,7 @@ def add(self, pass_: InductorPass): def uuid(self): return self.__getstate__() - def __getstate__(self) -> dict[str, list[Any]]: + def __getstate__(self) -> Dict[str, List[Any]]: """ Custom pickling for the pass manager, as some passes cannot be pickled. Pickling occurs because the pass manager is set as the value of diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 1a8211f0ab7c6..a8a283ddd8c0c 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -5,7 +5,7 @@ from abc import abstractmethod from contextlib import contextmanager from types import CodeType -from typing import Callable, Optional +from typing import Callable, List, Optional import torch @@ -48,7 +48,7 @@ def __init__(self, self.compiled_callable = compiled_callable self.original_code_object = self.__class__.forward.__code__ - self.compiled_codes: list[CodeType] = [] + self.compiled_codes: List[CodeType] = [] torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook) # read the env var to determine whether to use the custom dispatcher diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 4e7f6338d3a42..d4d31c58dc8d4 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math -from typing import Optional +from typing import List, Optional from vllm.core.block.common import BlockList from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator @@ -21,7 +21,7 @@ class BlockTable: single block. block_allocator (DeviceAwareBlockAllocator): The block allocator used to manage memory for the blocks. - _blocks (Optional[list[Block]], optional): An optional list of existing + _blocks (Optional[List[Block]], optional): An optional list of existing blocks to initialize the BlockTable with. If not provided, an empty BlockTable is created. max_block_sliding_window (Optional[int], optional): The number of @@ -34,7 +34,7 @@ class BlockTable: single block. _allocator (DeviceAwareBlockAllocator): The block allocator used to manage memory for the blocks. - _blocks (Optional[list[Block]]): The list of blocks managed by this + _blocks (Optional[List[Block]]): The list of blocks managed by this BlockTable. _num_full_slots (int): The number of tokens currently stored in the blocks. 
@@ -44,7 +44,7 @@ def __init__( self, block_size: int, block_allocator: DeviceAwareBlockAllocator, - _blocks: Optional[list[Block]] = None, + _blocks: Optional[List[Block]] = None, max_block_sliding_window: Optional[int] = None, ): self._block_size = block_size @@ -57,7 +57,7 @@ def __init__( self._num_full_slots = self._get_num_token_ids() @staticmethod - def get_num_required_blocks(token_ids: list[int], + def get_num_required_blocks(token_ids: List[int], block_size: int, num_lookahead_slots: int = 0) -> int: """Calculates the minimum number of blocks required to store a given @@ -68,7 +68,7 @@ def get_num_required_blocks(token_ids: list[int], allocation (e.g. ignoring prefix caching). Args: - token_ids (list[int]): The sequence of token IDs to be stored. + token_ids (List[int]): The sequence of token IDs to be stored. block_size (int): The maximum number of tokens that can be stored in a single block. num_lookahead_slots (int): look-ahead slots that the sequence may @@ -81,7 +81,7 @@ def get_num_required_blocks(token_ids: list[int], return cdiv(len(token_ids) + num_lookahead_slots, block_size) def allocate(self, - token_ids: list[int], + token_ids: List[int], device: Device = Device.GPU, extra_hash: Optional[int] = None) -> None: """Allocates memory blocks for storing the given sequence of token IDs. @@ -90,7 +90,7 @@ def allocate(self, sequence of token IDs. Args: - token_ids (list[int]): The sequence of token IDs to be stored. + token_ids (List[int]): The sequence of token IDs to be stored. device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. extra_hash (Optional[int]): The hash value of additional @@ -106,14 +106,14 @@ def allocate(self, self.update(blocks) self._num_full_slots = len(token_ids) - def update(self, blocks: list[Block]) -> None: + def update(self, blocks: List[Block]) -> None: """Resets the table to the newly provided blocks (with their corresponding block ids) """ self._blocks.update(blocks) def append_token_ids(self, - token_ids: list[int], + token_ids: List[int], num_lookahead_slots: int = 0, num_computed_slots: Optional[int] = None, extra_hash: Optional[int] = None) -> None: @@ -130,7 +130,7 @@ def append_token_ids(self, separate block. Args: - token_ids (list[int]): The sequence of token IDs to be appended. + token_ids (List[int]): The sequence of token IDs to be appended. num_computed_slots (Optional[int]): The number of KV cache slots that are already filled (computed). When sliding window is enabled, this is used to compute how many @@ -244,7 +244,7 @@ def free(self) -> None: self._blocks.reset() @property - def physical_block_ids(self) -> list[int]: + def physical_block_ids(self) -> List[int]: """Returns a list of physical block indices for the blocks in the BlockTable. @@ -254,23 +254,23 @@ def physical_block_ids(self) -> list[int]: occupied by the block. Returns: - list[int]: A list of physical block indices for the blocks in the + List[int]: A list of physical block indices for the blocks in the BlockTable. """ return self._blocks.ids() - def get_unseen_token_ids(self, sequence_token_ids: list[int]) -> list[int]: + def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: """Get the number of "unseen" tokens in the sequence. Unseen tokens are tokens in the sequence corresponding to this block table, but are not yet appended to this block table. Args: - sequence_token_ids (list[int]): The list of token ids in the + sequence_token_ids (List[int]): The list of token ids in the sequence. 
Returns: - list[int]: The postfix of sequence_token_ids that has not yet been + List[int]: The postfix of sequence_token_ids that has not yet been appended to the block table. """ @@ -281,10 +281,10 @@ def get_unseen_token_ids(self, sequence_token_ids: list[int]) -> list[int]: def _allocate_blocks_for_token_ids( self, prev_block: Optional[Block], - token_ids: list[int], + token_ids: List[int], device: Device, - extra_hash: Optional[int] = None) -> list[Block]: - blocks: list[Block] = [] + extra_hash: Optional[int] = None) -> List[Block]: + blocks: List[Block] = [] block_token_ids = [] tail_token_ids = [] @@ -315,9 +315,9 @@ def _allocate_blocks_for_token_ids( return blocks - def _get_all_token_ids(self) -> list[int]: + def _get_all_token_ids(self) -> List[int]: # NOTE: This function is O(seq_len); use sparingly. - token_ids: list[int] = [] + token_ids: List[int] = [] if not self._is_allocated: return token_ids @@ -339,7 +339,7 @@ def _is_allocated(self) -> bool: return len(self._blocks) > 0 @property - def blocks(self) -> list[Block]: + def blocks(self) -> List[Block]: return self._blocks.list() @property @@ -358,7 +358,7 @@ def num_full_slots(self) -> int: return self._num_full_slots def get_num_blocks_touched_by_append_slots( - self, token_ids: list[int], num_lookahead_slots: int) -> int: + self, token_ids: List[int], num_lookahead_slots: int) -> int: """Determine how many blocks will be "touched" by appending the token ids. @@ -378,7 +378,7 @@ def get_num_blocks_touched_by_append_slots( return num_token_blocks def _chunk_token_blocks_for_append( - self, token_ids: list[int]) -> list[list[int]]: + self, token_ids: List[int]) -> List[List[int]]: """Split the token ids into block-sized chunks so they can be easily appended to blocks. The first such "token block" may have less token ids than the block size, since the last allocated block may be partially diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index 9e444ac1f7dd1..1966eac1cf9e0 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 from collections import deque -from collections.abc import Iterable from dataclasses import dataclass -from typing import Optional, Protocol +from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple from vllm.core.block.interfaces import Block, BlockAllocator @@ -37,7 +36,7 @@ class RefCounter(RefCounterProtocol): def __init__(self, all_block_indices: Iterable[BlockId]): deduped = set(all_block_indices) - self._refcounts: dict[BlockId, RefCount] = { + self._refcounts: Dict[BlockId, RefCount] = { index: 0 for index in deduped } @@ -109,7 +108,7 @@ class CopyOnWriteTracker: """ def __init__(self, refcounter: RefCounterProtocol): - self._copy_on_writes: list[tuple[BlockId, BlockId]] = [] + self._copy_on_writes: List[Tuple[BlockId, BlockId]] = [] self._refcounter = refcounter def is_appendable(self, block: Block) -> bool: @@ -136,7 +135,7 @@ def record_cow(self, src_block_id: Optional[BlockId], assert trg_block_id is not None self._copy_on_writes.append((src_block_id, trg_block_id)) - def clear_cows(self) -> list[tuple[BlockId, BlockId]]: + def clear_cows(self) -> List[Tuple[BlockId, BlockId]]: """Clears the copy-on-write tracking information and returns the current state. @@ -145,7 +144,7 @@ def clear_cows(self) -> list[tuple[BlockId, BlockId]]: It then clears the internal tracking information. 
Returns: - list[tuple[BlockId, BlockId]]: A list mapping source + List[Tuple[BlockId, BlockId]]: A list mapping source block indices to destination block indices for the current copy-on-write operations. """ @@ -173,7 +172,7 @@ def __init__(self, block_size: int, create_block: Block.Factory, self._pool_size = pool_size assert self._pool_size >= 0 - self._free_ids: deque[int] = deque(range(self._pool_size)) + self._free_ids: Deque[int] = deque(range(self._pool_size)) self._pool = [] for i in range(self._pool_size): self._pool.append( @@ -204,7 +203,7 @@ def increase_pool(self): def init_block(self, prev_block: Optional[Block], - token_ids: list[int], + token_ids: List[int], block_size: int, physical_block_id: Optional[int], extra_hash: Optional[int] = None) -> Block: @@ -236,9 +235,9 @@ class BlockList: list on every iteration of the block manager """ - def __init__(self, blocks: list[Block]): - self._blocks: list[Block] = [] - self._block_ids: list[int] = [] + def __init__(self, blocks: List[Block]): + self._blocks: List[Block] = [] + self._block_ids: List[int] = [] self.update(blocks) @@ -251,7 +250,7 @@ def _update_block_id(self, block_index: int, assert new_block_id is not None self._block_ids[block_index] = new_block_id - def update(self, blocks: list[Block]): + def update(self, blocks: List[Block]): self._blocks = blocks # Cache block ids for fast query @@ -259,7 +258,7 @@ def update(self, blocks: list[Block]): for block in self._blocks: self._add_block_id(block.block_id) - def append_token_ids(self, block_index: int, token_ids: list[int]) -> None: + def append_token_ids(self, block_index: int, token_ids: List[int]) -> None: block = self._blocks[block_index] prev_block_id = block.block_id @@ -287,10 +286,10 @@ def reset(self): self._blocks = [] self._block_ids = [] - def list(self) -> list[Block]: + def list(self) -> List[Block]: return self._blocks - def ids(self) -> list[int]: + def ids(self) -> List[int]: return self._block_ids @@ -346,7 +345,7 @@ def get_hit_rate(self): return (completed_block_hit + incompleted_block_hit) / total_blocks -def get_all_blocks_recursively(last_block: Block) -> list[Block]: +def get_all_blocks_recursively(last_block: Block) -> List[Block]: """Retrieves all the blocks in a sequence starting from the last block. This function recursively traverses the sequence of blocks in reverse order, @@ -357,15 +356,15 @@ def get_all_blocks_recursively(last_block: Block) -> list[Block]: last_block (Block): The last block in the sequence. Returns: - list[Block]: A list of all the blocks in the sequence, in the order they + List[Block]: A list of all the blocks in the sequence, in the order they appear. 
""" - def recurse(block: Block, lst: list[Block]) -> None: + def recurse(block: Block, lst: List[Block]) -> None: if block.prev_block is not None: recurse(block.prev_block, lst) lst.append(block) - all_blocks: list[Block] = [] + all_blocks: List[Block] = [] recurse(last_block, all_blocks) return all_blocks diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index d777b6ab1d160..359b5b263f689 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import Dict, FrozenSet, List, Optional, Tuple from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, DeviceAwareBlockAllocator) @@ -109,10 +109,10 @@ def __init__(self, cpu_block_allocator: BlockAllocator, Device.GPU: gpu_block_allocator, } - self._swap_mapping: dict[int, int] = {} + self._swap_mapping: Dict[int, int] = {} self._null_block: Optional[Block] = None - self._block_ids_to_allocator: dict[int, BlockAllocator] = {} + self._block_ids_to_allocator: Dict[int, BlockAllocator] = {} for _, allocator in self._allocators.items(): for block_id in allocator.all_block_ids: self._block_ids_to_allocator[block_id] = allocator @@ -146,16 +146,16 @@ def allocate_mutable_block(self, def allocate_immutable_blocks( self, prev_block: Optional[Block], - block_token_ids: list[list[int]], + block_token_ids: List[List[int]], device: Device, - extra_hash: Optional[int] = None) -> list[Block]: + extra_hash: Optional[int] = None) -> List[Block]: """Allocates a new group of immutable blocks with the provided block token IDs on the specified device. Args: prev_block (Optional[Block]): The previous block in the sequence. Used for prefix hashing. - block_token_ids (list[int]): The list of block token IDs to be + block_token_ids (List[int]): The list of block token IDs to be stored in the new blocks. device (Device): The device on which to allocate the new block. extra_hash (Optional[int]): The hash value of additional @@ -163,7 +163,7 @@ def allocate_immutable_blocks( in the prefix caching block. Returns: - list[Block]: The newly allocated list of immutable blocks + List[Block]: The newly allocated list of immutable blocks containing the provided block token IDs. """ return self._allocators[device].allocate_immutable_blocks( @@ -171,7 +171,7 @@ def allocate_immutable_blocks( def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: list[int], + token_ids: List[int], device: Device, extra_hash: Optional[int] = None) -> Block: """Allocates a new immutable block with the provided token IDs on the @@ -180,7 +180,7 @@ def allocate_immutable_block(self, Args: prev_block (Optional[Block]): The previous block in the sequence. Used for prefix hashing. - token_ids (list[int]): The list of token IDs to be stored in the new + token_ids (List[int]): The list of token IDs to be stored in the new block. device (Device): The device on which to allocate the new block. extra_hash (Optional[int]): The hash value of additional @@ -208,7 +208,7 @@ def free(self, block: Block) -> None: allocator = self._block_ids_to_allocator[block_id] allocator.free(block) - def fork(self, last_block: Block) -> list[Block]: + def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. 
@@ -216,7 +216,7 @@ def fork(self, last_block: Block) -> list[Block]: last_block (Block): The last block in the original sequence. Returns: - list[Block]: A new list of blocks that shares the same memory as the + List[Block]: A new list of blocks that shares the same memory as the original sequence. """ # do not attempt to fork the null block @@ -255,20 +255,20 @@ def get_physical_block_id(self, device: Device, absolute_id: int) -> int: """ return self._allocators[device].get_physical_block_id(absolute_id) - def swap(self, blocks: list[Block], src_device: Device, - dst_device: Device) -> dict[int, int]: + def swap(self, blocks: List[Block], src_device: Device, + dst_device: Device) -> Dict[int, int]: """Execute the swap for the given blocks from source_device on to dest_device, save the current swap mapping and append them to the accumulated `self._swap_mapping` for each scheduling move. Args: - blocks: list of blocks to be swapped. + blocks: List of blocks to be swapped. src_device (Device): Device to swap the 'blocks' from. dst_device (Device): Device to swap the 'blocks' to. Returns: - dict[int, int]: Swap mapping from source_device + Dict[int, int]: Swap mapping from source_device on to dest_device. """ src_block_ids = [block.block_id for block in blocks] @@ -276,20 +276,20 @@ def swap(self, blocks: list[Block], src_device: Device, self._allocators[dst_device].swap_in(blocks) dst_block_ids = [block.block_id for block in blocks] - current_swap_mapping: dict[int, int] = {} + current_swap_mapping: Dict[int, int] = {} for src_block_id, dst_block_id in zip(src_block_ids, dst_block_ids): if src_block_id is not None and dst_block_id is not None: self._swap_mapping[src_block_id] = dst_block_id current_swap_mapping[src_block_id] = dst_block_id return current_swap_mapping - def get_num_full_blocks_touched(self, blocks: list[Block], + def get_num_full_blocks_touched(self, blocks: List[Block], device: Device) -> int: """Returns the number of full blocks that will be touched by swapping in/out the given blocks on to the 'device'. Args: - blocks: list of blocks to be swapped. + blocks: List of blocks to be swapped. device (Device): Device to swap the 'blocks' on. Returns: @@ -300,40 +300,40 @@ def get_num_full_blocks_touched(self, blocks: list[Block], """ return self._allocators[device].get_num_full_blocks_touched(blocks) - def clear_copy_on_writes(self) -> list[tuple[int, int]]: + def clear_copy_on_writes(self) -> List[Tuple[int, int]]: """Clears the copy-on-write (CoW) state and returns the mapping of source to destination block IDs. Returns: - list[tuple[int, int]]: A list mapping source block IDs to + List[Tuple[int, int]]: A list mapping source block IDs to destination block IDs. """ # CoW only supported on GPU device = Device.GPU return self._allocators[device].clear_copy_on_writes() - def mark_blocks_as_accessed(self, block_ids: list[int], + def mark_blocks_as_accessed(self, block_ids: List[int], now: float) -> None: """Mark blocks as accessed, only use for prefix caching.""" # Prefix caching only supported on GPU. device = Device.GPU return self._allocators[device].mark_blocks_as_accessed(block_ids, now) - def mark_blocks_as_computed(self, block_ids: list[int]) -> None: + def mark_blocks_as_computed(self, block_ids: List[int]) -> None: """Mark blocks as accessed, only use for prefix caching.""" # Prefix caching only supported on GPU. 
device = Device.GPU return self._allocators[device].mark_blocks_as_computed(block_ids) def get_common_computed_block_ids( - self, computed_seq_block_ids: list[list[int]]) -> list[int]: + self, computed_seq_block_ids: List[List[int]]) -> List[int]: # Prefix caching only supported on GPU. device = Device.GPU return self._allocators[device].get_common_computed_block_ids( computed_seq_block_ids) @property - def all_block_ids(self) -> frozenset[int]: + def all_block_ids(self) -> FrozenSet[int]: return frozenset(self._block_ids_to_allocator.keys()) def get_prefix_cache_hit_rate(self, device: Device) -> float: @@ -348,13 +348,13 @@ def reset_prefix_cache(self) -> bool: success = success and allocator.reset_prefix_cache() return success - def get_and_reset_swaps(self) -> list[tuple[int, int]]: + def get_and_reset_swaps(self) -> List[Tuple[int, int]]: """Returns and clears the mapping of source to destination block IDs. Will be called after every swapping operations for now, and after every schedule when BlockManagerV2 become default. Currently not useful. Returns: - list[tuple[int, int]]: A mapping of source to destination block IDs. + List[Tuple[int, int]]: A mapping of source to destination block IDs. """ mapping = self._swap_mapping.copy() self._swap_mapping.clear() @@ -362,9 +362,9 @@ def get_and_reset_swaps(self) -> list[tuple[int, int]]: def find_cached_blocks_prefix( self, - block_hashes: list[int], + block_hashes: List[int], device: Device = Device.GPU, - ) -> list[int]: + ) -> List[int]: return self._allocators[device].find_cached_blocks_prefix(block_hashes) @@ -381,7 +381,7 @@ def __init__(self, proxy: Block): super().__init__() self._proxy = proxy - def append_token_ids(self, token_ids: list[BlockId]): + def append_token_ids(self, token_ids: List[BlockId]): raise ValueError("null block should not be modified") @property @@ -393,7 +393,7 @@ def block_id(self, value: Optional[BlockId]): raise ValueError("null block should not be modified") @property - def token_ids(self) -> list[BlockId]: + def token_ids(self) -> List[BlockId]: return self._proxy.token_ids @property diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 1c47fded26afd..0b0197deb8d47 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Optional, Protocol +from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple from vllm.utils import Device @@ -11,7 +11,7 @@ class Block(ABC): @abstractmethod - def append_token_ids(self, token_ids: list[int]) -> None: + def append_token_ids(self, token_ids: List[int]) -> None: pass @property @@ -27,7 +27,7 @@ def block_id(self, value: Optional[int]) -> None: @property @abstractmethod - def token_ids(self) -> list[int]: + def token_ids(self) -> List[int]: pass @property @@ -84,7 +84,7 @@ class Factory(Protocol): def __call__( self, prev_block: Optional["Block"], - token_ids: list[int], + token_ids: List[int], block_size: int, allocator: "BlockAllocator", block_id: Optional[int] = None, @@ -114,14 +114,14 @@ def allocate_mutable_block(self, prev_block: Optional[Block], @abstractmethod def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: list[int], + token_ids: List[int], extra_hash: Optional[int]) -> Block: pass @abstractmethod def allocate_immutable_blocks(self, prev_block: Optional[Block], - block_token_ids: list[list[int]], - extra_hash: Optional[int]) -> list[Block]: + block_token_ids: 
List[List[int]], + extra_hash: Optional[int]) -> List[Block]: pass @abstractmethod @@ -129,7 +129,7 @@ def free(self, block: Block) -> None: pass @abstractmethod - def fork(self, last_block: Block) -> list[Block]: + def fork(self, last_block: Block) -> List[Block]: pass @abstractmethod @@ -145,34 +145,34 @@ def get_physical_block_id(self, absolute_id: int) -> int: pass @abstractmethod - def swap_out(self, blocks: list[Block]) -> None: + def swap_out(self, blocks: List[Block]) -> None: pass @abstractmethod - def swap_in(self, blocks: list[Block]) -> None: + def swap_in(self, blocks: List[Block]) -> None: pass @property @abstractmethod - def all_block_ids(self) -> frozenset[int]: + def all_block_ids(self) -> FrozenSet[int]: pass @abstractmethod - def clear_copy_on_writes(self) -> list[tuple[int, int]]: + def clear_copy_on_writes(self) -> List[Tuple[int, int]]: pass @abstractmethod - def mark_blocks_as_accessed(self, block_ids: list[int], + def mark_blocks_as_accessed(self, block_ids: List[int], now: float) -> None: pass @abstractmethod - def mark_blocks_as_computed(self, block_ids: list[int]) -> None: + def mark_blocks_as_computed(self, block_ids: List[int]) -> None: pass @abstractmethod def get_common_computed_block_ids( - self, computed_seq_block_ids: list[list[int]]) -> list[int]: + self, computed_seq_block_ids: List[List[int]]) -> List[int]: pass @abstractmethod @@ -186,7 +186,7 @@ def promote_to_immutable_block(self, block: Block) -> BlockId: pass @abstractmethod - def get_num_full_blocks_touched(self, blocks: list[Block]) -> int: + def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: pass @abstractmethod @@ -205,8 +205,8 @@ class NoFreeBlocksError(ValueError): @abstractmethod def find_cached_blocks_prefix( self, - block_hashes: list[int], - ) -> list[int]: + block_hashes: List[int], + ) -> List[int]: pass @@ -222,7 +222,7 @@ def allocate_mutable_block(self, @abstractmethod def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: list[int], + token_ids: List[int], device: Device, extra_hash: Optional[int] = None) -> Block: pass @@ -231,10 +231,10 @@ def allocate_immutable_block(self, def allocate_immutable_blocks( self, prev_block: Optional[Block], - block_token_ids: list[list[int]], + block_token_ids: List[List[int]], device: Device, extra_hash: Optional[int] = None, - ) -> list[Block]: + ) -> List[Block]: pass @abstractmethod @@ -250,40 +250,40 @@ def free(self, block: Block) -> None: pass @abstractmethod - def fork(self, last_block: Block) -> list[Block]: + def fork(self, last_block: Block) -> List[Block]: pass @property @abstractmethod - def all_block_ids(self) -> frozenset[int]: + def all_block_ids(self) -> FrozenSet[int]: pass @abstractmethod - def clear_copy_on_writes(self) -> list[tuple[int, int]]: + def clear_copy_on_writes(self) -> List[Tuple[int, int]]: pass @abstractmethod - def mark_blocks_as_accessed(self, block_ids: list[int], + def mark_blocks_as_accessed(self, block_ids: List[int], now: float) -> None: pass @abstractmethod - def mark_blocks_as_computed(self, block_ids: list[int]) -> None: + def mark_blocks_as_computed(self, block_ids: List[int]) -> None: pass @abstractmethod def get_common_computed_block_ids( - self, computed_seq_block_ids: list[list[int]]) -> list[int]: + self, computed_seq_block_ids: List[List[int]]) -> List[int]: pass @abstractmethod - def get_num_full_blocks_touched(self, blocks: list[Block], + def get_num_full_blocks_touched(self, blocks: List[Block], device: Device) -> int: pass @abstractmethod - def swap(self, 
blocks: list[Block], src_device: Device, - dst_device: Device) -> dict[int, int]: + def swap(self, blocks: List[Block], src_device: Device, + dst_device: Device) -> Dict[int, int]: pass @abstractmethod @@ -312,7 +312,7 @@ def reset_prefix_cache(self) -> bool: @abstractmethod def find_cached_blocks_prefix( self, - block_hashes: list[int], + block_hashes: List[int], device: Device = Device.GPU, - ) -> list[int]: + ) -> List[int]: pass diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index b7b645187a7bc..c388366b825f2 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from collections import deque -from collections.abc import Iterable -from typing import Optional, Union +from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter, get_all_blocks_recursively) @@ -39,7 +38,7 @@ def __init__( if block_ids is None: block_ids = range(num_blocks) - self._free_block_indices: deque[BlockId] = deque(block_ids) + self._free_block_indices: Deque[BlockId] = deque(block_ids) self._all_block_indices = frozenset(block_ids) assert len(self._all_block_indices) == num_blocks @@ -65,7 +64,7 @@ def __init__( def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: list[int], + token_ids: List[int], extra_hash: Optional[int] = None, device: Optional[Device] = None) -> Block: """Allocates a new immutable block with the given token IDs, linked to @@ -75,7 +74,7 @@ def allocate_immutable_block(self, prev_block (Optional[Block]): The previous block in the sequence. If None, then the block to be allocated is the first block in the sequence. - token_ids (list[int]): The token IDs to be stored in the new block. + token_ids (List[int]): The token IDs to be stored in the new block. Returns: Block: The newly allocated immutable block. @@ -88,9 +87,9 @@ def allocate_immutable_block(self, def allocate_immutable_blocks( self, prev_block: Optional[Block], - block_token_ids: list[list[int]], + block_token_ids: List[List[int]], extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> list[Block]: + device: Optional[Device] = None) -> List[Block]: assert device is None num_blocks = len(block_token_ids) @@ -162,7 +161,7 @@ def free(self, block: Block, keep_block_object: bool = False) -> None: def free_block_id(self, block_id: BlockId) -> None: self._free_block_id(block_id) - def fork(self, last_block: Block) -> list[Block]: + def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. @@ -170,12 +169,12 @@ def fork(self, last_block: Block) -> list[Block]: last_block (Block): The last block in the original sequence. Returns: - list[Block]: The new sequence of blocks that shares the same memory + List[Block]: The new sequence of blocks that shares the same memory as the original sequence. 
""" source_blocks = get_all_blocks_recursively(last_block) - forked_blocks: list[Block] = [] + forked_blocks: List[Block] = [] prev_block = None for block in source_blocks: @@ -219,7 +218,7 @@ def refcounter(self): return self._refcounter @property - def all_block_ids(self) -> frozenset[int]: + def all_block_ids(self) -> FrozenSet[int]: return self._all_block_indices def cow_block_if_not_appendable(self, block: Block) -> BlockId: @@ -247,16 +246,16 @@ def cow_block_if_not_appendable(self, block: Block) -> BlockId: return trg_block_id - def clear_copy_on_writes(self) -> list[tuple[BlockId, BlockId]]: + def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]: """Returns the copy-on-write source->destination mapping and clears it. Returns: - list[tuple[BlockId, BlockId]]: A list mapping source + List[Tuple[BlockId, BlockId]]: A list mapping source block indices to destination block indices. """ return self._cow_tracker.clear_cows() - def mark_blocks_as_accessed(self, block_ids: list[int], + def mark_blocks_as_accessed(self, block_ids: List[int], now: float) -> None: """Mark blocks as accessed, used in prefix caching. @@ -265,7 +264,7 @@ def mark_blocks_as_accessed(self, block_ids: list[int], """ pass - def mark_blocks_as_computed(self, block_ids: list[int]) -> None: + def mark_blocks_as_computed(self, block_ids: List[int]) -> None: """Mark blocks as computed, used in prefix caching. Since the naive allocator does not implement prefix caching, we do @@ -274,7 +273,7 @@ def mark_blocks_as_computed(self, block_ids: list[int]) -> None: pass def get_common_computed_block_ids( - self, computed_seq_block_ids: list[list[int]]) -> list[int]: + self, computed_seq_block_ids: List[List[int]]) -> List[int]: """Determine blocks that can be skipped in prefill. Since the naive allocator does not support prefix caching, always return @@ -285,12 +284,12 @@ def get_common_computed_block_ids( def promote_to_immutable_block(self, block: Block) -> BlockId: raise NotImplementedError("There is no promotion for naive blocks") - def get_num_full_blocks_touched(self, blocks: list[Block]) -> int: + def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: """Returns the number of full blocks that will be touched by swapping in/out. Args: - blocks: list of blocks to be swapped. + blocks: List of blocks to be swapped. Returns: int: the number of full blocks that will be touched by swapping in/out the given blocks. Non full blocks are ignored @@ -306,11 +305,11 @@ def get_num_full_blocks_touched(self, blocks: list[Block]) -> int: old_block_set.add(block) return len(old_block_set) - def swap_out(self, blocks: list[Block]) -> None: + def swap_out(self, blocks: List[Block]) -> None: for block in blocks: self._free_block_id(block) - def swap_in(self, blocks: list[Block]) -> None: + def swap_in(self, blocks: List[Block]) -> None: for block in blocks: # Here we allocate either immutable or mutable block and then # extract its block_id. Note that the block object is released @@ -337,7 +336,7 @@ def reset_prefix_cache(self) -> bool: """No prefix cache for naive block allocator.""" return True - def find_cached_blocks_prefix(self, block_hashes: list[int]) -> list[int]: + def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: # Not applicable for naive block allocator. return [] @@ -352,7 +351,7 @@ class NaiveBlock(Block): Args: prev_block (Block): The previous block in the sequence. - token_ids (list[int]): The initial token IDs to be stored in the block. 
+ token_ids (List[int]): The initial token IDs to be stored in the block. block_size (int): The maximum number of token IDs that can be stored in the block. allocator (BlockAllocator): The block allocator associated with this @@ -366,13 +365,13 @@ class NaiveBlock(Block): def __init__(self, prev_block: Optional[Block], - token_ids: list[int], + token_ids: List[int], block_size: int, allocator: BlockAllocator, block_id: Optional[int] = None, _cow_target: Optional[Block] = None, extra_hash: Optional[int] = None): - self._token_ids: list[int] = [] + self._token_ids: List[int] = [] self._block_size = block_size self._prev_block = prev_block self._block_id = block_id @@ -381,12 +380,12 @@ def __init__(self, self._append_token_ids_no_cow(token_ids) - def append_token_ids(self, token_ids: list[int]) -> None: + def append_token_ids(self, token_ids: List[int]) -> None: """Appends the given token IDs to the block and performs a copy-on-write if necessary. Args: - token_ids (Optional[list[int]]): The token IDs to be appended + token_ids (Optional[List[int]]): The token IDs to be appended to the block. """ self._append_token_ids_no_cow(token_ids) @@ -395,11 +394,11 @@ def append_token_ids(self, token_ids: list[int]) -> None: self._block_id = (self._allocator.cow_block_if_not_appendable( self._cow_target)) - def _append_token_ids_no_cow(self, token_ids: list[int]) -> None: + def _append_token_ids_no_cow(self, token_ids: List[int]) -> None: """Appends the given token IDs to the block Args: - token_ids (list[int]): The token IDs to be appended to the block. + token_ids (List[int]): The token IDs to be appended to the block. """ if len(token_ids) == 0: return @@ -441,7 +440,7 @@ def num_empty_slots(self) -> int: return self._block_size - len(self.token_ids) @property - def token_ids(self) -> list[int]: + def token_ids(self) -> List[int]: return self._token_ids @property diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index f5907a69bf452..1ca9e49dac371 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -2,9 +2,9 @@ """Token blocks.""" import sys from bisect import bisect_left -from collections.abc import Iterable from os.path import commonprefix -from typing import Callable, Optional +from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set, + Tuple) from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker, get_all_blocks_recursively) @@ -88,15 +88,15 @@ def __init__( # A mapping of prefix hash to block index. All blocks which have a # prefix hash will be in this dict, even if they have refcount 0. - self._cached_blocks: dict[PrefixHash, BlockId] = {} + self._cached_blocks: Dict[PrefixHash, BlockId] = {} # A list of immutable block IDs that have been touched by scheduler # and should be marked as computed after an entire batch of sequences # are scheduled. 
- self._touched_blocks: set[BlockId] = set() + self._touched_blocks: Set[BlockId] = set() # Used to track status of each physical block id - self._block_tracker: dict[BlockId, BlockTracker] = {} + self._block_tracker: Dict[BlockId, BlockTracker] = {} for block_id in block_ids: self._block_tracker[block_id] = BlockTracker() @@ -134,7 +134,7 @@ def __init__( def _create_block( self, prev_block: Optional[Block], - token_ids: list[int], + token_ids: List[int], block_size: int, allocator: BlockAllocator, block_id: Optional[int] = None, @@ -156,7 +156,7 @@ def _create_block( def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: list[int], + token_ids: List[int], extra_hash: Optional[int] = None, device: Optional[Device] = None) -> Block: """Allocates an immutable block with the given token IDs, reusing cached @@ -164,7 +164,7 @@ def allocate_immutable_block(self, Args: prev_block (Optional[Block]): The previous block in the sequence. - token_ids (list[int]): The token IDs to be stored in the block. + token_ids (List[int]): The token IDs to be stored in the block. Returns: Block: The allocated immutable block. @@ -197,9 +197,9 @@ def allocate_immutable_block(self, def allocate_immutable_blocks( self, prev_block: Optional[Block], - block_token_ids: list[list[int]], + block_token_ids: List[List[int]], extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> list[Block]: + device: Optional[Device] = None) -> List[Block]: blocks = [] for token_ids in block_token_ids: prev_block = self.allocate_immutable_block(prev_block=prev_block, @@ -376,7 +376,7 @@ def free(self, block: Block, keep_block_object: bool = False) -> None: if not keep_block_object: self._block_pool.free_block(block) - def fork(self, last_block: Block) -> list[Block]: + def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. @@ -384,12 +384,12 @@ def fork(self, last_block: Block) -> list[Block]: last_block (Block): The last block in the original sequence. Returns: - list[Block]: The new sequence of blocks that shares the same memory + List[Block]: The new sequence of blocks that shares the same memory as the original sequence. """ source_blocks = get_all_blocks_recursively(last_block) - forked_blocks: list[Block] = [] + forked_blocks: List[Block] = [] prev_block = None for block in source_blocks: block_id = block.block_id @@ -435,7 +435,7 @@ def get_physical_block_id(self, absolute_id: int) -> int: return sorted(self.all_block_ids).index(absolute_id) @property - def all_block_ids(self) -> frozenset[int]: + def all_block_ids(self) -> FrozenSet[int]: return self._hashless_allocator.all_block_ids def get_prefix_cache_hit_rate(self) -> float: @@ -551,16 +551,16 @@ def cow_block_if_not_appendable(self, block: Block) -> BlockId: return trg_block_id - def clear_copy_on_writes(self) -> list[tuple[BlockId, BlockId]]: + def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]: """Returns the copy-on-write source->destination mapping and clears it. Returns: - list[tuple[BlockId, BlockId]]: A list mapping source + List[Tuple[BlockId, BlockId]]: A list mapping source block indices to destination block indices. """ return self._cow_tracker.clear_cows() - def mark_blocks_as_accessed(self, block_ids: list[int], + def mark_blocks_as_accessed(self, block_ids: List[int], now: float) -> None: """Mark blocks as accessed, used in prefix caching. 
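The `allocate_immutable_block` hunk above reuses an already-cached block when the same content has been seen before, keyed by a hash that folds in the previous block's hash and the current token ids (see `hash_block_tokens` further down). A toy sketch of that reuse pattern; the class and method names are invented, not vLLM's API:

```python
from typing import Dict, List, Optional, Tuple


class ToyPrefixCache:
    def __init__(self) -> None:
        self._cached_blocks: Dict[int, int] = {}  # content hash -> block id
        self._next_block_id = 0

    def get_or_allocate(self, prev_block_hash: Optional[int],
                        token_ids: List[int]) -> Tuple[int, int]:
        # The content hash covers the whole prefix: previous hash + this block's tokens.
        content_hash = hash((prev_block_hash, tuple(token_ids)))
        cached = self._cached_blocks.get(content_hash)
        if cached is not None:
            return content_hash, cached      # hit: reuse the existing block
        block_id = self._next_block_id       # miss: hand out a fresh block id
        self._next_block_id += 1
        self._cached_blocks[content_hash] = block_id
        return content_hash, block_id


cache = ToyPrefixCache()
_, first = cache.get_or_allocate(None, [1, 2, 3])
_, second = cache.get_or_allocate(None, [1, 2, 3])
assert first == second  # identical prefix content resolves to the same block
```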
@@ -577,7 +577,7 @@ def mark_blocks_as_accessed(self, block_ids: list[int], raise ValueError( "Mark block as accessed which is not belonged to GPU") - def mark_blocks_as_computed(self, block_ids: list[int]) -> None: + def mark_blocks_as_computed(self, block_ids: List[int]) -> None: # Mark all touched blocks as computed. for block_id in self._touched_blocks: self._block_tracker[block_id].computed = True @@ -600,7 +600,7 @@ def block_is_computed(self, block_id: int) -> bool: return block_id in self.evictor def get_common_computed_block_ids( - self, computed_seq_block_ids: list[list[int]]) -> list[int]: + self, computed_seq_block_ids: List[List[int]]) -> List[int]: """Return the block ids that are common for a given sequence group. Only those blocks that are immutable and already be marked @@ -620,12 +620,12 @@ def get_common_computed_block_ids( if ids ]) - def get_num_full_blocks_touched(self, blocks: list[Block]) -> int: + def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: """Returns the number of full blocks that will be touched by swapping in/out. Args: - blocks: list of blocks to be swapped. + blocks: List of blocks to be swapped. Returns: int: the number of full blocks that will be touched by swapping in/out the given blocks. Non full blocks are ignored @@ -643,23 +643,23 @@ def get_num_full_blocks_touched(self, blocks: list[Block]) -> int: num_touched_blocks += 1 return num_touched_blocks - def swap_out(self, blocks: list[Block]) -> None: + def swap_out(self, blocks: List[Block]) -> None: """Execute the swap out actions. Basically just free the given blocks. Args: - blocks: list of blocks to be swapped out. + blocks: List of blocks to be swapped out. """ for block in blocks: self._free_block_id(block) - def swap_in(self, blocks: list[Block]) -> None: + def swap_in(self, blocks: List[Block]) -> None: """Execute the swap in actions. Change the block id from old allocator to current allocator for each block to finish the block table update. Args: - blocks: list of blocks to be swapped in. + blocks: List of blocks to be swapped in. """ for block in blocks: # Here we allocate either immutable or mutable block and then @@ -681,7 +681,7 @@ def swap_in(self, blocks: list[Block]) -> None: block.block_id = block_id # Assign block_id - def find_cached_blocks_prefix(self, block_hashes: list[int]) -> list[int]: + def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: """ Given a list of block hashes, return the prefix of the block hashes that are all cached. @@ -692,10 +692,10 @@ def find_cached_blocks_prefix(self, block_hashes: list[int]) -> list[int]: property, we can use binary search to find the prefix of cached blocks. Args: - block_hashes (list[int]): The list of block hashes. + block_hashes (List[int]): The list of block hashes. Returns: - list[int]: The prefix of the `block_hashes` that are cached. + List[int]: The prefix of the `block_hashes` that are cached. """ def _block_is_cached(block_hash: PrefixHash) -> bool: @@ -734,7 +734,7 @@ class PrefixCachingBlock(Block): Args: prev_block (Optional[PrefixCachingBlock]): The previous block in the sequence. - token_ids (list[int]): The initial token IDs to be stored in the block. + token_ids (List[int]): The initial token IDs to be stored in the block. block_size (int): The maximum number of token IDs that can be stored in the block. 
allocator (BlockAllocator): The prefix @@ -756,7 +756,7 @@ class PrefixCachingBlock(Block): def __init__( self, prev_block: Optional[Block], - token_ids: list[int], + token_ids: List[int], block_size: int, allocator: BlockAllocator, block_id: Optional[int] = None, @@ -826,12 +826,12 @@ def last_accessed(self) -> float: def last_accessed(self, last_accessed_ts: float): self._last_accessed = last_accessed_ts - def append_token_ids(self, token_ids: list[int]) -> None: + def append_token_ids(self, token_ids: List[int]) -> None: """Appends the given token IDs to the block and registers the block as immutable if the block becomes full. Args: - token_ids (list[int]): The token IDs to be appended to the block. + token_ids (List[int]): The token IDs to be appended to the block. """ # Ensure this is mutable block (not promoted) assert self.content_hash is None @@ -878,7 +878,7 @@ def block_size(self) -> int: return self._block.block_size @property - def token_ids(self) -> list[int]: + def token_ids(self) -> List[int]: return self._block.token_ids @property @@ -927,7 +927,7 @@ def content_hash(self) -> Optional[int]: def hash_block_tokens(cls, is_first_block: bool, prev_block_hash: Optional[int], - cur_block_token_ids: list[int], + cur_block_token_ids: List[int], extra_hash: Optional[int] = None) -> int: """Computes a hash value corresponding to the contents of a block and the contents of the preceding block(s). The hash value is used for @@ -938,7 +938,7 @@ def hash_block_tokens(cls, the sequence. - prev_block_hash (Optional[int]): The hash of the previous block. None if this is the first block. - - cur_block_token_ids (list[int]): A list of token ids in the current + - cur_block_token_ids (List[int]): A list of token ids in the current block. The current block is assumed to be full. - extra_hash (Optional[int]): The hash value of additional factors such as adapters that influence the block, apart from the token_ids. @@ -990,14 +990,14 @@ def __init__( # for the sequence when we need to check if the sequence is cached. # Note a block that's not full will not have its hash calculated and # recorded. - self._seq_id_to_blocks_hashes: dict[int, list[int]] = {} + self._seq_id_to_blocks_hashes: Dict[int, List[int]] = {} # A map from seq_id to the number of tokens that are cached for the # sequence. # We need this so that a sequence in continuous prefill doesn't # accidentally see its cached token count change. See comments in # `get_num_cached_tokens` for more details. 
- self._seq_id_to_num_tokens_computed: dict[int, int] = {} + self._seq_id_to_num_tokens_computed: Dict[int, int] = {} def _update_seq_hashes(self, seq: Sequence) -> None: """Incrementally update the sequence's block hashes and record them.""" @@ -1096,7 +1096,7 @@ class LastAccessBlocksTracker: def __init__(self, allocator): self._allocator = allocator - self._seq_last_access: dict[int, Optional[float]] = {} + self._seq_last_access: Dict[int, Optional[float]] = {} def add_seq(self, seq_id: int) -> None: """Start tracking seq_id @@ -1115,7 +1115,7 @@ def update_last_access(self, seq_id: int, time: float) -> None: self._seq_last_access[seq_id] = time def update_seq_blocks_last_access(self, seq_id: int, - block_ids: list[int]) -> None: + block_ids: List[int]) -> None: assert seq_id in self._seq_last_access ts = self._seq_last_access[seq_id] diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index b229bbb6d4391..c5b3b04f37ca3 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """A block manager that manages token blocks.""" -from collections.abc import Sequence as GenericSequence -from typing import Optional +from typing import Dict, List, Optional +from typing import Sequence as GenericSequence +from typing import Tuple from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator @@ -97,8 +98,8 @@ def __init__( block_size=block_size, ) - self.block_tables: dict[SeqId, BlockTable] = {} - self.cross_block_tables: dict[EncoderSeqId, BlockTable] = {} + self.block_tables: Dict[SeqId, BlockTable] = {} + self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {} self._computed_blocks_tracker = ComputedBlocksTracker( self.block_allocator, self.block_size, self.enable_caching) @@ -235,7 +236,7 @@ def append_slots( self, seq: Sequence, num_lookahead_slots: int, - ) -> list[tuple[int, int]]: + ) -> List[Tuple[int, int]]: block_table = self.block_tables[seq.seq_id] @@ -276,11 +277,11 @@ def free_cross(self, seq_group: SequenceGroup) -> None: self.cross_block_tables[request_id].free() del self.cross_block_tables[request_id] - def get_block_table(self, seq: Sequence) -> list[int]: + def get_block_table(self, seq: Sequence) -> List[int]: block_ids = self.block_tables[seq.seq_id].physical_block_ids return block_ids # type: ignore - def get_cross_block_table(self, seq_group: SequenceGroup) -> list[int]: + def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: request_id = seq_group.request_id assert request_id in self.cross_block_tables block_ids = self.cross_block_tables[request_id].physical_block_ids @@ -306,7 +307,7 @@ def mark_blocks_as_computed(self, seq_group: SequenceGroup, self.block_allocator.mark_blocks_as_computed([]) def get_common_computed_block_ids( - self, seqs: list[Sequence]) -> GenericSequence[int]: + self, seqs: List[Sequence]) -> GenericSequence[int]: """Determine which blocks for which we skip prefill. With prefix caching we can skip prefill for previously-generated blocks. @@ -356,7 +357,7 @@ def can_swap_in(self, seq_group: SequenceGroup, return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, num_lookahead_slots) - def swap_in(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: + def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: """Returns the block id mapping (from CPU to GPU) generated by swapping in the given seq_group with num_lookahead_slots. 
@@ -364,7 +365,7 @@ def swap_in(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: seq_group (SequenceGroup): The sequence group to swap in. Returns: - list[tuple[int, int]]: The mapping of swapping block from CPU + List[Tuple[int, int]]: The mapping of swapping block from CPU to GPU. """ physical_block_id_mapping = [] @@ -409,7 +410,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: SequenceStatus.RUNNING) return alloc_status == AllocStatus.OK - def swap_out(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: + def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: """Returns the block id mapping (from GPU to CPU) generated by swapping out the given sequence_group with num_lookahead_slots. @@ -417,7 +418,7 @@ def swap_out(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: sequence_group (SequenceGroup): The sequence group to swap out. Returns: - list[tuple[int, int]]: The mapping of swapping block from + List[Tuple[int, int]]: The mapping of swapping block from GPU to CPU. """ physical_block_id_mapping = [] @@ -482,7 +483,7 @@ def _can_swap(self, # swap. Then verify if there are available blocks in the device # to perform the swap. num_blocks_touched = 0 - blocks: list[Block] = [] + blocks: List[Block] = [] for seq in seq_group.get_seqs(status=status): block_table = self.block_tables[seq.seq_id] if block_table.blocks is not None: diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 68a2f704def8a..0e363eddc8a5e 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -3,6 +3,7 @@ import enum import heapq from abc import ABC, abstractmethod +from typing import Dict, List, Tuple class EvictionPolicy(enum.Enum): @@ -26,7 +27,7 @@ def __contains__(self, block_id: int) -> bool: pass @abstractmethod - def evict(self) -> tuple[int, int]: + def evict(self) -> Tuple[int, int]: """Runs the eviction algorithm and returns the evicted block's content hash along with physical block id along with physical block id """ @@ -83,13 +84,13 @@ class LRUEvictor(Evictor): CLEANUP_THRESHOLD = 50 def __init__(self): - self.free_table: dict[int, BlockMetaData] = {} + self.free_table: Dict[int, BlockMetaData] = {} self.priority_queue = [] def __contains__(self, block_id: int) -> bool: return block_id in self.free_table - def evict(self) -> tuple[int, int]: + def evict(self) -> Tuple[int, int]: if len(self.free_table) == 0: raise ValueError("No usable cache memory left") @@ -127,7 +128,7 @@ def _cleanup_if_necessary(self): self._cleanup() def _cleanup(self): - new_priority_queue: list[tuple[float, int, int, int]] = [] + new_priority_queue: List[Tuple[float, int, int, int]] = [] for block_id, block in self.free_table.items(): new_priority_queue.append( diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 819f372490ec9..b48ba87e95a0b 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -2,7 +2,9 @@ import enum from abc import ABC, abstractmethod -from collections.abc import Sequence as GenericSequence +from typing import List +from typing import Sequence as GenericSequence +from typing import Tuple from vllm.sequence import Sequence, SequenceGroup from vllm.utils import Device @@ -59,7 +61,7 @@ def append_slots( self, seq: Sequence, num_lookahead_slots: int, - ) -> list[tuple[int, int]]: + ) -> List[Tuple[int, int]]: pass @abstractmethod @@ -72,7 +74,7 @@ def can_swap_in(self, seq_group: SequenceGroup, pass @abstractmethod - def swap_in(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: + def swap_in(self, 
seq_group: SequenceGroup) -> List[Tuple[int, int]]: pass @abstractmethod @@ -80,7 +82,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: pass @abstractmethod - def swap_out(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: + def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: pass @abstractmethod @@ -88,7 +90,7 @@ def free(self, seq: Sequence) -> None: pass @abstractmethod - def get_block_table(self, seq: Sequence) -> list[int]: + def get_block_table(self, seq: Sequence) -> List[int]: pass @abstractmethod @@ -109,7 +111,7 @@ def access_all_blocks_in_seq( @abstractmethod def get_common_computed_block_ids( - self, seqs: list[Sequence]) -> GenericSequence[int]: + self, seqs: List[Sequence]) -> GenericSequence[int]: pass @abstractmethod diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index a4f721544606a..70c22afa8e158 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import List, Tuple + from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup from vllm.utils import Device @@ -40,7 +42,7 @@ def append_slots( self, seq: Sequence, num_lookahead_slots: int, - ) -> list[tuple[int, int]]: + ) -> List[Tuple[int, int]]: return [] def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: @@ -50,20 +52,20 @@ def can_swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> AllocStatus: return AllocStatus.OK - def swap_in(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: + def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: return None # type: ignore def can_swap_out(self, seq_group: SequenceGroup) -> bool: return True - def swap_out(self, seq_group: SequenceGroup) -> list[tuple[int, int]]: + def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: return None # type: ignore def free(self, seq: Sequence) -> None: # No operation on free return - def get_block_table(self, seq: Sequence) -> list[int]: + def get_block_table(self, seq: Sequence) -> List[int]: return None # type: ignore def get_num_free_gpu_blocks(self) -> int: @@ -80,7 +82,7 @@ def access_all_blocks_in_seq( pass def get_common_computed_block_ids(self, - seq_group: list[Sequence]) -> list[int]: + seq_group: List[Sequence]) -> List[int]: return [] def mark_blocks_as_computed(self, seq_group: SequenceGroup, diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 2a43878ea395b..3cdad496e8435 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -5,10 +5,10 @@ import random import time from collections import deque -from collections.abc import Iterable -from collections.abc import Sequence as GenericSequence from dataclasses import dataclass, field -from typing import Callable, Optional, Union +from typing import Callable, Deque, Dict, Iterable, List, Optional +from typing import Sequence as GenericSequence +from typing import Set, Tuple, Union from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus, BlockSpaceManager @@ -58,8 +58,8 @@ class SchedulingBudget: token_budget: int max_num_seqs: int - _request_ids_num_batched_tokens: set[str] = field(default_factory=set) - _request_ids_num_curr_seqs: set[str] = field(default_factory=set) + _request_ids_num_batched_tokens: Set[str] = field(default_factory=set) + _request_ids_num_curr_seqs: 
Set[str] = field(default_factory=set) # Number of cached tokens in the batch. _num_cached_tokens: int = 0 # Number of actual non-cached tokens in the batch. @@ -141,14 +141,14 @@ class SchedulerOutputs: num_prefill_groups: int # Total number of batched tokens. num_batched_tokens: int - # Blocks to swap in. list of CPU -> GPU block number. - blocks_to_swap_in: list[tuple[int, int]] - # Blocks to swap out. list of GPU -> CPU block number. - blocks_to_swap_out: list[tuple[int, int]] + # Blocks to swap in. List of CPU -> GPU block number. + blocks_to_swap_in: List[Tuple[int, int]] + # Blocks to swap out. List of GPU -> CPU block number. + blocks_to_swap_out: List[Tuple[int, int]] # Blocks to copy. Source to dest block. - blocks_to_copy: list[tuple[int, int]] + blocks_to_copy: List[Tuple[int, int]] # Sequence groups that are going to be ignored. - ignored_seq_groups: list[SequenceGroup] + ignored_seq_groups: List[SequenceGroup] # The number of slots for lookahead decoding. num_lookahead_slots: int # The number of requests in the running queue @@ -185,7 +185,7 @@ def key_fn(group: ScheduledSequenceGroup): key=key_fn) @property - def lora_requests(self) -> set[LoRARequest]: + def lora_requests(self) -> Set[LoRARequest]: return { g.seq_group.lora_request for g in self.scheduled_seq_groups @@ -193,7 +193,7 @@ def lora_requests(self) -> set[LoRARequest]: } @property - def prompt_adapter_requests(self) -> set[PromptAdapterRequest]: + def prompt_adapter_requests(self) -> Set[PromptAdapterRequest]: return { g.seq_group.prompt_adapter_request for g in self.scheduled_seq_groups @@ -210,24 +210,24 @@ class SchedulerRunningOutputs: """ # Selected sequences that are running and in a decoding phase. - decode_seq_groups: list[ScheduledSequenceGroup] + decode_seq_groups: List[ScheduledSequenceGroup] # Selected sequences that are running and in a prefill phase. # I.e., it means the prefill has been chunked. - prefill_seq_groups: list[ScheduledSequenceGroup] + prefill_seq_groups: List[ScheduledSequenceGroup] # The preempted sequences. - preempted: list[SequenceGroup] + preempted: List[SequenceGroup] # Sequences that are swapped out. - swapped_out: list[SequenceGroup] + swapped_out: List[SequenceGroup] # The blocks to swap out. - blocks_to_swap_out: list[tuple[int, int]] + blocks_to_swap_out: List[Tuple[int, int]] # The blocks to copy. - blocks_to_copy: list[tuple[int, int]] + blocks_to_copy: List[Tuple[int, int]] # The number of slots for lookahead decoding. num_lookahead_slots: int # Optimization for fast-access to seq_group lists - decode_seq_groups_list: list[SequenceGroup] - prefill_seq_groups_list: list[SequenceGroup] + decode_seq_groups_list: List[SequenceGroup] + prefill_seq_groups_list: List[SequenceGroup] @classmethod def create_empty(cls) -> "SchedulerRunningOutputs": @@ -253,18 +253,18 @@ class SchedulerSwappedInOutputs: # Selected sequences that are going to be swapped in and is in a # decoding phase. - decode_seq_groups: list[ScheduledSequenceGroup] + decode_seq_groups: List[ScheduledSequenceGroup] # Selected sequences that are going to be swapped in and in a prefill # phase. I.e., it means the prefill has been chunked. - prefill_seq_groups: list[ScheduledSequenceGroup] + prefill_seq_groups: List[ScheduledSequenceGroup] # The blocks to swap in. - blocks_to_swap_in: list[tuple[int, int]] + blocks_to_swap_in: List[Tuple[int, int]] # The blocks to copy. - blocks_to_copy: list[tuple[int, int]] + blocks_to_copy: List[Tuple[int, int]] # The number of slots for lookahead decoding. 
num_lookahead_slots: int # Infeasible sequence groups. - infeasible_seq_groups: list[SequenceGroup] + infeasible_seq_groups: List[SequenceGroup] @classmethod def create_empty(cls) -> "SchedulerSwappedInOutputs": @@ -287,9 +287,9 @@ class SchedulerPrefillOutputs: """ # Selected sequences for prefill. - seq_groups: list[ScheduledSequenceGroup] + seq_groups: List[ScheduledSequenceGroup] # Ignored sequence groups. - ignored_seq_groups: list[SequenceGroup] + ignored_seq_groups: List[SequenceGroup] num_lookahead_slots: int @classmethod @@ -372,8 +372,8 @@ def maybe_increment_partial_prefills(self, @classmethod def from_queues( cls, - running: deque[SequenceGroup], - waiting: deque[SequenceGroup], + running: Deque[SequenceGroup], + waiting: Deque[SequenceGroup], scheduler_config: SchedulerConfig, ) -> "PartialPrefillMetadata": """Create a PartialPrefillMetadata object from the current state of @@ -465,18 +465,18 @@ def __init__( # Sequence groups in the WAITING state. # Contain new prefill or preempted requests. - self.waiting: deque[SequenceGroup] = deque() + self.waiting: Deque[SequenceGroup] = deque() # Sequence groups in the RUNNING state. # Contain decode requests. - self.running: deque[SequenceGroup] = deque() + self.running: Deque[SequenceGroup] = deque() # Sequence groups in the SWAPPED state. # Contain decode requests that are swapped out. - self.swapped: deque[SequenceGroup] = deque() + self.swapped: Deque[SequenceGroup] = deque() # Sequence groups finished requests ids since last step iteration. # It lets the model know that any state associated with these requests # can and must be released after the current step. # This is used to evict the finished requests from the Mamba cache. - self._finished_requests_ids: list[str] = list() + self._finished_requests_ids: List[str] = list() # Time at previous scheduling step self.prev_time = 0.0 # Did we schedule a prompt at previous step? @@ -495,9 +495,9 @@ def __init__( self.num_cumulative_preemption: int = 0 # Used to cache python objects - self._seq_group_metadata_cache: list[PyObjectCache] = [] - self._scheduler_running_outputs_cache: list[PyObjectCache] = [] - self._scheduled_seq_group_cache: list[PyObjectCache] = [] + self._seq_group_metadata_cache: List[PyObjectCache] = [] + self._scheduler_running_outputs_cache: List[PyObjectCache] = [] + self._scheduled_seq_group_cache: List[PyObjectCache] = [] # For async output processing, we need to swap cache buffers between # iterations. I.e. since the output processing is lagged one step, @@ -520,7 +520,7 @@ def __init__( # when the request reaches max_model_len. In this case, the request # will be stopped during schedule() call and added to this stop list # for processing and deallocation by the free_finished_seq_groups() - self._async_stopped: list[SequenceGroup] = [] + self._async_stopped: List[SequenceGroup] = [] # List with the chunk sizes to hand out to each sequence depending # on how many partial prefills are running. 
This is slightly faster than @@ -578,7 +578,7 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: request_id = (request_id, ) request_ids = set(request_id) for state_queue in [self.waiting, self.running, self.swapped]: - aborted_groups: list[SequenceGroup] = [] + aborted_groups: List[SequenceGroup] = [] for seq_group in state_queue: if not request_ids: # Using 'break' here may add two extra iterations, @@ -625,7 +625,7 @@ def reset_prefix_cache(self) -> bool: def get_num_unfinished_seq_groups(self) -> int: return len(self.waiting) + len(self.running) + len(self.swapped) - def get_and_reset_finished_requests_ids(self) -> list[str]: + def get_and_reset_finished_requests_ids(self) -> List[str]: """Flushes the list of request ids of previously finished seq_groups.""" finished_requests_ids = self._finished_requests_ids self._finished_requests_ids = list() @@ -634,7 +634,7 @@ def get_and_reset_finished_requests_ids(self) -> list[str]: def _schedule_running( self, budget: SchedulingBudget, - curr_loras: Optional[set[int]], + curr_loras: Optional[Set[int]], enable_chunking: bool = False, partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, ) -> SchedulerRunningOutputs: @@ -673,14 +673,14 @@ def _schedule_running( ret.prefill_seq_groups_list.clear() # Blocks that need to be swapped or copied before model execution. - blocks_to_swap_out: list[tuple[int, int]] = ret.blocks_to_swap_out - blocks_to_copy: list[tuple[int, int]] = ret.blocks_to_copy + blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out + blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy - decode_seq_groups: list[ScheduledSequenceGroup] = ret.decode_seq_groups - prefill_seq_groups: list[ + decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups + prefill_seq_groups: List[ ScheduledSequenceGroup] = ret.prefill_seq_groups - preempted: list[SequenceGroup] = ret.preempted - swapped_out: list[SequenceGroup] = ret.swapped_out + preempted: List[SequenceGroup] = ret.preempted + swapped_out: List[SequenceGroup] = ret.swapped_out running_queue = self.running assert len(self._async_stopped) == 0 @@ -806,7 +806,7 @@ def _schedule_running( def _schedule_swapped( self, budget: SchedulingBudget, - curr_loras: Optional[set[int]], + curr_loras: Optional[Set[int]], enable_chunking: bool = False, ) -> SchedulerSwappedInOutputs: """Schedule sequence groups that are swapped out. @@ -829,15 +829,15 @@ def _schedule_swapped( SchedulerSwappedInOutputs. """ # Blocks that need to be swapped or copied before model execution. - blocks_to_swap_in: list[tuple[int, int]] = [] - blocks_to_copy: list[tuple[int, int]] = [] - decode_seq_groups: list[ScheduledSequenceGroup] = [] - prefill_seq_groups: list[ScheduledSequenceGroup] = [] - infeasible_seq_groups: list[SequenceGroup] = [] + blocks_to_swap_in: List[Tuple[int, int]] = [] + blocks_to_copy: List[Tuple[int, int]] = [] + decode_seq_groups: List[ScheduledSequenceGroup] = [] + prefill_seq_groups: List[ScheduledSequenceGroup] = [] + infeasible_seq_groups: List[SequenceGroup] = [] swapped_queue = self.swapped - leftover_swapped: deque[SequenceGroup] = deque() + leftover_swapped: Deque[SequenceGroup] = deque() while swapped_queue: seq_group = swapped_queue[0] @@ -939,7 +939,7 @@ def _get_prompt_limit(self, seq_group: SequenceGroup) -> int: return prompt_limit def _get_priority(self, - seq_group: SequenceGroup) -> tuple[Optional[int], float]: + seq_group: SequenceGroup) -> Tuple[Optional[int], float]: """Get the priority of the sequence group. 
Highest preference to user-defined priority, followed by arrival time. Args: @@ -967,7 +967,7 @@ def _schedule_priority_preemption( running_queue = deque(sorted(self.running, key=self._get_priority)) - blocks_to_swap_out: list[tuple[int, int]] = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] force_preemption_count = 0 if waiting_queue: @@ -1017,7 +1017,7 @@ def _schedule_priority_preemption( def _schedule_prefills( self, budget: SchedulingBudget, - curr_loras: Optional[set[int]], + curr_loras: Optional[Set[int]], enable_chunking: bool = False, partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, ) -> SchedulerPrefillOutputs: @@ -1054,12 +1054,12 @@ def _schedule_prefills( num_lookahead_slots=self._get_num_lookahead_slots( is_prefill=True, enable_chunking=enable_chunking), ) - ignored_seq_groups: list[SequenceGroup] = [] - seq_groups: list[ScheduledSequenceGroup] = [] + ignored_seq_groups: List[SequenceGroup] = [] + seq_groups: List[ScheduledSequenceGroup] = [] waiting_queue = self.waiting - leftover_waiting_sequences: deque[SequenceGroup] = deque() + leftover_waiting_sequences: Deque[SequenceGroup] = deque() while self._passed_delay(time.time()) and waiting_queue: seq_group = waiting_queue[0] @@ -1162,7 +1162,7 @@ def _schedule_prefills( seq_group) if enable_chunking and self.scheduler_config.is_multi_step: - blocks_to_copy: list[tuple[int, int]] = [] + blocks_to_copy: List[Tuple[int, int]] = [] # init_multi_step_from_lookahead_slots happens in append_slots self._append_slots(seq_group, blocks_to_copy, enable_chunking) # This assert will trip when a copy-on-write happens. This is @@ -1325,7 +1325,7 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: token_budget=self.scheduler_config.max_num_batched_tokens, max_num_seqs=self.scheduler_config.max_num_seqs, ) - curr_loras: set[int] = set() + curr_loras: Set[int] = set() prefills = SchedulerPrefillOutputs.create_empty() swapped_in = SchedulerSwappedInOutputs.create_empty() @@ -1423,8 +1423,8 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: ) def _order_finishing_prefills_first( - self, scheduled_prefill_seqs: list[ScheduledSequenceGroup] - ) -> list[SequenceGroup]: + self, scheduled_prefill_seqs: List[ScheduledSequenceGroup] + ) -> List[SequenceGroup]: """Returns a list of prefilling SequenceGroups where sequences that are scheduled to finish prefilling are listed first""" finishing = [ @@ -1477,7 +1477,7 @@ def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool: def schedule( self - ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, bool]: + ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]: # Schedule sequence groups. # This function call changes the internal states of the scheduler # such as self.running, self.swapped, and self.waiting. @@ -1492,7 +1492,7 @@ def schedule( allow_async_output_proc: bool = self.use_async_output_proc # Create input data structures. 
- seq_group_metadata_list: list[SequenceGroupMetadata] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] for i, scheduled_seq_group in enumerate( scheduler_outputs.scheduled_seq_groups): seq_group = scheduled_seq_group.seq_group @@ -1505,9 +1505,9 @@ def schedule( seq_group_metadata.block_tables.clear() # seq_id -> SequenceData - seq_data: dict[int, SequenceData] = {} + seq_data: Dict[int, SequenceData] = {} # seq_id -> physical block numbers - block_tables: dict[int, list[int]] = {} + block_tables: Dict[int, List[int]] = {} if seq_group.is_encoder_decoder(): # Encoder associated with SequenceGroup @@ -1661,7 +1661,7 @@ def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None: self._free_finished_seqs(seq_group) def free_finished_seq_groups(self) -> None: - remaining: deque[SequenceGroup] = deque() + remaining: Deque[SequenceGroup] = deque() for seq_group in self.running: self._free_finished_seq_group(seq_group) if not seq_group.is_finished(): @@ -1689,7 +1689,7 @@ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None: def _append_slots( self, seq_group: SequenceGroup, - blocks_to_copy: list[tuple[int, int]], + blocks_to_copy: List[Tuple[int, int]], enable_chunking: bool = False, ) -> None: """Appends new slots to the sequences in the given sequence group. @@ -1697,7 +1697,7 @@ def _append_slots( Args: seq_group (SequenceGroup): The sequence group containing the sequences to append slots to. - blocks_to_copy (list[tuple[int, int]]): A list of tuple of two + blocks_to_copy (List[Tuple[int, int]]): A list of tuple of two ints, the first int is the source block index, and the second int is the destination block index. This list is updated with the new source and destination block indices for the appended @@ -1727,7 +1727,7 @@ def _append_slots( blocks_to_copy.extend(cows) def _preempt(self, seq_group: SequenceGroup, - blocks_to_swap_out: list[tuple[int, int]]) -> PreemptionMode: + blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than # swapping. However, when the sequence group has multiple sequences @@ -1786,14 +1786,14 @@ def _preempt_by_recompute( def _preempt_by_swap( self, seq_group: SequenceGroup, - blocks_to_swap_out: list[tuple[int, int]], + blocks_to_swap_out: List[Tuple[int, int]], ) -> None: self._swap_out(seq_group, blocks_to_swap_out) def _swap_in( self, seq_group: SequenceGroup, - blocks_to_swap_in: list[tuple[int, int]], + blocks_to_swap_in: List[Tuple[int, int]], ) -> None: mapping = self.block_manager.swap_in(seq_group) blocks_to_swap_in.extend(mapping) @@ -1803,7 +1803,7 @@ def _swap_in( def _swap_out( self, seq_group: SequenceGroup, - blocks_to_swap_out: list[tuple[int, int]], + blocks_to_swap_out: List[Tuple[int, int]], ) -> None: if not self.block_manager.can_swap_out(seq_group): # FIXME(woosuk): Abort the sequence group instead of aborting the @@ -1867,7 +1867,7 @@ def _get_num_new_uncached_and_cached_tokens( enable_chunking: bool, budget: SchedulingBudget, partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> tuple[int, int]: + ) -> Tuple[int, int]: """ Returns the number of new uncached and cached tokens to schedule for a given sequence group that's in a given `status`. 
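Editor's note: the scheduler and block-manager hunks above keep passing the same `List[Tuple[int, int]]` shape around; per the docstrings, swap-in, swap-out, and copy-on-write all describe block moves as (source block id, destination block id) pairs. A minimal sketch of how such a mapping might be consumed, using a made-up block store and ids that are not part of vLLM:

```python
from typing import Dict, List, Tuple

def apply_copies(store: Dict[int, object],
                 blocks_to_copy: List[Tuple[int, int]]) -> None:
    """Apply (src, dst) pairs, the same shape the scheduler hands to the worker."""
    for src, dst in blocks_to_copy:
        store[dst] = store[src]

# Hypothetical physical block store: block id -> payload.
blocks = {0: "kv-A", 1: "kv-B", 2: None, 3: None}
# e.g. a copy-on-write of block 0 into slot 2 and block 1 into slot 3.
apply_copies(blocks, [(0, 2), (1, 3)])
assert blocks[2] == "kv-A" and blocks[3] == "kv-B"
```

The same (src, dst) convention is what `swap_in`/`swap_out` return, only with CPU and GPU block numbers on either side of each pair.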
@@ -1982,7 +1982,7 @@ def _chunk_new_tokens_to_schedule( budget: SchedulingBudget, prompt_limit: int, num_new_tokens: int, - partial_prefill_budget_lookup_list: list[int], + partial_prefill_budget_lookup_list: List[int], partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, ) -> int: """ diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py index 5aa6e114e3594..7f63fc1437872 100644 --- a/vllm/device_allocator/cumem.py +++ b/vllm/device_allocator/cumem.py @@ -9,7 +9,7 @@ # the only successful approach is to call cuda driver API in C. import dataclasses from contextlib import contextmanager -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Dict, Optional, Tuple, Union import torch @@ -61,7 +61,7 @@ def find_loaded_library(lib_name) -> Optional[str]: libcudart = None # py_device, py_alignedSize, py_d_mem, py_p_memHandle -HandleType = tuple[int, int, int, int] +HandleType = Tuple[int, int, int, int] @dataclasses.dataclass @@ -140,9 +140,9 @@ def get_instance() -> "CuMemAllocator": return CuMemAllocator.instance def __init__(self): - self.pointer_to_data: dict[int, AllocationData] = {} + self.pointer_to_data: Dict[int, AllocationData] = {} self.current_tag: str = CuMemAllocator.default_tag - self.allocator_and_pools: dict[str, Any] = {} + self.allocator_and_pools: Dict[str, Any] = {} def python_malloc_callback(self, allocation_handle: HandleType) -> None: """ @@ -164,7 +164,7 @@ def python_free_callback(self, ptr: int) -> HandleType: def sleep( self, - offload_tags: Optional[Union[tuple[str, ...], + offload_tags: Optional[Union[Tuple[str, ...], str]] = None) -> None: """ Put the allocator in sleep mode. diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 96af7a64c2fca..0228264f91f9a 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional, Union +from typing import Any, Dict, Optional, Union import torch import torch.distributed @@ -26,7 +26,7 @@ def tensor_model_parallel_gather(input_: torch.Tensor, return get_tp_group().gather(input_, dst, dim) -def broadcast_tensor_dict(tensor_dict: Optional[dict[Any, Union[torch.Tensor, +def broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0): if not torch.distributed.is_initialized(): diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py index 6c15ef644b8c2..1d53b1c5b8099 100644 --- a/vllm/distributed/device_communicators/cuda_wrapper.py +++ b/vllm/distributed/device_communicators/cuda_wrapper.py @@ -6,7 +6,7 @@ import ctypes from dataclasses import dataclass -from typing import Any, Optional +from typing import Any, Dict, List, Optional # this line makes it possible to directly load `libcudart.so` using `ctypes` import torch # noqa @@ -32,7 +32,7 @@ class cudaIpcMemHandle_t(ctypes.Structure): class Function: name: str restype: Any - argtypes: list[Any] + argtypes: List[Any] def find_loaded_library(lib_name) -> Optional[str]: @@ -97,11 +97,11 @@ class CudaRTLibrary: # class attribute to store the mapping from the path to the library # to avoid loading the same library multiple times - path_to_library_cache: dict[str, Any] = {} + path_to_library_cache: Dict[str, Any] = {} # class attribute to store the mapping from library path # to the corresponding dictionary - path_to_dict_mapping: dict[str, dict[str, 
Any]] = {} + path_to_dict_mapping: Dict[str, Dict[str, Any]] = {} def __init__(self, so_file: Optional[str] = None): if so_file is None: diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 46efa72ed101c..90f7f2d0f9823 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -2,7 +2,7 @@ import ctypes from contextlib import contextmanager -from typing import Optional, Union +from typing import List, Optional, Union import torch import torch.distributed as dist @@ -177,7 +177,7 @@ def __init__(self, @staticmethod def create_shared_buffer( size_in_bytes: int, - group: Optional[ProcessGroup] = None) -> list[int]: + group: Optional[ProcessGroup] = None) -> List[int]: """ Creates a shared buffer and returns a list of pointers representing the buffer on all processes in the group. @@ -190,7 +190,7 @@ def create_shared_buffer( handles = [None] * world_size dist.all_gather_object(handles, handle, group=group) - pointers: list[int] = [] + pointers: List[int] = [] for i, h in enumerate(handles): if i == rank: pointers.append(pointer.value) # type: ignore @@ -201,7 +201,7 @@ def create_shared_buffer( return pointers @staticmethod - def free_shared_buffer(pointers: list[int], + def free_shared_buffer(pointers: List[int], group: Optional[ProcessGroup] = None, rank: Optional[int] = None) -> None: if rank is None: diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 11b8b57fe2aed..d8d6eed2dd7ec 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -7,9 +7,8 @@ import subprocess import sys import tempfile -from collections.abc import Sequence from itertools import product -from typing import Optional +from typing import Dict, List, Optional, Sequence import torch.distributed as dist import torch.multiprocessing as mp @@ -150,7 +149,7 @@ def can_actually_p2p( p_src.join() p_tgt.join() assert p_src.exitcode == 0 and p_tgt.exitcode == 0 - result: list[bool] = [] + result: List[bool] = [] for src, tgt in zip(batch_src, batch_tgt): a = result_queue.get() b = result_queue.get() @@ -176,7 +175,7 @@ def can_actually_p2p( # e.g. used by different vllm engines. The device id in the cache file is a # **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number # of visible devices in the vllm engine. 
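Editor's note: the comment above describes a P2P capability table that is computed once by the local master and persisted to a cache file keyed by local device ids, and the surrounding hunks type it as `Dict[str, bool]`. A rough sketch of that read-through-cache idea, assuming a JSON file and a hypothetical "src->tgt" key format (the real key layout and probe are not shown in these hunks):

```python
import json
import os
from typing import Dict, Optional

_p2p_cache: Optional[Dict[str, bool]] = None

def p2p_access_check(src: int, tgt: int, path: str = "p2p_cache.json") -> bool:
    """Return the cached answer, building and persisting the table on first use."""
    global _p2p_cache
    if _p2p_cache is None:
        if os.path.exists(path):
            with open(path) as f:
                _p2p_cache = json.load(f)
        else:
            # Placeholder probe; the real check exercises the CUDA driver.
            _p2p_cache = {f"{i}->{j}": True for i in range(2) for j in range(2)}
            with open(path, "w") as f:
                json.dump(_p2p_cache, f)
    return _p2p_cache.get(f"{src}->{tgt}", False)
```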
-_gpu_p2p_access_cache: Optional[dict[str, bool]] = None +_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None def gpu_p2p_access_check(src: int, tgt: int) -> bool: @@ -205,7 +204,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: # only the local master process (with local_rank == 0) can # enter this block to calculate the cache logger.info("generating GPU P2P access cache in %s", path) - cache: dict[str, bool] = {} + cache: Dict[str, bool] = {} ids = list(range(num_dev)) # batch of all pairs of GPUs batch_src, batch_tgt = zip(*list(product(ids, ids))) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 6f69089b61968..4f04899e92e6d 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -24,7 +24,7 @@ import ctypes import platform from dataclasses import dataclass -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch from torch.distributed import ReduceOp @@ -121,7 +121,7 @@ def from_torch(cls, op: ReduceOp) -> int: class Function: name: str restype: Any - argtypes: list[Any] + argtypes: List[Any] class NCCLLibrary: @@ -210,11 +210,11 @@ class NCCLLibrary: # class attribute to store the mapping from the path to the library # to avoid loading the same library multiple times - path_to_library_cache: dict[str, Any] = {} + path_to_library_cache: Dict[str, Any] = {} # class attribute to store the mapping from library path # to the corresponding dictionary - path_to_dict_mapping: dict[str, dict[str, Any]] = {} + path_to_dict_mapping: Dict[str, Dict[str, Any]] = {} def __init__(self, so_file: Optional[str] = None): @@ -238,7 +238,7 @@ def __init__(self, so_file: Optional[str] = None): raise e if so_file not in NCCLLibrary.path_to_dict_mapping: - _funcs: dict[str, Any] = {} + _funcs: Dict[str, Any] = {} for func in NCCLLibrary.exported_functions: f = getattr(self.lib, func.name) f.restype = func.restype diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 1361207e04763..12a720d47fbba 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -7,7 +7,7 @@ from contextlib import contextmanager from dataclasses import dataclass, field from multiprocessing import shared_memory -from typing import Optional, Union +from typing import List, Optional, Tuple, Union from unittest.mock import patch import torch @@ -166,9 +166,9 @@ def get_metadata(self, current_idx: int): @dataclass class Handle: - local_reader_ranks: list[int] = field(default_factory=list) + local_reader_ranks: List[int] = field(default_factory=list) - buffer_handle: Optional[tuple[int, int, int, str]] = None + buffer_handle: Optional[Tuple[int, int, int, str]] = None local_subscribe_addr: Optional[str] = None remote_subscribe_addr: Optional[str] = None remote_addr_ipv6: bool = False @@ -180,7 +180,7 @@ def __init__( self, n_reader, # number of all readers n_local_reader, # number of local readers through shared memory - local_reader_ranks: Optional[list[int]] = None, + local_reader_ranks: Optional[List[int]] = None, max_chunk_bytes: int = 1024 * 1024 * 10, max_chunks: int = 10, connect_ip: Optional[str] = None, diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py index 92a472bbe600a..57c764b481c29 100644 --- 
a/vllm/distributed/kv_transfer/kv_connector/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/base.py @@ -8,7 +8,7 @@ """ from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, List, Tuple, Union import torch @@ -54,7 +54,7 @@ def send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: @@ -70,7 +70,7 @@ def send_kv_caches_and_hidden_states( start and end layer information. model_input (ModelInputForGPUWithSamplingMetadata): The input metadata from vLLM. - kv_caches (list[torch.Tensor]): list of KV caches (keys and values) + kv_caches (List[torch.Tensor]): List of KV caches (keys and values) for each layer. hidden_or_intermediate_states (Union[torch.Tensor, IntermediateTensors]): @@ -87,8 +87,8 @@ def send_kv_caches_and_hidden_states( def recv_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor] - ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: """ Receive KV caches and hidden states from the connector. @@ -103,8 +103,8 @@ def recv_kv_caches_and_hidden_states( The model executable from vLLM modelrunner. model_input (ModelInputForGPUWithSamplingMetadata): The model input from vLLM modelrunner. - kv_caches (list[torch.Tensor]): - list of KV caches for each layer. + kv_caches (List[torch.Tensor]): + List of KV caches for each layer. Returns: - hidden_or_intermediate_states (torch.Tensor or diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 4f45f29f36a07..7336c54ec8a30 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import importlib -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING, Callable, Dict, Type from .base import KVConnectorBase @@ -10,7 +10,7 @@ class KVConnectorFactory: - _registry: dict[str, Callable[[], type[KVConnectorBase]]] = {} + _registry: Dict[str, Callable[[], Type[KVConnectorBase]]] = {} @classmethod def register_connector(cls, name: str, module_path: str, @@ -19,7 +19,7 @@ def register_connector(cls, name: str, module_path: str, if name in cls._registry: raise ValueError(f"Connector '{name}' is already registered.") - def loader() -> type[KVConnectorBase]: + def loader() -> Type[KVConnectorBase]: module = importlib.import_module(module_path) return getattr(module, class_name) diff --git a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py index 4f003eefa4aa8..bf9117133af56 100644 --- a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py @@ -7,7 +7,7 @@ (2) offload and share KV caches. 
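Editor's note: the KVConnectorFactory hunk a little above registers each connector as a zero-argument loader, so the connector module is only imported when one is actually created. A stripped-down sketch of that lazy-registry pattern, using plain `object` in place of `KVConnectorBase` to stay self-contained:

```python
import importlib
from typing import Callable, Dict, Type

class LazyFactory:
    _registry: Dict[str, Callable[[], Type[object]]] = {}

    @classmethod
    def register(cls, name: str, module_path: str, class_name: str) -> None:
        if name in cls._registry:
            raise ValueError(f"'{name}' is already registered.")

        def loader() -> Type[object]:
            # Import is deferred until the first create() call.
            module = importlib.import_module(module_path)
            return getattr(module, class_name)

        cls._registry[name] = loader

    @classmethod
    def create(cls, name: str, *args, **kwargs):
        return cls._registry[name]()(*args, **kwargs)

# Example: register the stdlib OrderedDict lazily, then instantiate it.
LazyFactory.register("odict", "collections", "OrderedDict")
d = LazyFactory.create("odict", a=1)
```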
""" -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, List, Tuple, Union import torch @@ -61,8 +61,8 @@ def __init__( def recv_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor] - ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: hidden_or_intermediate_states = None @@ -80,7 +80,7 @@ def send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index 0f092c890ecee..2033e9762ac0b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -8,7 +8,7 @@ But the logic can be extended to support other pipe and lookup buffer. """ -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union import torch @@ -132,7 +132,7 @@ def __init__( ) def select(self, input_tokens: Optional[torch.Tensor], - roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]: + roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: assert self.consumer_buffer is not None, "Please initialize the "\ "consumer buffer before calling select." @@ -151,7 +151,7 @@ def send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: @@ -202,8 +202,8 @@ def send_kv_caches_and_hidden_states( def recv_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor] - ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: # When bypass_model_exec is set to False, it means that at least for one diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index f9a0e22f918fa..845da7c501e88 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -9,7 +9,7 @@ """ from abc import ABC, abstractmethod -from typing import Optional +from typing import List, Optional import torch @@ -71,7 +71,7 @@ def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, @abstractmethod def drop_select( self, input_tokens: Optional[torch.Tensor], - roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]: + roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: """Select and *drop* KV cache entries from the lookup buffer. The functionality is similar to the following python statements @@ -89,7 +89,7 @@ def drop_select( roi (torch.Tensor): A binary mask on top of the input tokens Returns: - list[Optional[torch.Tensor]]: A list of tensors. Can be None. + List[Optional[torch.Tensor]]: A list of tensors. 
Can be None. Raises: NotImplementedError: This method must be implemented in subclasses. diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index 641762c199c46..3462f7de020ef 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -11,7 +11,7 @@ """ import threading from collections import deque -from typing import Optional, Union +from typing import Deque, List, Optional, Union import torch @@ -38,7 +38,7 @@ def __init__(self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, data_pipe: on device (e.g. GPU) """ - self.buffer: deque[list[torch.Tensor]] = deque() + self.buffer: Deque[List[torch.Tensor]] = deque() self.buffer_size = 0 self.buffer_size_threshold = buffer_size_thresh @@ -50,8 +50,8 @@ def __init__(self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, self.normal_signal = torch.tensor([0], device="cpu") self.end_signal = None - def _matches(self, tokens_roi_sender: list[torch.Tensor], - tokens_roi_recver: list[torch.Tensor]): + def _matches(self, tokens_roi_sender: List[torch.Tensor], + tokens_roi_recver: List[torch.Tensor]): # tokens_roi_sender: tokens and roi of the producer (in the buffer) # tokens_roi_recver: tokens and roi of the consumer (query) @@ -88,7 +88,7 @@ def _send_tensor_and_dec_size(self, tensor = tensor.float() self.data_pipe.send_tensor(tensor) - def _get_element_size(self, data: Optional[Union[list, torch.Tensor]]): + def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]): if isinstance(data, torch.Tensor): return data.element_size() * data.numel() @@ -151,7 +151,7 @@ def drop_select_handler(self): tokens_roi_recver = [input_tokens, roi] def is_buffer_available( - tokens_roi_recver: list[torch.Tensor], ) -> bool: + tokens_roi_recver: List[torch.Tensor], ) -> bool: # perform input tokens and roi matching # FIXME: this matching is O(n), ideally it should be O(1) # but this buffer size won't (and shouldn't) be too large so @@ -184,7 +184,7 @@ def is_buffer_available( def drop_select( self, input_tokens: Optional[torch.Tensor], - roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]: + roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: assert self.request_handling_thread is None, \ "drop_select should be called by the KV cache consumer "\ diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index 37e1e4d8269f8..7aa53d07a9ef2 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -15,7 +15,7 @@ import threading import time from concurrent.futures import ThreadPoolExecutor -from typing import Callable, Optional +from typing import Callable, Dict, Optional, Tuple import torch @@ -35,7 +35,7 @@ def __init__(self, message): super().__init__(self.message) -Metadata = dict[str, Optional[torch.Tensor]] +Metadata = Dict[str, Optional[torch.Tensor]] class PyNcclPipe(KVPipeBase): @@ -81,7 +81,7 @@ def __init__(self, def _get_device_send_recv_impl( self, group: StatelessProcessGroup - ) -> tuple[Callable[[torch.Tensor, int], None], Callable[ + ) -> Tuple[Callable[[torch.Tensor, int], None], Callable[ [torch.Tensor, int], None]]: send: Callable[[torch.Tensor, int], None] diff --git a/vllm/distributed/kv_transfer/kv_transfer_agent.py b/vllm/distributed/kv_transfer/kv_transfer_agent.py index 2873ae9a86b97..1e80e0bd7de86 100644 --- 
a/vllm/distributed/kv_transfer/kv_transfer_agent.py +++ b/vllm/distributed/kv_transfer/kv_transfer_agent.py @@ -5,7 +5,7 @@ 1. `send_kv_caches_and_hidden_states` 2. `recv_kv_caches_and_hidden_states """ -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, List, Tuple, Union if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata @@ -53,7 +53,7 @@ def send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: @@ -68,8 +68,8 @@ def close(self) -> None: def recv_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor] - ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: return self.connector.recv_kv_caches_and_hidden_states( diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index dd38b699b39dd..86166dd5bb831 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -29,7 +29,8 @@ from contextlib import contextmanager, nullcontext from dataclasses import dataclass from multiprocessing import shared_memory -from typing import TYPE_CHECKING, Any, Callable, Optional, Union +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, + Union) from unittest.mock import patch import torch @@ -58,15 +59,15 @@ class GraphCaptureContext: def _split_tensor_dict( - tensor_dict: dict[str, Union[torch.Tensor, Any]] -) -> tuple[list[tuple[str, Any]], list[torch.Tensor]]: + tensor_dict: Dict[str, Union[torch.Tensor, Any]] +) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]: """Split the tensor dictionary into two parts: 1. A list of (key, value) pairs. If the value is a tensor, it is replaced by its metadata. 2. A list of tensors. 
""" - metadata_list: list[tuple[str, Any]] = [] - tensor_list: list[torch.Tensor] = [] + metadata_list: List[Tuple[str, Any]] = [] + tensor_list: List[torch.Tensor] = [] for key, value in tensor_dict.items(): if isinstance(value, torch.Tensor): # Note: we cannot use `value.device` here, @@ -82,7 +83,7 @@ def _split_tensor_dict( return metadata_list, tensor_list -_group_name_counter: dict[str, int] = {} +_group_name_counter: Dict[str, int] = {} def _get_unique_name(name: str) -> str: @@ -98,7 +99,7 @@ def _get_unique_name(name: str) -> str: return newname -_groups: dict[str, Callable[[], Optional["GroupCoordinator"]]] = {} +_groups: Dict[str, Callable[[], Optional["GroupCoordinator"]]] = {} def _register_group(group: "GroupCoordinator") -> None: @@ -138,7 +139,7 @@ class GroupCoordinator: # available attributes: rank: int # global rank - ranks: list[int] # global ranks in the group + ranks: List[int] # global ranks in the group world_size: int # size of the group # difference between `local_rank` and `rank_in_group`: # if we have a group of size 4 across two nodes: @@ -157,7 +158,7 @@ class GroupCoordinator: def __init__( self, - group_ranks: list[list[int]], + group_ranks: List[List[int]], local_rank: int, torch_distributed_backend: Union[str, Backend], use_device_communicator: bool, @@ -376,7 +377,7 @@ def broadcast_object(self, obj: Optional[Any] = None, src: int = 0): return recv[0] def broadcast_object_list(self, - obj_list: list[Any], + obj_list: List[Any], src: int = 0, group: Optional[ProcessGroup] = None): """Broadcast the input object list. @@ -459,11 +460,11 @@ def recv_object(self, src: int) -> Any: def broadcast_tensor_dict( self, - tensor_dict: Optional[dict[str, Union[torch.Tensor, Any]]] = None, + tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None, src: int = 0, group: Optional[ProcessGroup] = None, metadata_group: Optional[ProcessGroup] = None - ) -> Optional[dict[str, Union[torch.Tensor, Any]]]: + ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: """Broadcast the input tensor dictionary. NOTE: `src` is the local rank of the source rank. """ @@ -477,7 +478,7 @@ def broadcast_tensor_dict( rank_in_group = self.rank_in_group if rank_in_group == src: - metadata_list: list[tuple[Any, Any]] = [] + metadata_list: List[Tuple[Any, Any]] = [] assert isinstance( tensor_dict, dict), (f"Expecting a dictionary, got {type(tensor_dict)}") @@ -544,10 +545,10 @@ def broadcast_tensor_dict( def send_tensor_dict( self, - tensor_dict: dict[str, Union[torch.Tensor, Any]], + tensor_dict: Dict[str, Union[torch.Tensor, Any]], dst: Optional[int] = None, all_gather_group: Optional["GroupCoordinator"] = None, - ) -> Optional[dict[str, Union[torch.Tensor, Any]]]: + ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: """Send the input tensor dictionary. NOTE: `dst` is the local rank of the source rank. """ @@ -567,7 +568,7 @@ def send_tensor_dict( dst = (self.rank_in_group + 1) % self.world_size assert dst < self.world_size, f"Invalid dst rank ({dst})" - metadata_list: list[tuple[Any, Any]] = [] + metadata_list: List[Tuple[Any, Any]] = [] assert isinstance( tensor_dict, dict), f"Expecting a dictionary, got {type(tensor_dict)}" @@ -602,7 +603,7 @@ def recv_tensor_dict( self, src: Optional[int] = None, all_gather_group: Optional["GroupCoordinator"] = None, - ) -> Optional[dict[str, Union[torch.Tensor, Any]]]: + ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: """Recv the input tensor dictionary. NOTE: `src` is the local rank of the source rank. 
""" @@ -623,7 +624,7 @@ def recv_tensor_dict( assert src < self.world_size, f"Invalid src rank ({src})" recv_metadata_list = self.recv_object(src=src) - tensor_dict: dict[str, Any] = {} + tensor_dict: Dict[str, Any] = {} for key, value in recv_metadata_list: if isinstance(value, TensorMetadata): tensor = torch.empty(value.size, @@ -707,7 +708,7 @@ def get_world_group() -> GroupCoordinator: return _WORLD -def init_world_group(ranks: list[int], local_rank: int, +def init_world_group(ranks: List[int], local_rank: int, backend: str) -> GroupCoordinator: return GroupCoordinator( group_ranks=[ranks], @@ -719,7 +720,7 @@ def init_world_group(ranks: list[int], local_rank: int, def init_model_parallel_group( - group_ranks: list[list[int]], + group_ranks: List[List[int]], local_rank: int, backend: str, use_message_queue_broadcaster: bool = False, @@ -1088,7 +1089,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup], - source_rank: int = 0) -> list[bool]: + source_rank: int = 0) -> List[bool]: """ This is a collective operation that returns if each rank is in the same node as the source rank. It tests if processes are attached to the same diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index b518e6272aedb..d6fca4f0221b8 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -8,8 +8,7 @@ import pickle import time from collections import deque -from collections.abc import Sequence -from typing import Any, Optional +from typing import Any, Deque, Dict, Optional, Sequence, Tuple import torch from torch.distributed import ProcessGroup, TCPStore @@ -66,7 +65,7 @@ def split_tensor_along_last_dim( def get_pp_indices(num_hidden_layers: int, pp_rank: int, - pp_size: int) -> tuple[int, int]: + pp_size: int) -> Tuple[int, int]: """Try to evenly distribute layers across partitions. If the number of layers is not divisible by the number of partitions, @@ -124,15 +123,15 @@ class StatelessProcessGroup: data_expiration_seconds: int = 3600 # 1 hour # dst rank -> counter - send_dst_counter: dict[int, int] = dataclasses.field(default_factory=dict) + send_dst_counter: Dict[int, int] = dataclasses.field(default_factory=dict) # src rank -> counter - recv_src_counter: dict[int, int] = dataclasses.field(default_factory=dict) + recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict) broadcast_send_counter: int = 0 - broadcast_recv_src_counter: dict[int, int] = dataclasses.field( + broadcast_recv_src_counter: Dict[int, int] = dataclasses.field( default_factory=dict) # A deque to store the data entries, with key and timestamp. 
- entries: deque[tuple[str, + entries: Deque[Tuple[str, float]] = dataclasses.field(default_factory=deque) def __post_init__(self): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 187d8ebee9d55..1a2f794c9151d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -3,9 +3,9 @@ import argparse import dataclasses import json -from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Literal, Optional, Union, cast, get_args +from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Mapping, Optional, + Tuple, Type, Union, cast, get_args) import torch @@ -65,7 +65,7 @@ def nullable_kvs(val: str) -> Optional[Mapping[str, int]]: if len(val) == 0: return None - out_dict: dict[str, int] = {} + out_dict: Dict[str, int] = {} for item in val.split(","): kv_parts = [part.lower().strip() for part in item.split("=")] if len(kv_parts) != 2: @@ -91,7 +91,7 @@ def nullable_kvs(val: str) -> Optional[Mapping[str, int]]: class EngineArgs: """Arguments for vLLM engine.""" model: str = 'facebook/opt-125m' - served_model_name: Optional[Union[str, list[str]]] = None + served_model_name: Optional[Union[str, List[str]]] = None tokenizer: Optional[str] = None hf_config_path: Optional[str] = None task: TaskOption = "auto" @@ -110,7 +110,7 @@ class EngineArgs: # is intended for expert use only. The API may change without # notice. distributed_executor_backend: Optional[Union[str, - type[ExecutorBase]]] = None + Type[ExecutorBase]]] = None # number of P/D disaggregation (or other disaggregation) workers pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 @@ -131,7 +131,7 @@ class EngineArgs: disable_log_stats: bool = False revision: Optional[str] = None code_revision: Optional[str] = None - rope_scaling: Optional[dict[str, Any]] = None + rope_scaling: Optional[Dict[str, Any]] = None rope_theta: Optional[float] = None hf_overrides: Optional[HfOverrides] = None tokenizer_revision: Optional[str] = None @@ -143,10 +143,10 @@ class EngineArgs: # Note: Specifying a tokenizer pool by passing a class # is intended for expert use only. The API may change without # notice. 
- tokenizer_pool_type: Union[str, type["BaseTokenizerGroup"]] = "ray" - tokenizer_pool_extra_config: Optional[dict[str, Any]] = None + tokenizer_pool_type: Union[str, Type["BaseTokenizerGroup"]] = "ray" + tokenizer_pool_extra_config: Optional[Dict[str, Any]] = None limit_mm_per_prompt: Optional[Mapping[str, int]] = None - mm_processor_kwargs: Optional[dict[str, Any]] = None + mm_processor_kwargs: Optional[Dict[str, Any]] = None disable_mm_preprocessor_cache: bool = False enable_lora: bool = False enable_lora_bias: bool = False @@ -157,7 +157,7 @@ class EngineArgs: max_prompt_adapter_token: int = 0 fully_sharded_loras: bool = False lora_extra_vocab_size: int = 256 - long_lora_scaling_factors: Optional[tuple[float]] = None + long_lora_scaling_factors: Optional[Tuple[float]] = None lora_dtype: Optional[Union[str, torch.dtype]] = 'auto' max_cpu_loras: Optional[int] = None device: str = 'auto' @@ -167,7 +167,7 @@ class EngineArgs: num_gpu_blocks_override: Optional[int] = None num_lookahead_slots: int = 0 model_loader_extra_config: Optional[dict] = None - ignore_patterns: Optional[Union[str, list[str]]] = None + ignore_patterns: Optional[Union[str, List[str]]] = None preemption_mode: Optional[str] = None scheduler_delay_factor: float = 0.0 @@ -196,9 +196,9 @@ class EngineArgs: collect_detailed_traces: Optional[str] = None disable_async_output_proc: bool = False scheduling_policy: Literal["fcfs", "priority"] = "fcfs" - scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler" + scheduler_cls: Union[str, Type[object]] = "vllm.core.scheduler.Scheduler" - override_neuron_config: Optional[dict[str, Any]] = None + override_neuron_config: Optional[Dict[str, Any]] = None override_pooler_config: Optional[PoolerConfig] = None compilation_config: Optional[CompilationConfig] = None worker_cls: str = "auto" @@ -206,13 +206,13 @@ class EngineArgs: kv_transfer_config: Optional[KVTransferConfig] = None generation_config: Optional[str] = None - override_generation_config: Optional[dict[str, Any]] = None + override_generation_config: Optional[Dict[str, Any]] = None enable_sleep_mode: bool = False model_impl: str = "auto" calculate_kv_scales: Optional[bool] = None - additional_config: Optional[dict[str, Any]] = None + additional_config: Optional[Dict[str, Any]] = None def __post_init__(self): if not self.tokenizer: diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1f7041eff149e..93d9b74d8e1e8 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -4,9 +4,9 @@ import copy import time import weakref -from collections.abc import AsyncGenerator, Coroutine, Iterable, Mapping from functools import partial -from typing import Any, Callable, Optional, Union, overload +from typing import (Any, AsyncGenerator, Callable, Coroutine, Dict, Iterable, + List, Mapping, Optional, Set, Tuple, Type, Union, overload) from weakref import ReferenceType from typing_extensions import deprecated @@ -93,7 +93,7 @@ def put(self, item: Union[RequestOutput, PoolingRequestOutput, def finish( self, - exception: Optional[Union[BaseException, type[BaseException]]] = None, + exception: Optional[Union[BaseException, Type[BaseException]]] = None, ) -> None: if not self._finished: self._finished = True @@ -130,9 +130,9 @@ class RequestTracker: """Synchronous abstraction for tracking requests.""" def __init__(self) -> None: - self._request_streams: dict[str, AsyncStream] = {} + self._request_streams: Dict[str, AsyncStream] = {} self._aborted_requests: asyncio.Queue[str] 
= asyncio.Queue() - self._new_requests: asyncio.Queue[tuple[AsyncStream, + self._new_requests: asyncio.Queue[Tuple[AsyncStream, dict]] = asyncio.Queue() self.new_requests_event = asyncio.Event() @@ -216,7 +216,7 @@ def abort_request(self, request_id: str, *, exception: Optional[Union[BaseException, - type[BaseException]]] = None, + Type[BaseException]]] = None, verbose: bool = False) -> None: """Abort a request during next background loop iteration.""" if verbose: @@ -228,11 +228,11 @@ def abort_request(self, if stream is not None: stream.finish(exception=exception) - def get_new_and_aborted_requests(self) -> tuple[list[dict], set[str]]: + def get_new_and_aborted_requests(self) -> Tuple[List[Dict], Set[str]]: """Get the new requests and finished requests to be sent to the engine.""" - new_requests: list[dict] = [] - finished_requests: set[str] = set() + new_requests: List[Dict] = [] + finished_requests: Set[str] = set() while not self._aborted_requests.empty(): request_id = self._aborted_requests.get_nowait() @@ -268,7 +268,7 @@ def __init__(self, *args, **kwargs): async def step_async( self, virtual_engine: int - ) -> list[Union[RequestOutput, PoolingRequestOutput]]: + ) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. The workers are ran asynchronously if possible. @@ -583,7 +583,7 @@ class AsyncLLMEngine(EngineClient): **kwargs: Arguments for :class:`LLMEngine`. """ - _engine_class: type[_AsyncLLMEngine] = _AsyncLLMEngine + _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine def __init__(self, *args, @@ -621,7 +621,7 @@ def __del__(self): @classmethod def _get_executor_cls(cls, - engine_config: VllmConfig) -> type[ExecutorBase]: + engine_config: VllmConfig) -> Type[ExecutorBase]: return LLMEngine._get_executor_cls(engine_config) @classmethod @@ -631,7 +631,7 @@ def from_engine_args( engine_config: Optional[VllmConfig] = None, start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, ) -> "AsyncLLMEngine": """Creates an async LLM engine from the engine arguments.""" # Create the engine configs. 
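Editor's note: the RequestTracker hunks above drain two asyncio queues into a (new_requests, finished_requests) pair on each background-loop iteration. A compact sketch of that drain step, with plain dicts in place of AsyncStream and no engine wiring:

```python
import asyncio
from typing import Dict, List, Set, Tuple

class MiniTracker:
    def __init__(self) -> None:
        self._new_requests: "asyncio.Queue[Dict]" = asyncio.Queue()
        self._aborted_requests: "asyncio.Queue[str]" = asyncio.Queue()

    def get_new_and_aborted_requests(self) -> Tuple[List[Dict], Set[str]]:
        new_requests: List[Dict] = []
        finished_requests: Set[str] = set()
        while not self._aborted_requests.empty():
            finished_requests.add(self._aborted_requests.get_nowait())
        while not self._new_requests.empty():
            req = self._new_requests.get_nowait()
            if req["request_id"] in finished_requests:
                # Skip requests that were aborted before ever being scheduled.
                continue
            new_requests.append(req)
        return new_requests, finished_requests
```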
@@ -1156,7 +1156,7 @@ async def get_lora_config(self) -> LoRAConfig: async def do_log_stats( self, scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[list[SamplerOutput]] = None) -> None: + model_output: Optional[List[SamplerOutput]] = None) -> None: self.engine.do_log_stats() async def check_health(self) -> None: diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py index 88b4a3e9e7e81..aa54c0693941f 100644 --- a/vllm/engine/async_timeout.py +++ b/vllm/engine/async_timeout.py @@ -10,7 +10,7 @@ import sys import warnings from types import TracebackType -from typing import Any, Optional +from typing import Any, Optional, Type if sys.version_info[:2] >= (3, 11): from asyncio import timeout as asyncio_timeout @@ -77,7 +77,7 @@ def __enter__(self) -> "Timeout": def __exit__( self, - exc_type: Optional[type[BaseException]], + exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> Optional[bool]: @@ -90,7 +90,7 @@ async def __aenter__(self) -> "Timeout": async def __aexit__( self, - exc_type: Optional[type[BaseException]], + exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> Optional[bool]: @@ -173,7 +173,7 @@ def _do_enter(self) -> None: self._state = _State.ENTER self._reschedule() - def _do_exit(self, exc_type: Optional[type[BaseException]]) -> None: + def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None: if exc_type is asyncio.CancelledError and \ self._state == _State.TIMEOUT: self._timeout_handler = None diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9e4f6e7535063..3dee4dab4c47e 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -4,13 +4,13 @@ import time from collections import Counter as collectionsCounter from collections import deque -from collections.abc import Iterable, Mapping -from collections.abc import Sequence as GenericSequence from contextlib import contextmanager from dataclasses import dataclass from functools import partial -from typing import (TYPE_CHECKING, Callable, ClassVar, NamedTuple, Optional, - Union, cast, overload) +from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable, + List, Mapping, NamedTuple, Optional) +from typing import Sequence as GenericSequence +from typing import Set, Type, Union, cast, overload import torch from typing_extensions import TypeVar, deprecated @@ -72,15 +72,15 @@ @dataclass class SchedulerOutputState: """Caches the scheduler outputs for a virtual engine. Used for Multi-Step""" - seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] = None + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None scheduler_outputs: Optional[SchedulerOutputs] = None allow_async_output_proc: bool = False last_output: Optional[SamplerOutput] = None class OutputData(NamedTuple): - outputs: list[SamplerOutput] - seq_group_metadata_list: list[SequenceGroupMetadata] + outputs: List[SamplerOutput] + seq_group_metadata_list: List[SequenceGroupMetadata] scheduler_outputs: SchedulerOutputs is_async: bool is_last_step: bool @@ -90,23 +90,23 @@ class OutputData(NamedTuple): # is_first_step_output is invalid when `outputs` has # outputs from multiple steps. 
is_first_step_output: Optional[bool] - skip: list[int] + skip: List[int] class SchedulerContext: def __init__(self, multi_step_stream_outputs: bool = False): - self.output_queue: deque[OutputData] = deque() - self.request_outputs: list[Union[RequestOutput, + self.output_queue: Deque[OutputData] = deque() + self.request_outputs: List[Union[RequestOutput, PoolingRequestOutput]] = [] self.seq_group_metadata_list: Optional[ - list[SequenceGroupMetadata]] = None + List[SequenceGroupMetadata]] = None self.scheduler_outputs: Optional[SchedulerOutputs] = None self.multi_step_stream_outputs: bool = multi_step_stream_outputs - def append_output(self, outputs: list[SamplerOutput], - seq_group_metadata_list: list[SequenceGroupMetadata], + def append_output(self, outputs: List[SamplerOutput], + seq_group_metadata_list: List[SequenceGroupMetadata], scheduler_outputs: SchedulerOutputs, is_async: bool, is_last_step: bool, is_first_step_output: Optional[bool]): @@ -170,7 +170,7 @@ def enable_output_validation(cls): def validate_output( cls, output: object, - output_type: type[_O], + output_type: Type[_O], ) -> _O: do_validate = cls.DO_VALIDATE_OUTPUT @@ -185,11 +185,11 @@ def validate_output( def validate_outputs( cls, outputs: GenericSequence[object], - output_type: type[_O], - ) -> list[_O]: + output_type: Type[_O], + ) -> List[_O]: do_validate = cls.DO_VALIDATE_OUTPUT - outputs_: list[_O] + outputs_: List[_O] if TYPE_CHECKING or do_validate: outputs_ = [] for output in outputs: @@ -208,10 +208,10 @@ def validate_outputs( def __init__( self, vllm_config: VllmConfig, - executor_class: type[ExecutorBase], + executor_class: Type[ExecutorBase], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, @@ -409,7 +409,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: ), )) - self.seq_id_to_seq_group: dict[str, SequenceGroupBase] = {} + self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {} # Flag to set when an input fails to process and the engine should run # the next step without re-scheduling. @@ -443,7 +443,7 @@ def _initialize_kv_caches(self) -> None: @classmethod def _get_executor_cls(cls, - engine_config: VllmConfig) -> type[ExecutorBase]: + engine_config: VllmConfig) -> Type[ExecutorBase]: # distributed_executor_backend must be set in VllmConfig.__post_init__ distributed_executor_backend = ( engine_config.parallel_config.distributed_executor_backend) @@ -484,7 +484,7 @@ def from_engine_args( cls, engine_args: EngineArgs, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" # Create the engine configs. 
@@ -514,7 +514,7 @@ def __del__(self): def get_tokenizer_group( self, - group_type: type[_G] = BaseTokenizerGroup, + group_type: Type[_G] = BaseTokenizerGroup, ) -> _G: tokenizer_group = self.tokenizer @@ -937,7 +937,7 @@ def reset_prefix_cache(self) -> bool: @staticmethod def _process_sequence_group_outputs( seq_group: SequenceGroup, - outputs: list[PoolingSequenceGroupOutput], + outputs: List[PoolingSequenceGroupOutput], ) -> None: seq_group.pooled_data = outputs[0].data @@ -1016,7 +1016,7 @@ def _process_model_outputs(self, scheduler_outputs.scheduled_seq_groups) has_multiple_outputs: bool = len(outputs) > 1 - outputs_by_sequence_group: list[list[SequenceGroupOutput]] + outputs_by_sequence_group: List[List[SequenceGroupOutput]] if has_multiple_outputs: assert self.scheduler_config.is_multi_step or \ self.speculative_config @@ -1062,8 +1062,8 @@ def _process_model_outputs(self, else: indices = range(len(seq_group_metadata_list)) # type: ignore - finished_before: list[int] = [] - finished_now: list[int] = [] + finished_before: List[int] = [] + finished_now: List[int] = [] for i in indices: if i in skip: continue @@ -1077,7 +1077,7 @@ def _process_model_outputs(self, finished_before.append(i) continue - output: list[SequenceGroupOutput] + output: List[SequenceGroupOutput] if has_multiple_outputs: output = outputs_by_sequence_group[i] else: @@ -1221,9 +1221,9 @@ def _process_model_outputs(self, return None def _advance_to_next_step( - self, output: list[SamplerOutput], - seq_group_metadata_list: list[SequenceGroupMetadata], - scheduled_seq_groups: list[ScheduledSequenceGroup]) -> None: + self, output: List[SamplerOutput], + seq_group_metadata_list: List[SequenceGroupMetadata], + scheduled_seq_groups: List[ScheduledSequenceGroup]) -> None: """Given model output from a single run, append the tokens to the sequences. This is normally done inside output processor, but it is required if the worker is to perform async forward pass to next step. @@ -1264,7 +1264,7 @@ def _advance_to_next_step( else: seq.append_token_id(sample.output_token, sample.logprobs) - def step(self) -> list[Union[RequestOutput, PoolingRequestOutput]]: + def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. .. 
figure:: https://i.imgur.com/sv2HssD.png @@ -1490,7 +1490,7 @@ def step(self) -> list[Union[RequestOutput, PoolingRequestOutput]]: def _abort_and_cache_schedule( self, request_id: str, virtual_engine: int, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], scheduler_outputs: SchedulerOutputs, allow_async_output_proc: bool) -> None: """Aborts a single request, and caches the scheduler outputs minus that @@ -1521,7 +1521,7 @@ def _abort_and_cache_schedule( allow_async_output_proc=allow_async_output_proc) def _has_remaining_steps( - self, seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] + self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] ) -> bool: if (not self.scheduler_config.is_multi_step or not seq_group_metadata_list): @@ -1542,7 +1542,7 @@ def _has_remaining_steps( def _cache_scheduler_outputs_for_multi_step( self, virtual_engine: int, - seq_group_metadata_list: Optional[list[SequenceGroupMetadata]], + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], scheduler_outputs: SchedulerOutputs, allow_async_output_proc: bool) -> None: co = self.cached_scheduler_outputs[virtual_engine] @@ -1554,7 +1554,7 @@ def _cache_scheduler_outputs_for_multi_step( def _update_cached_scheduler_output( self, virtual_engine: int, - output: list[Optional[SamplerOutput]]) -> None: + output: List[Optional[SamplerOutput]]) -> None: if (self.parallel_config.pipeline_parallel_size > 1 and len(output) > 0 and output[0] is not None): last_output = output[-1] @@ -1596,9 +1596,9 @@ def remove_logger(self, logger_name: str) -> None: def do_log_stats(self, scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[list[SamplerOutput]] = None, - finished_before: Optional[list[int]] = None, - skip: Optional[list[int]] = None) -> None: + model_output: Optional[List[SamplerOutput]] = None, + finished_before: Optional[List[int]] = None, + skip: Optional[List[int]] = None) -> None: """Forced log when no requests active.""" if self.log_stats: stats = self._get_stats(scheduler_outputs, model_output, @@ -1608,9 +1608,9 @@ def do_log_stats(self, def _get_stats(self, scheduler_outputs: Optional[SchedulerOutputs], - model_output: Optional[list[SamplerOutput]] = None, - finished_before: Optional[list[int]] = None, - skip: Optional[list[int]] = None) -> Stats: + model_output: Optional[List[SamplerOutput]] = None, + finished_before: Optional[List[int]] = None, + skip: Optional[List[int]] = None) -> Stats: """Get Stats to be Logged to Prometheus. 
Args: @@ -1662,28 +1662,28 @@ def _get_stats(self, num_prompt_tokens_iter = 0 num_generation_tokens_iter = 0 num_tokens_iter = 0 - time_to_first_tokens_iter: list[float] = [] - time_per_output_tokens_iter: list[float] = [] + time_to_first_tokens_iter: List[float] = [] + time_per_output_tokens_iter: List[float] = [] num_preemption_iter = (0 if scheduler_outputs is None else scheduler_outputs.preempted) # Request stats # Latency - time_e2e_requests: list[float] = [] - time_queue_requests: list[float] = [] - time_inference_requests: list[float] = [] - time_prefill_requests: list[float] = [] - time_decode_requests: list[float] = [] - time_in_queue_requests: list[float] = [] - model_forward_time_requests: list[float] = [] - model_execute_time_requests: list[float] = [] + time_e2e_requests: List[float] = [] + time_queue_requests: List[float] = [] + time_inference_requests: List[float] = [] + time_prefill_requests: List[float] = [] + time_decode_requests: List[float] = [] + time_in_queue_requests: List[float] = [] + model_forward_time_requests: List[float] = [] + model_execute_time_requests: List[float] = [] # Metadata - num_prompt_tokens_requests: list[int] = [] - num_generation_tokens_requests: list[int] = [] - n_requests: list[int] = [] - max_num_generation_tokens_requests: list[int] = [] - max_tokens_requests: list[int] = [] - finished_reason_requests: list[str] = [] + num_prompt_tokens_requests: List[int] = [] + num_generation_tokens_requests: List[int] = [] + n_requests: List[int] = [] + max_num_generation_tokens_requests: List[int] = [] + max_tokens_requests: List[int] = [] + finished_reason_requests: List[str] = [] # LoRA requests running_lora_adapters = dict( @@ -1882,7 +1882,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.model_executor.remove_lora(lora_id) - def list_loras(self) -> set[int]: + def list_loras(self) -> Set[int]: return self.model_executor.list_loras() def pin_lora(self, lora_id: int) -> bool: @@ -1895,7 +1895,7 @@ def add_prompt_adapter( def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: return self.model_executor.remove_prompt_adapter(prompt_adapter_id) - def list_prompt_adapters(self) -> list[int]: + def list_prompt_adapters(self) -> List[int]: return self.model_executor.list_prompt_adapters() def start_profile(self) -> None: @@ -1924,7 +1924,7 @@ def is_tracing_enabled(self) -> bool: def do_tracing(self, scheduler_outputs: SchedulerOutputs, - finished_before: Optional[list[int]] = None) -> None: + finished_before: Optional[List[int]] = None) -> None: if self.tracer is None: return diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index efa4a805a81c7..cb3ca7a118819 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import time -from collections import Counter as CollectionsCounter -from typing import TYPE_CHECKING, Optional, Union, cast +from typing import TYPE_CHECKING +from typing import Counter as CollectionsCounter +from typing import Dict, List, Optional, Type, Union, cast import numpy as np import prometheus_client @@ -45,7 +46,7 @@ class Metrics: _counter_cls = prometheus_client.Counter _histogram_cls = prometheus_client.Histogram - def __init__(self, labelnames: list[str], vllm_config: VllmConfig): + def __init__(self, labelnames: List[str], vllm_config: VllmConfig): # Unregister any existing vLLM collectors (for CI/CD) self._unregister_vllm_metrics() @@ -275,7 +276,7 @@ class _RayGaugeWrapper: def 
__init__(self,
                 name: str,
                 documentation: str = "",
-                 labelnames: Optional[list[str]] = None,
+                 labelnames: Optional[List[str]] = None,
                  multiprocess_mode: str = ""):
         del multiprocess_mode
         labelnames_tuple = tuple(labelnames) if labelnames else None
@@ -302,7 +303,7 @@ class _RayCounterWrapper:
     def __init__(self,
                  name: str,
                  documentation: str = "",
-                 labelnames: Optional[list[str]] = None):
+                 labelnames: Optional[List[str]] = None):
         labelnames_tuple = tuple(labelnames) if labelnames else None
         self._counter = ray_metrics.Counter(name=name,
                                             description=documentation,
@@ -325,8 +326,8 @@ class _RayHistogramWrapper:
     def __init__(self,
                  name: str,
                  documentation: str = "",
-                 labelnames: Optional[list[str]] = None,
-                 buckets: Optional[list[float]] = None):
+                 labelnames: Optional[List[str]] = None,
+                 buckets: Optional[List[float]] = None):
         labelnames_tuple = tuple(labelnames) if labelnames else None
         boundaries = buckets if buckets else []
         self._histogram = ray_metrics.Histogram(name=name,
@@ -347,14 +348,14 @@ class RayMetrics(Metrics):
     RayMetrics is used by RayPrometheusStatLogger to log to Ray metrics.
     Provides the same metrics as Metrics but uses Ray's util.metrics library.
     """
-    _gauge_cls: type[prometheus_client.Gauge] = cast(
-        type[prometheus_client.Gauge], _RayGaugeWrapper)
-    _counter_cls: type[prometheus_client.Counter] = cast(
-        type[prometheus_client.Counter], _RayCounterWrapper)
-    _histogram_cls: type[prometheus_client.Histogram] = cast(
-        type[prometheus_client.Histogram], _RayHistogramWrapper)
-
-    def __init__(self, labelnames: list[str], vllm_config: VllmConfig):
+    _gauge_cls: Type[prometheus_client.Gauge] = cast(
+        Type[prometheus_client.Gauge], _RayGaugeWrapper)
+    _counter_cls: Type[prometheus_client.Counter] = cast(
+        Type[prometheus_client.Counter], _RayCounterWrapper)
+    _histogram_cls: Type[prometheus_client.Histogram] = cast(
+        Type[prometheus_client.Histogram], _RayHistogramWrapper)
+
+    def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
         if ray_metrics is None:
             raise ImportError("RayMetrics requires Ray to be installed.")
         super().__init__(labelnames, vllm_config)
@@ -364,14 +365,14 @@ def _unregister_vllm_metrics(self) -> None:
         pass
-def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]:
+def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
     """
     Builds a list of buckets with increasing powers of 10 multiplied by
     mantissa values until the value exceeds the specified maximum.
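A runnable sketch of the bucket builders edited above, written with the `typing.List` spelling this patch standardizes on; the final assertion mirrors the build_1_2_5_buckets(100) doctest shown in the diff.

from typing import List


def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
    # Multiply each mantissa by growing powers of 10 until max_value is exceeded.
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1


def build_1_2_5_buckets(max_value: int) -> List[int]:
    return build_buckets([1, 2, 5], max_value)


assert build_1_2_5_buckets(100) == [1, 2, 5, 10, 20, 50, 100]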
""" exponent = 0 - buckets: list[int] = [] + buckets: List[int] = [] while True: for m in mantissa_lst: value = m * 10**exponent @@ -382,7 +383,7 @@ def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]: exponent += 1 -def build_1_2_5_buckets(max_value: int) -> list[int]: +def build_1_2_5_buckets(max_value: int) -> List[int]: """ Example: >>> build_1_2_5_buckets(100) @@ -391,7 +392,7 @@ def build_1_2_5_buckets(max_value: int) -> list[int]: return build_buckets([1, 2, 5], max_value) -def build_1_2_3_5_8_buckets(max_value: int) -> list[int]: +def build_1_2_3_5_8_buckets(max_value: int) -> List[int]: """ Example: >>> build_1_2_3_5_8_buckets(100) @@ -406,7 +407,7 @@ def local_interval_elapsed(now: float, last_log: float, return elapsed_time > local_interval -def get_throughput(tracked_stats: list[int], now: float, +def get_throughput(tracked_stats: List[int], now: float, last_log: float) -> float: return float(np.sum(tracked_stats) / (now - last_log)) @@ -507,7 +508,7 @@ class PrometheusStatLogger(StatLoggerBase): _metrics_cls = Metrics _gauge_cls = prometheus_client.Gauge - def __init__(self, local_interval: float, labels: dict[str, str], + def __init__(self, local_interval: float, labels: Dict[str, str], vllm_config: VllmConfig) -> None: super().__init__(local_interval, vllm_config) # Prometheus metrics @@ -539,13 +540,13 @@ def _log_counter_labels(self, counter, data: CollectionsCounter, for label, count in data.items(): counter.labels(**{**self.labels, label_key: label}).inc(count) - def _log_histogram(self, histogram, data: Union[list[int], - list[float]]) -> None: + def _log_histogram(self, histogram, data: Union[List[int], + List[float]]) -> None: # Convenience function for logging list to histogram. for datum in data: histogram.labels(**self.labels).observe(datum) - def _log_gauge_string(self, gauge, data: dict[str, str]) -> None: + def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None: gauge.labels(**data).set_to_current_time() def _log_prometheus(self, stats: Stats) -> None: diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 541f9fb9ff793..9e6d5ef29bedb 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -15,7 +15,7 @@ import time from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Optional +from typing import List, Optional from vllm.config import SupportsMetricsInfo, VllmConfig from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics @@ -42,29 +42,29 @@ class Stats: num_prompt_tokens_iter: int num_generation_tokens_iter: int num_tokens_iter: int - time_to_first_tokens_iter: list[float] - time_per_output_tokens_iter: list[float] + time_to_first_tokens_iter: List[float] + time_per_output_tokens_iter: List[float] num_preemption_iter: int # Request stats (should have _requests suffix) # Latency - time_e2e_requests: list[float] - time_queue_requests: list[float] - time_inference_requests: list[float] - time_prefill_requests: list[float] - time_decode_requests: list[float] - time_in_queue_requests: list[float] - model_forward_time_requests: list[float] - model_execute_time_requests: list[float] + time_e2e_requests: List[float] + time_queue_requests: List[float] + time_inference_requests: List[float] + time_prefill_requests: List[float] + time_decode_requests: List[float] + time_in_queue_requests: List[float] + model_forward_time_requests: List[float] + model_execute_time_requests: List[float] # Metadata - num_prompt_tokens_requests: list[int] - 
num_generation_tokens_requests: list[int] - n_requests: list[int] - max_num_generation_tokens_requests: list[int] - max_tokens_requests: list[int] - finished_reason_requests: list[str] - waiting_lora_adapters: list[str] - running_lora_adapters: list[str] + num_prompt_tokens_requests: List[int] + num_generation_tokens_requests: List[int] + n_requests: List[int] + max_num_generation_tokens_requests: List[int] + max_tokens_requests: List[int] + finished_reason_requests: List[str] + waiting_lora_adapters: List[str] + running_lora_adapters: List[str] max_lora: str spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None @@ -75,8 +75,8 @@ class StatLoggerBase(ABC): def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None: # Tracked stats over current local logging interval. - self.num_prompt_tokens: list[int] = [] - self.num_generation_tokens: list[int] = [] + self.num_prompt_tokens: List[int] = [] + self.num_generation_tokens: List[int] = [] self.last_local_log = time.time() self.local_interval = local_interval self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index ea9742a6d38bc..26dfb63c3dbf3 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import uuid -from collections.abc import Mapping from dataclasses import dataclass, field from enum import Enum -from typing import Optional, Union, overload +from typing import List, Mapping, Optional, Union, overload from typing_extensions import deprecated @@ -154,7 +153,7 @@ class RPCAdapterLoadedResponse: RPCResetPrefixCacheRequest, RPCSleepRequest, RPCWakeUpRequest] -REQUEST_OUTPUTS_T = Union[list[RequestOutput], RPCAdapterLoadedResponse, +REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse, RPCError] diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index e8b830c2a38be..c12fe242082bf 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -3,9 +3,9 @@ import asyncio import copy import pickle -from collections.abc import AsyncGenerator, Iterator, Mapping from contextlib import contextmanager, suppress -from typing import Any, Optional, Union, cast, overload +from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping, + Optional, Union, cast, overload) import cloudpickle import psutil @@ -76,7 +76,7 @@ class MQLLMEngineClient(EngineClient): - Pulls RequestOutputs from its queue and yields them MQLLMEngine runs two background loops: - - output_loop: the output loop pulls list[RequestOutput] + - output_loop: the output loop pulls List[RequestOutput] from the MQLLMEngine via zmq (each list is the output of one engine_step in the LLMEngine). It then parses the list and pushes individual request_outputs into @@ -120,7 +120,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig, self.data_ipc_path = f"{ipc_path}{IPC_DATA_EXT}" # Stream for each individual request. - self.output_queues: dict[str, asyncio.Queue] = {} + self.output_queues: Dict[str, asyncio.Queue] = {} # Loop to handle output of the LLMEngine periodically. 
# Started after the MQLLMEngine is ready so that we can @@ -401,7 +401,7 @@ async def abort(self, request_id: str): async def do_log_stats( self, scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[list[SamplerOutput]] = None, + model_output: Optional[List[SamplerOutput]] = None, ) -> None: """ Ignore do_log_stats (handled on MQLLMEngine polling) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 897ccdf127fab..efea6ee2c69aa 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -2,9 +2,8 @@ import pickle import signal -from collections.abc import Iterator from contextlib import contextmanager -from typing import Optional, Union +from typing import Iterator, List, Optional, Union import cloudpickle import zmq @@ -206,7 +205,7 @@ def run_engine_loop(self): if not self.use_async_sockets: self._send_outputs(request_outputs) - def engine_step(self) -> list[RequestOutput]: + def engine_step(self) -> List[RequestOutput]: """Engine step wrapper with error handling.""" try: return self.engine.step() diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index db32a145c8626..4c8e295c13815 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Callable +from typing import Callable, List from vllm.config import SchedulerConfig from vllm.core.scheduler import Scheduler @@ -28,7 +28,7 @@ class SequenceGroupOutputProcessor(ABC): def create_output_processor( scheduler_config: SchedulerConfig, detokenizer: Detokenizer, - scheduler: list[Scheduler], + scheduler: List[Scheduler], seq_counter: Counter, get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], stop_checker: "StopChecker", @@ -59,7 +59,7 @@ def create_output_processor( @abstractmethod def process_outputs(self, sequence_group: SequenceGroup, - outputs: list[SequenceGroupOutput], + outputs: List[SequenceGroupOutput], is_async: bool) -> None: """Process new token ids for the sequence group. 
Handles logic such as detokenization, stop checking, and freeing/forking sequences in the @@ -69,6 +69,6 @@ def process_outputs(self, sequence_group: SequenceGroup, @abstractmethod def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: list[SequenceGroupOutput]) -> None: + outputs: List[SequenceGroupOutput]) -> None: """Update prompt logprobs received from outputs to seq_group.""" pass diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 4c7d29c4a77e4..8ceef855e020f 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import functools -from typing import Callable, cast +from typing import Callable, List, cast from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.interfaces import ( @@ -39,7 +39,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): def __init__( self, detokenizer: Detokenizer, - scheduler: list[Scheduler], + scheduler: List[Scheduler], seq_counter: Counter, get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], stop_checker: StopChecker, @@ -51,7 +51,7 @@ def __init__( self.stop_checker = stop_checker def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: list[SequenceGroupOutput]) -> None: + outputs: List[SequenceGroupOutput]) -> None: """Process prompt logprobs associated with each step of a multi-step- scheduled computation. @@ -75,7 +75,7 @@ def _log_prompt_logprob_unsupported_warning_once(): def process_outputs(self, sequence_group: SequenceGroup, - outputs: list[SequenceGroupOutput], + outputs: List[SequenceGroupOutput], is_async: bool = False) -> None: """Append new tokens in the outputs to sequences in the sequence group. 
@@ -112,7 +112,7 @@ def process_outputs(self, isinstance(output, CompletionSequenceGroupOutput) for output in outputs ]) - compl_outputs = cast(list[CompletionSequenceGroupOutput], outputs) + compl_outputs = cast(List[CompletionSequenceGroupOutput], outputs) assert all([ seq_id == output.samples[0].parent_seq_id for output in compl_outputs @@ -158,7 +158,7 @@ def _process_decode_and_stop(self, seq: Sequence, ) def _process_seq_outputs(self, seq: Sequence, - valid_samples: list[SequenceOutput], + valid_samples: List[SequenceOutput], sampling_params: SamplingParams) -> None: output_token_ids = [sample.output_token for sample in valid_samples] output_logprobs = [sample.logprobs for sample in valid_samples] diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index 270f062cce13e..4d96791a1f8a3 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import List + from vllm.config import SchedulerConfig from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.interfaces import ( @@ -67,7 +69,7 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor): """ def __init__(self, scheduler_config: SchedulerConfig, - detokenizer: Detokenizer, scheduler: list[Scheduler], + detokenizer: Detokenizer, scheduler: List[Scheduler], seq_counter: Counter, stop_checker: StopChecker): self.scheduler_config = scheduler_config self.detokenizer = detokenizer @@ -76,7 +78,7 @@ def __init__(self, scheduler_config: SchedulerConfig, self.stop_checker = stop_checker def process_outputs(self, sequence_group: SequenceGroup, - outputs: list[SequenceGroupOutput], + outputs: List[SequenceGroupOutput], is_async: bool) -> None: """Append all new tokens to sequences in the sequence group. Fork any surviving beam candidates; free any unsurviving ones. @@ -96,7 +98,7 @@ def process_outputs(self, sequence_group: SequenceGroup, is_async) def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: list[SequenceGroupOutput]) -> None: + outputs: List[SequenceGroupOutput]) -> None: """Process prompt logprobs associated with one step of a single-step- scheduled computation. diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index c757d8eded9f5..6cad9ec8f327f 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional +from typing import Callable, List, Optional, Tuple from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams @@ -94,9 +94,9 @@ def maybe_stop_sequence( def check_stop_strings( output_text: str, new_char_count: int, - stop: list[str], + stop: List[str], include_in_output: bool, - ) -> Optional[tuple[str, int]]: + ) -> Optional[Tuple[str, int]]: """Check if any stop strings are matched and truncate sequence output text accordingly. 
diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py
index 09ed76b2a8023..0d2b58c109e32 100644
--- a/vllm/engine/output_processor/util.py
+++ b/vllm/engine/output_processor/util.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
-from collections.abc import Sequence as GenericSequence
+from typing import List
+from typing import Sequence as GenericSequence
 from typing import cast
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -9,11 +10,11 @@ def create_output_by_sequence_group(
         outputs: GenericSequence[SamplerOutput],
-        num_seq_groups: int) -> list[list[SequenceGroupOutput]]:
+        num_seq_groups: int) -> List[List[SequenceGroupOutput]]:
     """Helper method which transforms a 2d list organized by
     [step][sequence group] into [sequence group][step].
     """
-    output_by_sequence_group: list[list[CompletionSequenceGroupOutput]] = [
+    output_by_sequence_group: List[List[CompletionSequenceGroupOutput]] = [
         [] for _ in range(num_seq_groups)
     ]
     for step in outputs:
@@ -23,4 +24,4 @@ def create_output_by_sequence_group(
     # Cast to the more generic type that CompletionSequenceGroupOutput
     # inherits from.
-    return cast(list[list[SequenceGroupOutput]], output_by_sequence_group)
+    return cast(List[List[SequenceGroupOutput]], output_by_sequence_group)
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index e45b3facfc980..ee9accd32f218 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -2,8 +2,7 @@
 import asyncio
 from abc import ABC, abstractmethod
-from collections.abc import AsyncGenerator, Mapping
-from typing import Optional
+from typing import AsyncGenerator, List, Mapping, Optional
 from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
 from vllm.config import DecodingConfig, ModelConfig
@@ -255,7 +254,7 @@ async def is_tracing_enabled(self) -> bool:
     async def do_log_stats(
         self,
         scheduler_outputs: Optional[SchedulerOutputs] = None,
-        model_output: Optional[list[SamplerOutput]] = None,
+        model_output: Optional[List[SamplerOutput]] = None,
     ) -> None:
         ...
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 138696370839e..6f5adb4f64728 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -3,8 +3,8 @@
 import asyncio
 import time
 from abc import ABC, abstractmethod
-from collections.abc import Awaitable
-from typing import Any, Callable, Optional, Union
+from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
+                    Union)
 import torch.nn as nn
 from typing_extensions import TypeVar
@@ -60,8 +60,8 @@ def _init_executor(self) -> None:
     def collective_rpc(self,
                        method: Union[str, Callable[..., _R]],
                        timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
+                       args: Tuple = (),
+                       kwargs: Optional[Dict[str, Any]] = None) -> List[_R]:
         """
         Execute an RPC call on all workers.
@@ -86,7 +86,7 @@ def collective_rpc(self,
         """
         raise NotImplementedError
-    def determine_num_available_blocks(self) -> tuple[int, int]:
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available blocks for the GPU KV cache and
         swappable CPU KV cache.
@@ -94,7 +94,7 @@ def determine_num_available_blocks(self) -> tuple[int, int]:
         ExecutorBase may require modification of the result, e.g. to ensure
         the selected cache sizes are compatible with all workers.
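The create_output_by_sequence_group helper in output_processor/util.py above is a plain transpose from [step][sequence group] to [sequence group][step]; a stand-alone sketch with strings standing in for the vLLM output objects:

from typing import List, Sequence


def by_sequence_group(step_outputs: Sequence[List[str]],
                      num_seq_groups: int) -> List[List[str]]:
    # One inner list per sequence group, filled with that group's output
    # from every scheduled step.
    transposed: List[List[str]] = [[] for _ in range(num_seq_groups)]
    for step in step_outputs:
        for group_idx, output in enumerate(step):
            transposed[group_idx].append(output)
    return transposed


assert by_sequence_group([["a0", "b0"], ["a1", "b1"]], 2) == [["a0", "a1"],
                                                              ["b0", "b1"]]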
- Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks + Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks are blocks that are "active" on the device and can be appended to. num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be appended to. @@ -135,7 +135,7 @@ def rpc_func(worker: WorkerBase) -> _R: def execute_model( self, execute_model_req: ExecuteModelRequest - ) -> Optional[list[Union[SamplerOutput, PoolerOutput]]]: + ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: output = self.collective_rpc("execute_model", args=(execute_model_req, )) return output[0] @@ -156,7 +156,7 @@ def pin_lora(self, lora_id: int) -> bool: assert lora_id > 0, "lora_id must be greater than 0." return all(self.collective_rpc("pin_lora", args=(lora_id, ))) - def list_loras(self) -> set[int]: + def list_loras(self) -> Set[int]: sets = self.collective_rpc("list_loras") for s in sets: assert s == sets[0], "All workers should have the same LORAs." @@ -184,7 +184,7 @@ def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: self.collective_rpc("pin_prompt_adapter", args=(prompt_adapter_id, ))) - def list_prompt_adapters(self) -> set[int]: + def list_prompt_adapters(self) -> Set[int]: sets = self.collective_rpc("list_prompt_adapters") for s in sets: assert (s == sets[0] @@ -245,7 +245,7 @@ def __del__(self): async def execute_model_async( self, - execute_model_req: ExecuteModelRequest) -> list[SamplerOutput]: + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: """Executes one model step on the given sequences.""" output = await make_async(self.execute_model)(execute_model_req) return output @@ -273,7 +273,7 @@ def __init__(self, *args, **kwargs): def execute_model( self, execute_model_req: ExecuteModelRequest, - ) -> list[SamplerOutput]: + ) -> List[SamplerOutput]: # TODO: unify into collective_rpc if self.parallel_worker_tasks is None: self.parallel_worker_tasks = self._run_workers( @@ -299,7 +299,7 @@ def stop_remote_worker_execution_loop(self) -> None: @abstractmethod def _driver_execute_model( self, execute_model_req: Optional[ExecuteModelRequest] - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: """Run execute_model in the driver worker. Passing None will cause the driver to stop the model execution loop @@ -311,8 +311,8 @@ def _driver_execute_model( def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None) -> list[Any]: + args: Tuple = (), + kwargs: Optional[Dict] = None) -> List[Any]: return self._run_workers(method, *args, **(kwargs or {})) @abstractmethod @@ -344,7 +344,7 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: async def execute_model_async( self, - execute_model_req: ExecuteModelRequest) -> list[SamplerOutput]: + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: if self.parallel_worker_tasks is None: # Start model execution loop running in the parallel workers self.parallel_worker_tasks = asyncio.create_task( @@ -368,7 +368,7 @@ async def stop_remote_worker_execution_loop_async(self) -> None: async def _driver_execute_model_async( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> list[SamplerOutput]: + ) -> List[SamplerOutput]: """Execute the model asynchronously in the driver worker. 
Passing None will cause the driver to stop the model execution
diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py
index e175568923c08..d1f8c36fbbec7 100644
--- a/vllm/executor/mp_distributed_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -2,7 +2,7 @@
 import asyncio
 import os
-from typing import Any, Callable, Optional, Union
+from typing import Any, Callable, List, Optional, Union
 import cloudpickle
@@ -72,15 +72,15 @@ def _init_executor(self) -> None:
         distributed_init_method = get_distributed_init_method(
             "127.0.0.1", get_open_port())
-        self.workers: list[ProcessWorkerWrapper] = []
+        self.workers: List[ProcessWorkerWrapper] = []
         # This is the list of workers that are rank 0 of each TP group EXCEPT
         # global rank 0. These are the workers that will broadcast to the
         # rest of the workers.
-        self.tp_driver_workers: list[ProcessWorkerWrapper] = []
+        self.tp_driver_workers: List[ProcessWorkerWrapper] = []
         # This is the list of workers that are not drivers and not the first
         # worker in a TP group. These are the workers that will be
         # broadcasted to.
-        self.non_driver_workers: list[ProcessWorkerWrapper] = []
+        self.non_driver_workers: List[ProcessWorkerWrapper] = []
         if world_size == 1:
             self.worker_monitor = None
@@ -126,7 +126,7 @@ def _init_executor(self) -> None:
                 max_concurrent_workers=self.parallel_config.
                 max_parallel_loading_workers)
         self.driver_exec_model = make_async(self.driver_worker.execute_model)
-        self.pp_locks: Optional[list[asyncio.Lock]] = None
+        self.pp_locks: Optional[List[asyncio.Lock]] = None
     def shutdown(self):
         if (worker_monitor := getattr(self, "worker_monitor",
@@ -135,7 +135,7 @@ def shutdown(self):
     def _driver_execute_model(
         self, execute_model_req: Optional[ExecuteModelRequest]
-    ) -> Optional[list[SamplerOutput]]:
+    ) -> Optional[List[SamplerOutput]]:
         """Run execute_model in the driver worker.
         Passing None will cause the driver to stop the model execution
@@ -150,7 +150,7 @@ def _run_workers(
         async_run_tensor_parallel_workers_only: bool = False,
         max_concurrent_workers: Optional[int] = None,
         **kwargs,
-    ) -> list[Any]:
+    ) -> List[Any]:
         """Runs the given method on all workers.
         Args:
@@ -204,7 +204,7 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
     async def _driver_execute_model_async(
         self, execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> list[SamplerOutput]:
+    ) -> List[SamplerOutput]:
         if not self.tp_driver_workers:
             return await self.driver_exec_model(execute_model_req)
diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py
index 0a28952a9cdc1..e680d53cbd10e 100644
--- a/vllm/executor/msgspec_utils.py
+++ b/vllm/executor/msgspec_utils.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 from array import array
-from typing import Any
+from typing import Any, Type
 from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
@@ -18,7 +18,7 @@ def encode_hook(obj: Any) -> Any:
     return obj.tobytes()
-def decode_hook(type: type, obj: Any) -> Any:
+def decode_hook(type: Type, obj: Any) -> Any:
     """Custom msgspec dec hook that supports array types.
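The msgspec hooks above exist because array.array has no native msgpack encoding; the round trip looks roughly like the stand-alone sketch below. The 'l' typecode is an assumption standing in for vLLM's VLLM_TOKEN_ID_ARRAY_TYPE, and the hook bodies approximate, not reproduce, the real module.

from array import array
from typing import Any, Type

import msgspec

TOKEN_ID_ARRAY_TYPE = "l"  # assumption: placeholder for VLLM_TOKEN_ID_ARRAY_TYPE


def encode_hook(obj: Any) -> Any:
    # Serialize array.array values as raw bytes.
    if isinstance(obj, array):
        return obj.tobytes()
    return obj


def decode_hook(type: Type, obj: Any) -> Any:
    # Rebuild the array from the bytes written by encode_hook.
    if type is array:
        deserialized = array(TOKEN_ID_ARRAY_TYPE)
        deserialized.frombytes(obj)
        return deserialized
    return obj


encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
decoder = msgspec.msgpack.Decoder(array, dec_hook=decode_hook)

tokens = array(TOKEN_ID_ARRAY_TYPE, [1, 2, 3])
assert decoder.decode(encoder.encode(tokens)) == tokens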
See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index 25e4eb8296525..68a83bb610a49 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -9,7 +9,8 @@ from multiprocessing import Queue from multiprocessing.connection import wait from multiprocessing.process import BaseProcess -from typing import Any, Callable, Generic, Optional, TextIO, TypeVar, Union +from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO, + TypeVar, Union) import torch @@ -81,7 +82,7 @@ class ResultHandler(threading.Thread): def __init__(self) -> None: super().__init__(daemon=True) self.result_queue = get_mp_context().Queue() - self.tasks: dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {} + self.tasks: Dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {} def run(self): for result in iter(self.result_queue.get, _TERMINATE): @@ -101,7 +102,7 @@ def close(self): class WorkerMonitor(threading.Thread): """Monitor worker status (in background thread)""" - def __init__(self, workers: list['ProcessWorkerWrapper'], + def __init__(self, workers: List['ProcessWorkerWrapper'], result_handler: ResultHandler): super().__init__(daemon=True) self.workers = workers diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index d4c95840665f3..c3b41d1c11340 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -4,7 +4,7 @@ import os from collections import defaultdict from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Callable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union import cloudpickle import msgspec @@ -91,10 +91,10 @@ def _init_executor(self) -> None: self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) self.output_decoder = msgspec.msgpack.Decoder( - Optional[list[SamplerOutput]]) + Optional[List[SamplerOutput]]) self.use_v1 = envs.VLLM_USE_V1 - self.pp_locks: Optional[list[asyncio.Lock]] = None + self.pp_locks: Optional[List[asyncio.Lock]] = None if not self.use_ray_compiled_dag: self.driver_exec_method = make_async( self.driver_worker.execute_method) @@ -112,7 +112,7 @@ def shutdown(self) -> None: self.forward_dag = None def _configure_ray_workers_use_nsight(self, - ray_remote_kwargs) -> dict[str, Any]: + ray_remote_kwargs) -> Dict[str, Any]: # If nsight profiling is enabled, we need to set the profiling # configuration for the ray workers as runtime env. runtime_env = ray_remote_kwargs.setdefault("runtime_env", {}) @@ -138,12 +138,12 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # It holds the resource for the driver worker. self.driver_dummy_worker: Optional[RayWorkerWrapper] = None # The remaining workers are the actual ray actors. - self.workers: list[RayWorkerWrapper] = [] + self.workers: List[RayWorkerWrapper] = [] # Used in ray compiled DAG: indexed first by PP rank, # and then TP rank. In other words, the inner list is # the TP group of workers for a PP rank. 
- self.pp_tp_workers: list[list[RayWorkerWrapper]] = [] + self.pp_tp_workers: List[List[RayWorkerWrapper]] = [] if self.parallel_config.ray_workers_use_nsight: ray_remote_kwargs = self._configure_ray_workers_use_nsight( @@ -152,7 +152,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) # Create the workers. - bundle_indices: list[int] + bundle_indices: List[int] if envs.VLLM_RAY_BUNDLE_INDICES: # Use the bundle indices specified by the user. bundle_indices = list( @@ -172,7 +172,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", bundle_indices.append(bundle_id) bundle_indices = bundle_indices[:self.parallel_config.world_size] - worker_metadata: list[RayWorkerMetaData] = [] + worker_metadata: List[RayWorkerMetaData] = [] driver_ip = get_ip() for rank, bundle_id in enumerate(bundle_indices): scheduling_strategy = PlacementGroupSchedulingStrategy( @@ -233,7 +233,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", "Consider adjusting the Ray placement group or running " "the driver on a GPU node.") - ip_counts: dict[str, int] = {} + ip_counts: Dict[str, int] = {} for ip in worker_ips: ip_counts[ip] = ip_counts.get(ip, 0) + 1 @@ -377,11 +377,11 @@ def sort_by_driver_then_worker_ip(item: RayWorkerMetaData): # This is the list of workers that are rank 0 of each TP group EXCEPT # global rank 0. These are the workers that will broadcast to the # rest of the workers. - self.tp_driver_workers: list[RayWorkerWrapper] = [] + self.tp_driver_workers: List[RayWorkerWrapper] = [] # This is the list of workers that are not drivers and not the first # worker in a TP group. These are the workers that will be # broadcasted to. - self.non_driver_workers: list[RayWorkerWrapper] = [] + self.non_driver_workers: List[RayWorkerWrapper] = [] # Enforce rank order for correct rank to return final output. for index, worker in enumerate(self.workers): @@ -394,7 +394,7 @@ def sort_by_driver_then_worker_ip(item: RayWorkerMetaData): def _driver_execute_model( self, execute_model_req: Optional[ExecuteModelRequest] - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: """Run execute_model in the driver worker. 
Passing None will cause the driver to stop the model execution @@ -407,7 +407,7 @@ def _driver_execute_model( def execute_model( self, - execute_model_req: ExecuteModelRequest) -> list[SamplerOutput]: + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: if not self.use_ray_spmd_worker: return super().execute_model(execute_model_req) @@ -586,7 +586,7 @@ def __del__(self): async def execute_model_async( self, - execute_model_req: ExecuteModelRequest) -> list[SamplerOutput]: + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: if not self.use_ray_spmd_worker: return await super().execute_model_async(execute_model_req) @@ -601,7 +601,7 @@ async def execute_model_async( async def _driver_execute_model_async( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> list[SamplerOutput]: + ) -> List[SamplerOutput]: assert not self.use_ray_spmd_worker, ( "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1") if not self.tp_driver_workers: diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 0fd2efdceb142..6067f9a3c13b8 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,7 @@ import os import time from collections import defaultdict -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import msgspec @@ -52,7 +52,7 @@ def __init__(self, *args, **kwargs) -> None: def get_node_ip(self) -> str: return get_ip() - def get_node_and_gpu_ids(self) -> tuple[str, list[int]]: + def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: node_id = ray.get_runtime_context().get_node_id() device_key = vllm.platforms.current_platform.ray_device_key if not device_key: @@ -64,7 +64,7 @@ def get_node_and_gpu_ids(self) -> tuple[str, list[int]]: def execute_model_spmd( self, req_or_tuple: Union[bytes, - tuple[bytes, + Tuple[bytes, Optional[IntermediateTensors]]] ) -> bytes: """Execute model in SPMD fashion: used only when SPMD worker and @@ -115,9 +115,9 @@ def setup_device_if_necessary(self): def execute_model_ray( self, scheduler_output: Union["SchedulerOutput", - tuple["SchedulerOutput", + Tuple["SchedulerOutput", "IntermediateTensors"]], - ) -> Union["ModelRunnerOutput", tuple["SchedulerOutput", + ) -> Union["ModelRunnerOutput", Tuple["SchedulerOutput", "IntermediateTensors"]]: # This method is used by Ray Compiled Graph to execute the model, # and it needs a special logic of self.setup_device_if_necessary() @@ -133,7 +133,7 @@ def execute_model_ray( output = scheduler_output, output return output - def override_env_vars(self, vars: dict[str, str]): + def override_env_vars(self, vars: Dict[str, str]): os.environ.update(vars) ray_import_err = None @@ -171,8 +171,8 @@ def _verify_bundles(placement_group: "PlacementGroup", bundle_to_node_ids = pg_data["bundles_to_node_id"] # bundle_idx -> bundle (e.g., {"GPU": 1}) bundles = pg_data["bundles"] - # node_id -> list of bundle (e.g., {"GPU": 1}) - node_id_to_bundle: dict[str, list[dict[str, float]]] = defaultdict(list) + # node_id -> List of bundle (e.g., {"GPU": 1}) + node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list) for bundle_idx, node_id in bundle_to_node_ids.items(): node_id_to_bundle[node_id].append(bundles[bundle_idx]) @@ -334,7 +334,7 @@ def initialize_ray_cluster( "number of available %ss in the placement group.", device_str, device_str) # Create a new placement group - placement_group_specs: list[dict[str, float]] = ([{ + placement_group_specs: List[Dict[str, float]] = ([{ 
device_str: 1.0 } for _ in range(parallel_config.world_size)]) diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 87dc3eed2a0e8..e041215de6602 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.distributed as dist @@ -49,8 +49,8 @@ def _init_executor(self) -> None: def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None) -> list[Any]: + args: Tuple = (), + kwargs: Optional[Dict] = None) -> List[Any]: if kwargs is None: kwargs = {} answer = run_method(self.driver_worker, method, args, kwargs) @@ -120,7 +120,7 @@ def _init_executor(self) -> None: self.collective_rpc("init_device") self.collective_rpc("load_model") - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """ Determine the number of available KV blocks. Add an additional all_reduce to get the min across all ranks. diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 138a8f61107be..2ffebeee392a3 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Iterable from dataclasses import dataclass from functools import cached_property -from typing import TYPE_CHECKING, Any, Generic, Literal, Optional, Union, cast +from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal, + Optional, Tuple, Union, cast) import torch from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never @@ -26,7 +26,7 @@ class TextPrompt(TypedDict): if the model supports it. """ - mm_processor_kwargs: NotRequired[dict[str, Any]] + mm_processor_kwargs: NotRequired[Dict[str, Any]] """ Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities @@ -38,10 +38,10 @@ class TextPrompt(TypedDict): class TokensPrompt(TypedDict): """Schema for a tokenized prompt.""" - prompt_token_ids: list[int] + prompt_token_ids: List[int] """A list of token IDs to pass to the model.""" - token_type_ids: NotRequired[list[int]] + token_type_ids: NotRequired[List[int]] """A list of token type IDs to pass to the cross encoder model.""" multi_modal_data: NotRequired["MultiModalDataDict"] @@ -50,7 +50,7 @@ class TokensPrompt(TypedDict): if the model supports it. """ - mm_processor_kwargs: NotRequired[dict[str, Any]] + mm_processor_kwargs: NotRequired[Dict[str, Any]] """ Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. 
Note that if multiple modalities @@ -115,7 +115,7 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): decoder_prompt: Optional[_T2_co] - mm_processor_kwargs: NotRequired[dict[str, Any]] + mm_processor_kwargs: NotRequired[Dict[str, Any]] PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt] @@ -136,10 +136,10 @@ class TokenInputs(TypedDict): type: Literal["token"] """The type of inputs.""" - prompt_token_ids: list[int] + prompt_token_ids: List[int] """The token IDs of the prompt.""" - token_type_ids: NotRequired[list[int]] + token_type_ids: NotRequired[List[int]] """The token type IDs of the prompt.""" prompt: NotRequired[str] @@ -164,12 +164,12 @@ class TokenInputs(TypedDict): Placeholder ranges for the multi-modal data. """ - multi_modal_hashes: NotRequired[list[str]] + multi_modal_hashes: NotRequired[List[str]] """ The hashes of the multi-modal data. """ - mm_processor_kwargs: NotRequired[dict[str, Any]] + mm_processor_kwargs: NotRequired[Dict[str, Any]] """ Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities @@ -179,14 +179,14 @@ class TokenInputs(TypedDict): def token_inputs( - prompt_token_ids: list[int], - token_type_ids: Optional[list[int]] = None, + prompt_token_ids: List[int], + token_type_ids: Optional[List[int]] = None, prompt: Optional[str] = None, multi_modal_data: Optional["MultiModalDataDict"] = None, multi_modal_inputs: Optional["MultiModalKwargs"] = None, - multi_modal_hashes: Optional[list[str]] = None, + multi_modal_hashes: Optional[List[str]] = None, multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None, - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, ) -> TokenInputs: """Construct :class:`TokenInputs` from optional values.""" inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids) @@ -255,7 +255,7 @@ def prompt(self) -> Optional[str]: assert_never(inputs) # type: ignore[arg-type] @cached_property - def prompt_token_ids(self) -> list[int]: + def prompt_token_ids(self) -> List[int]: inputs = self.inputs if inputs["type"] == "token" or inputs["type"] == "multimodal": @@ -264,7 +264,7 @@ def prompt_token_ids(self) -> list[int]: assert_never(inputs) # type: ignore[arg-type] @cached_property - def token_type_ids(self) -> list[int]: + def token_type_ids(self) -> List[int]: inputs = self.inputs if inputs["type"] == "token" or inputs["type"] == "multimodal": @@ -294,7 +294,7 @@ def multi_modal_data(self) -> "MultiModalDataDict": assert_never(inputs) # type: ignore[arg-type] @cached_property - def multi_modal_inputs(self) -> Union[dict, "MultiModalKwargs"]: + def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]: inputs = self.inputs if inputs["type"] == "token": @@ -306,7 +306,7 @@ def multi_modal_inputs(self) -> Union[dict, "MultiModalKwargs"]: assert_never(inputs) # type: ignore[arg-type] @cached_property - def multi_modal_hashes(self) -> list[str]: + def multi_modal_hashes(self) -> List[str]: inputs = self.inputs if inputs["type"] == "token": @@ -331,7 +331,7 @@ def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": assert_never(inputs) # type: ignore[arg-type] @cached_property - def mm_processor_kwargs(self) -> dict[str, Any]: + def mm_processor_kwargs(self) -> Dict[str, Any]: inputs = self.inputs if inputs["type"] == "token": @@ -355,7 +355,7 @@ def mm_processor_kwargs(self) -> dict[str, Any]: def build_explicit_enc_dec_prompt( encoder_prompt: 
_T1, decoder_prompt: Optional[_T2], - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, ) -> ExplicitEncoderDecoderPrompt[_T1, _T2]: if mm_processor_kwargs is None: mm_processor_kwargs = {} @@ -368,9 +368,9 @@ def build_explicit_enc_dec_prompt( def zip_enc_dec_prompts( enc_prompts: Iterable[_T1], dec_prompts: Iterable[Optional[_T2]], - mm_processor_kwargs: Optional[Union[Iterable[dict[str, Any]], - dict[str, Any]]] = None, -) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]: + mm_processor_kwargs: Optional[Union[Iterable[Dict[str, Any]], + Dict[str, Any]]] = None, +) -> List[ExplicitEncoderDecoderPrompt[_T1, _T2]]: """ Zip encoder and decoder prompts together into a list of :class:`ExplicitEncoderDecoderPrompt` instances. @@ -380,12 +380,12 @@ def zip_enc_dec_prompts( provided, it will be zipped with the encoder/decoder prompts. """ if mm_processor_kwargs is None: - mm_processor_kwargs = cast(dict[str, Any], {}) + mm_processor_kwargs = cast(Dict[str, Any], {}) if isinstance(mm_processor_kwargs, dict): return [ build_explicit_enc_dec_prompt( encoder_prompt, decoder_prompt, - cast(dict[str, Any], mm_processor_kwargs)) + cast(Dict[str, Any], mm_processor_kwargs)) for (encoder_prompt, decoder_prompt) in zip(enc_prompts, dec_prompts) ] @@ -399,7 +399,7 @@ def zip_enc_dec_prompts( def to_enc_dec_tuple_list( enc_dec_prompts: Iterable[ExplicitEncoderDecoderPrompt[_T1, _T2]], -) -> list[tuple[_T1, Optional[_T2]]]: +) -> List[Tuple[_T1, Optional[_T2]]]: return [(enc_dec_prompt["encoder_prompt"], enc_dec_prompt["decoder_prompt"]) for enc_dec_prompt in enc_dec_prompts] diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index ed1056948d807..454d9d8303b77 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Sequence -from typing import Literal, TypedDict, Union, cast, overload +from typing import List, Literal, Sequence, TypedDict, Union, cast, overload from typing_extensions import TypeIs @@ -18,24 +17,24 @@ class ParsedText(TypedDict): class ParsedTokens(TypedDict): - content: list[int] + content: List[int] is_tokens: Literal[True] @overload def parse_and_batch_prompt( - prompt: Union[str, list[str]]) -> Sequence[ParsedText]: + prompt: Union[str, List[str]]) -> Sequence[ParsedText]: ... @overload def parse_and_batch_prompt( - prompt: Union[list[int], list[list[int]]]) -> Sequence[ParsedTokens]: + prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]: ... 
def parse_and_batch_prompt(
-    prompt: Union[str, list[str], list[int], list[list[int]]],
+    prompt: Union[str, List[str], List[int], List[List[int]]],
 ) -> Union[Sequence[ParsedText], Sequence[ParsedTokens]]:
     if isinstance(prompt, str):
         # case 1: a string
@@ -47,16 +46,16 @@ def parse_and_batch_prompt(
     if is_list_of(prompt, str):
         # case 2: array of strings
-        prompt = cast(list[str], prompt)
+        prompt = cast(List[str], prompt)
         return [
             ParsedText(content=elem, is_tokens=False) for elem in prompt
         ]
     if is_list_of(prompt, int):
         # case 3: array of tokens
-        prompt = cast(list[int], prompt)
+        prompt = cast(List[int], prompt)
         return [ParsedTokens(content=prompt, is_tokens=True)]
     if is_list_of(prompt, list):
-        prompt = cast(list[list[int]], prompt)
+        prompt = cast(List[List[int]], prompt)
         if len(prompt[0]) == 0:
             raise ValueError("please provide at least one prompt")
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 742733d3644a3..bc5856990da6f 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import asyncio
-from collections.abc import Mapping
-from typing import Optional, Union, cast
+from typing import List, Mapping, Optional, Tuple, Union, cast
 from typing_extensions import assert_never
@@ -93,7 +92,7 @@ def get_decoder_start_token_id(self) -> Optional[int]:
         return dec_start_token_id
-    def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
+    def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
         '''
         Specifically for encoder/decoder models:
         generate a default decoder prompt for when
@@ -131,8 +130,8 @@ def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
     def _prepare_decoder_input_ids_for_generation(
         self,
-        decoder_input_ids: Optional[list[int]],
-    ) -> list[int]:
+        decoder_input_ids: Optional[List[int]],
+    ) -> List[int]:
         """
         Prepares `decoder_input_ids` for generation with encoder-decoder models.
@@ -169,9 +168,9 @@ def _prepare_decoder_input_ids_for_generation(
     def _apply_prompt_adapter(
         self,
-        prompt_token_ids: list[int],
+        prompt_token_ids: List[int],
         prompt_adapter_request: Optional[PromptAdapterRequest],
-    ) -> list[int]:
+    ) -> List[int]:
         if prompt_adapter_request:
             prompt_token_ids = (
                 [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
@@ -184,7 +183,7 @@ def _tokenize_prompt(
         prompt: str,
         request_id: str,
         lora_request: Optional[LoRARequest],
-    ) -> list[int]:
+    ) -> List[int]:
         """
         Apply the model's tokenizer to a text prompt, returning the
         corresponding token IDs.
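The four prompt shapes accepted by parse_and_batch_prompt above can be exercised with this simplified stand-alone sketch; it drops the typing overloads, the is_list_of helper, and the empty-input validation of the real function.

from typing import List, Sequence, TypedDict, Union


class ParsedText(TypedDict):
    content: str
    is_tokens: bool


class ParsedTokens(TypedDict):
    content: List[int]
    is_tokens: bool


def parse_prompts(
    prompt: Union[str, List[str], List[int], List[List[int]]]
) -> Sequence[Union[ParsedText, ParsedTokens]]:
    # Case 1: a single string prompt.
    if isinstance(prompt, str):
        return [ParsedText(content=prompt, is_tokens=False)]
    # Case 2: a batch of string prompts.
    if all(isinstance(elem, str) for elem in prompt):
        return [ParsedText(content=elem, is_tokens=False) for elem in prompt]
    # Case 3: a single tokenized prompt.
    if all(isinstance(elem, int) for elem in prompt):
        return [ParsedTokens(content=list(prompt), is_tokens=True)]
    # Case 4: a batch of tokenized prompts.
    return [ParsedTokens(content=list(elem), is_tokens=True) for elem in prompt]


assert parse_prompts("hi")[0]["is_tokens"] is False
assert parse_prompts([[1, 2], [3]])[1]["content"] == [3]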
@@ -212,7 +211,7 @@ async def _tokenize_prompt_async( prompt: str, request_id: str, lora_request: Optional[LoRARequest], - ) -> list[int]: + ) -> List[int]: """Async version of :meth:`_tokenize_prompt`.""" tokenizer = self.get_tokenizer_group() add_special_tokens = None @@ -247,7 +246,7 @@ def _can_process_multimodal(self) -> bool: def _process_multimodal( self, - prompt: Union[str, list[int]], + prompt: Union[str, List[int]], mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], lora_request: Optional[LoRARequest], @@ -275,7 +274,7 @@ def _process_multimodal( async def _process_multimodal_async( self, - prompt: Union[str, list[int]], + prompt: Union[str, List[int]], mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], lora_request: Optional[LoRARequest], @@ -501,7 +500,7 @@ def _separate_enc_dec_inputs_from_mm_processor_outputs( self, inputs: SingletonInputs, decoder_inputs_to_override: Optional[SingletonInputs] = None, - ) -> tuple[SingletonInputs, SingletonInputs]: + ) -> Tuple[SingletonInputs, SingletonInputs]: """ For encoder/decoder models only: Separate Encoder/Decoder inputs from a MultiModalEncDecInputs diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 4ec9304345399..691fcd7dc53f2 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -2,10 +2,9 @@ import functools from collections import UserDict -from collections.abc import Mapping from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, NamedTuple, Optional, - Protocol, Union) +from typing import (TYPE_CHECKING, Any, Callable, Mapping, NamedTuple, + Optional, Protocol, Union) from torch import nn from transformers import BatchFeature, PretrainedConfig, ProcessorMixin diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 40a366a876d7e..41e1ec94145db 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # pylint: disable=unused-argument -from typing import TYPE_CHECKING, Optional, Union, cast +from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast import torch import torch.nn as nn @@ -107,7 +107,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: # specifying kwargs so they can be easily accessed in decorator @@ -130,8 +130,8 @@ class MergedColumnParallelLinearWithShardedLoRA( """ def slice_lora_a( - self, lora_a: list[Union[torch.Tensor, None]] - ) -> list[Union[torch.Tensor, None]]: + self, lora_a: List[Union[torch.Tensor, None]] + ) -> List[Union[torch.Tensor, None]]: #NOTE: lora_a contains 2 subloras, and each sublora could be None. 
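The fully sharded LoRA hunks here revolve around slicing LoRA-A matrices per tensor-parallel rank. A hedged illustration (not vLLM code, and with 2-D weights instead of the real stacked layout) of that slicing, using the List[Union[torch.Tensor, None]] annotation introduced in this patch:

from typing import List, Union
import torch

def slice_lora_a_sketch(lora_a: List[Union[torch.Tensor, None]],
                        tp_rank: int,
                        shard_size: int) -> List[Union[torch.Tensor, None]]:
    # Each rank keeps only its window of the LoRA-A output dimension;
    # None sub-loras pass through untouched.
    start = tp_rank * shard_size
    return [
        None if t is None else t[:, start:start + shard_size]
        for t in lora_a
    ]
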
output_shard_size = self.lora_a_stacked[0].shape[2] output_start_idx = self.tp_rank * output_shard_size @@ -154,7 +154,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: # specifying kwargs so they can be easily accessed in decorator @@ -190,7 +190,7 @@ def apply(self, @classmethod @_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: list, + lora_config: LoRAConfig, packed_modules_list: List, model_config: Optional[PretrainedConfig]) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( @@ -211,8 +211,8 @@ class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA): """ def slice_lora_a( - self, lora_a: list[Union[torch.Tensor, None]] - ) -> list[Union[torch.Tensor, None]]: + self, lora_a: List[Union[torch.Tensor, None]] + ) -> List[Union[torch.Tensor, None]]: # NOTE: lora_a contains 3 subloras, and each sublora could be None. shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)] start_idx = [self.tp_rank * shard_size[i] for i in range(3)] @@ -237,7 +237,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: # specifying kwargs so they can be easily accessed in decorator @@ -270,7 +270,7 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: if bias is None: return bias - self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], self.lora_bias_stacked) shard_size = self.lora_bias_stacked[0].shape[2] start_idx = self.tp_rank * shard_size @@ -322,7 +322,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: # specifying kwargs so they can be easily accessed in decorator diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index d4cbb3e207e2b..6c48173c201b3 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -3,7 +3,7 @@ # pylint: disable=unused-argument import math from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional, Union, cast +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union, cast import torch import torch.nn as nn @@ -82,14 +82,14 @@ class LoRAMapping(AdapterMapping): class BaseLayerWithLoRA(nn.Module): def slice_lora_a( - self, lora_a: Union[torch.Tensor, list[Union[torch.Tensor, None]]] - ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]: + self, lora_a: Union[torch.Tensor, List[Union[torch.Tensor, None]]] + ) -> Union[torch.Tensor, List[Union[torch.Tensor, None]]]: """Slice lora a if splitting for tensor parallelism.""" ... def slice_lora_b( - self, lora_b: Union[torch.Tensor, list[Union[torch.Tensor, None]]] - ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]: + self, lora_b: Union[torch.Tensor, List[Union[torch.Tensor, None]]] + ) -> Union[torch.Tensor, List[Union[torch.Tensor, None]]]: """Slice lora b if splitting with tensor parallelism.""" ... 
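For the merged QKV variant in the preceding hunk, each sub-lora can carry its own shard size. A small illustrative sketch of that per-slice slicing, again assuming simplified 2-D weights and a hypothetical helper name:

from typing import List, Union
import torch

def slice_merged_lora_a_sketch(
        lora_a: List[Union[torch.Tensor, None]],
        tp_rank: int,
        shard_sizes: List[int]) -> List[Union[torch.Tensor, None]]:
    out: List[Union[torch.Tensor, None]] = []
    for sub_lora, shard in zip(lora_a, shard_sizes):
        if sub_lora is None:
            out.append(None)
            continue
        # Each sub-lora (q, k, v) has its own shard size and offset.
        start = tp_rank * shard
        out.append(sub_lora[:, start:start + shard])
    return out
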
@@ -128,7 +128,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" @@ -140,7 +140,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): def __init__(self, base_layer: VocabParallelEmbedding) -> None: super().__init__() self.base_layer = base_layer - self.embeddings_slice: Optional[tuple[int, int]] + self.embeddings_slice: Optional[Tuple[int, int]] self.embeddings_weights: Optional[torch.Tensor] def create_lora_weights( @@ -269,7 +269,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: return type(source_layer) is VocabParallelEmbedding @@ -282,9 +282,9 @@ def __init__(self, base_layer: LinearBase): self.base_layer = base_layer self.input_size = self.base_layer.input_size self.device = _get_lora_device(self.base_layer) - self.lora_bias_stacked: Optional[tuple[torch.Tensor, ...]] = None + self.lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None - self.output_slices: tuple[int, ...] + self.output_slices: Tuple[int, ...] self.tp_size: int self.output_size: int self.n_slices: int @@ -351,7 +351,7 @@ def reset_lora(self, index: int): self.lora_b_stacked[s_index][index] = 0 if self.lora_config.bias_enabled: # Make mypy happy - self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], self.lora_bias_stacked) self.lora_bias_stacked[s_index][index] = 0 @@ -385,7 +385,7 @@ def set_lora( lora_b.T, non_blocking=True) if lora_bias is not None: - self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], self.lora_bias_stacked) assert len(self.lora_bias_stacked) self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( @@ -413,7 +413,7 @@ def __init__(self, base_layer: ReplicatedLinear) -> None: def forward( self, input_: torch.Tensor - ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: """Forward of ReplicatedLinearWithLoRA Args: @@ -440,7 +440,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: return type(source_layer) is ReplicatedLinear @@ -506,7 +506,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: def forward( self, input_: torch.Tensor - ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: """Forward of ColumnParallelLinear Args: @@ -536,7 +536,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: return type(source_layer) is ColumnParallelLinear or ( @@ -613,13 +613,13 @@ def create_lora_weights( ) for output_size in self.output_slices) def slice_lora_a( - self, lora_a: list[Union[torch.Tensor, None]] - ) -> list[Union[torch.Tensor, None]]: + self, lora_a: List[Union[torch.Tensor, None]] + ) -> List[Union[torch.Tensor, None]]: return lora_a def slice_lora_b( - self, lora_b: list[Union[torch.Tensor, None]] - ) -> list[Union[torch.Tensor, None]]: + self, lora_b: List[Union[torch.Tensor, None]] 
+ ) -> List[Union[torch.Tensor, None]]: for i, (shard_id, shard_size) in enumerate( zip(self.output_ids, self.output_slices)): if (lora_b_i := lora_b[i]) is not None: @@ -628,8 +628,8 @@ def slice_lora_b( return lora_b def slice_bias( - self, bias: list[Union[torch.Tensor, - None]]) -> list[Union[torch.Tensor, None]]: + self, bias: List[Union[torch.Tensor, + None]]) -> List[Union[torch.Tensor, None]]: for i, (shard_id, shard_size) in enumerate( zip(self.output_ids, self.output_slices)): if (bias_i := bias[i]) is not None: @@ -664,7 +664,7 @@ def set_lora( lora_b_i.T, non_blocking=True) if lora_bias is not None: - self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], self.lora_bias_stacked) for i in range(self.n_slices): if (lora_bias_i := lora_bias[i]) is not None: @@ -679,7 +679,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: return (type(source_layer) is MergedColumnParallelLinear @@ -748,7 +748,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: @classmethod @_not_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: list, + lora_config: LoRAConfig, packed_modules_list: List, model_config: Optional[PretrainedConfig]) -> bool: return type(source_layer) is QKVParallelLinear and len( packed_modules_list) == 1 @@ -808,7 +808,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: return (type(source_layer) is QKVParallelLinear @@ -845,7 +845,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: def forward( self, input_: torch.Tensor - ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: """Forward of RowParallelLinear Args: @@ -893,7 +893,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: return type(source_layer) is RowParallelLinear @@ -916,7 +916,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): def __init__(self, base_layer: LogitsProcessor, hidden_size: int, dtype: torch.dtype, device: torch.device, - sharded_to_full_mapping: Optional[list[int]]) -> None: + sharded_to_full_mapping: Optional[List[int]]) -> None: super().__init__() self.base_layer = base_layer self.hidden_size = hidden_size @@ -1113,7 +1113,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: # Special handling for the LogitsProcessor. 
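Several hunks in layers.py repeat the pattern cast(Tuple[torch.Tensor, ...], self.lora_bias_stacked). The reason is that the attribute is declared Optional so it can start as None, while the mutating methods need the narrowed tuple type for indexing. A tiny sketch of the same pattern; the class name is hypothetical:

from typing import Optional, Tuple, cast
import torch

class BiasHolderSketch:
    def __init__(self) -> None:
        # Starts empty, so the declared type must be Optional.
        self.lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None

    def reset_slice(self, s_index: int, index: int) -> None:
        # Without this cast, mypy flags "Optional has no __getitem__"
        # even though the caller guarantees the attribute is populated.
        self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...],
                                      self.lora_bias_stacked)
        self.lora_bias_stacked[s_index][index] = 0
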
@@ -1180,7 +1180,7 @@ def forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: return self.base_layer( positions, query, @@ -1189,7 +1189,7 @@ def forward( ) @property - def scaling_factor_to_offset(self) -> dict[float, int]: + def scaling_factor_to_offset(self) -> Dict[float, int]: return self.base_layer.scaling_factor_to_offset @classmethod @@ -1197,7 +1197,7 @@ def can_replace_layer( cls, source_layer: nn.Module, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig], ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index 294b49e0a8997..00299bf6c2a81 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Sequence as GenericSequence -from typing import Optional +from typing import List, Optional +from typing import Sequence as GenericSequence import torch import torch.types @@ -125,11 +125,11 @@ def __init__( self, module_name: str, rank: int, - lora_alphas: list[Optional[int]], - lora_a: list[Optional[torch.Tensor]], - lora_b: list[Optional[torch.Tensor]], - bias: Optional[list[Optional[torch.Tensor]]] = None, - scaling: Optional[list[float]] = None, + lora_alphas: List[Optional[int]], + lora_a: List[Optional[torch.Tensor]], + lora_b: List[Optional[torch.Tensor]], + bias: Optional[List[Optional[torch.Tensor]]] = None, + scaling: Optional[List[float]] = None, ) -> None: super().__init__( module_name=module_name, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index cbd303deb58c3..e1294884ac2af 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,9 +4,9 @@ import math import os import re -from collections.abc import Sequence from dataclasses import dataclass, field -from typing import Any, Callable, Optional, Union +from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Type, + Union) import safetensors.torch import torch @@ -43,12 +43,12 @@ class LongContextLoRAContext: """Context for lora adapters that support long context.""" # The scaling factors to support long context lora fine tuned models. - scaling_factors: list[float] + scaling_factors: List[float] # dimension to apply rotary embedding. rot_dim: int # offsets to the sin_cos_cache for each lora_id loaded. # This value is dynamically modified. - offsets_by_lora_id: dict[int, int] = field(default_factory=dict) + offsets_by_lora_id: Dict[int, int] = field(default_factory=dict) def get_lora_id(): @@ -64,7 +64,7 @@ def __init__( self, lora_model_id: int, rank: int, - loras: dict[str, LoRALayerWeights], + loras: Dict[str, LoRALayerWeights], scaling_factor: Optional[float] = None, ) -> None: """ @@ -83,7 +83,7 @@ def __init__( lora_model_id > 0), f"a valid lora id should be greater than 0, got {self.id}" self.rank = rank - self.loras: dict[str, LoRALayerWeights] = loras + self.loras: Dict[str, LoRALayerWeights] = loras def clone(self, lora_model_id: int) -> "LoRAModel": """Return a copy of the object with different ids. 
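LongContextLoRAContext above is a dataclass whose Dict field must use field(default_factory=dict) rather than a mutable literal default. A short, self-contained illustration of that pattern with hypothetical values:

from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class LongContextSketch:
    scaling_factors: List[float]
    rot_dim: int
    # default_factory ensures every instance gets its own dict.
    offsets_by_lora_id: Dict[int, int] = field(default_factory=dict)

ctx = LongContextSketch(scaling_factors=[4.0, 8.0], rot_dim=128)
ctx.offsets_by_lora_id[1] = 0  # mutating one instance leaves others untouched
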
@@ -109,19 +109,19 @@ def get_lora(self, module_name: str) -> Optional[LoRALayerWeights]: def from_lora_tensors( cls, lora_model_id: int, - tensors: dict[str, torch.Tensor], + tensors: Dict[str, torch.Tensor], peft_helper: PEFTHelper, device: str = "cuda", dtype: Optional[torch.dtype] = None, - embeddings: Optional[dict[str, torch.Tensor]] = None, + embeddings: Optional[Dict[str, torch.Tensor]] = None, target_embedding_padding: Optional[int] = None, - embedding_modules: Optional[dict[str, str]] = None, - embedding_padding_modules: Optional[list[str]] = None, + embedding_modules: Optional[Dict[str, str]] = None, + embedding_padding_modules: Optional[List[str]] = None, weights_mapper: Optional[WeightsMapper] = None, ) -> "LoRAModel": """Create a LoRAModel from a dictionary of tensors.""" pin_memory = str(device) == "cpu" and is_pin_memory_available() - loras: dict[str, LoRALayerWeights] = {} + loras: Dict[str, LoRALayerWeights] = {} for tensor_name, tensor in tensors.items(): module_name, is_lora_a, is_bias = parse_fine_tuned_lora_name( tensor_name, weights_mapper) @@ -183,15 +183,15 @@ def from_lora_tensors( def from_local_checkpoint( cls, lora_dir: str, - expected_lora_modules: list[str], + expected_lora_modules: List[str], peft_helper: PEFTHelper, *, lora_model_id: Optional[int] = None, device: str = "cuda", dtype: Optional[torch.dtype] = None, target_embedding_padding: Optional[int] = None, - embedding_modules: Optional[dict[str, str]] = None, - embedding_padding_modules: Optional[list[str]] = None, + embedding_modules: Optional[Dict[str, str]] = None, + embedding_padding_modules: Optional[List[str]] = None, weights_mapper: Optional[WeightsMapper] = None, ) -> "LoRAModel": """Create a LoRAModel from a local checkpoint. @@ -216,9 +216,9 @@ def from_local_checkpoint( new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") - unexpected_modules: list[Union[list[str], str]] + unexpected_modules: List[Union[list[str], str]] if os.path.isfile(lora_tensor_path): - tensors: dict[str, torch.Tensor] = {} + tensors: Dict[str, torch.Tensor] = {} # Find unexpected modules. # Use safetensor key as a source of truth to find expected modules. # in peft if you have target_modules A, B, C and C does not exist @@ -323,7 +323,7 @@ def __init__( self.max_num_seqs = max_num_seqs assert self.capacity >= self.lora_slots self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 - self.lora_index_to_id: list[Optional[int]] = [None] * self.lora_slots + self.lora_index_to_id: List[Optional[int]] = [None] * self.lora_slots self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None self.punica_wrapper = get_punica_wrapper(max_num_batched_tokens, @@ -331,7 +331,7 @@ def __init__( device=self.device) # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. - self.scaling_factor_to_offset: dict[float, int] = {} + self.scaling_factor_to_offset: Dict[float, int] = {} super().__init__(model) self.supported_lora_modules = get_supported_lora_modules(self.model) assert self.supported_lora_modules, "No supported LoRA modules found in" @@ -348,9 +348,9 @@ def __init__( # In case the model only supports LoRA for # text modules (e.g. ChatGLM) and hasattr(self.model, "get_mm_mapping")) - self.packed_modules: dict[str, list[str]] = {} - self.modules: dict[str, BaseLayerWithLoRA] = {} - # dict instead of a Set for compatibility with LRUCache. 
+ self.packed_modules: Dict[str, List[str]] = {} + self.modules: Dict[str, BaseLayerWithLoRA] = {} + # Dict instead of a Set for compatibility with LRUCache. self._last_mapping: Optional[LoRAMapping] = None self._create_lora_modules() self.model.lora_manager = self @@ -520,7 +520,7 @@ def create_dummy_lora( lora_id: int, rank: int, scaling_factor: Optional[float], - embedding_modules: Optional[dict[str, str]] = None) -> LoRAModel: + embedding_modules: Optional[Dict[str, str]] = None) -> LoRAModel: """Create zero-initialized LoRAModel for warmup.""" model = LoRAModel(lora_id, rank, {}, scaling_factor) for module_name, module in self.model.named_modules(): @@ -568,7 +568,7 @@ def create_dummy_lora( else: parts = module_name.split(".") replacements = self.packed_modules_mapping[parts[-1]] - subloras: list[Optional[LoRALayerWeights]] = [] + subloras: List[Optional[LoRALayerWeights]] = [] for i, r in enumerate(replacements): lora = LoRALayerWeights.create_dummy_lora_weights( module_name + "." + r, @@ -620,8 +620,8 @@ def _register_packed_modules(self, module_full_name: str) -> None: def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: for module_name, new_module_names in self.packed_modules.items(): - replacement_loras: list[Optional[LoRALayerWeights]] = [] - replaced_module: set[str] = set() + replacement_loras: List[Optional[LoRALayerWeights]] = [] + replaced_module: Set[str] = set() has_replacement = False for r in new_module_names: lora = lora_model.get_lora(r) @@ -662,7 +662,7 @@ def remove_adapter(self, adapter_id: int) -> bool: return remove_adapter(adapter_id, self._registered_adapters, self.deactivate_adapter) - def list_adapters(self) -> dict[int, Any]: + def list_adapters(self) -> Dict[int, Any]: return list_adapters(self._registered_adapters) def get_adapter(self, adapter_id: int) -> Optional[Any]: @@ -689,7 +689,7 @@ def __init__(self, model: nn.Module, max_num_seqs: int, self._active_adapters: LoRALRUCache = LoRALRUCache( self.lora_slots, self._deactivate_adapter) - def list_adapters(self) -> dict[int, LoRAModel]: + def list_adapters(self) -> Dict[int, LoRAModel]: """List all registered LoRAModels.""" return dict(self._registered_adapters.cache) @@ -754,7 +754,7 @@ def create_lora_manager( vocab_size: int, lora_config: LoRAConfig, device: torch.device, - lora_manager_cls: type[LoRAModelManager] = LoRAModelManager, + lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager, **kwargs) -> LoRAModelManager: """Create a LoRA adapter for a given model.""" if not hasattr(model, "packed_modules_mapping"): diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py index ae4afa759bcea..6aa3eafaba4c0 100644 --- a/vllm/lora/ops/triton_ops/sgmv_expand.py +++ b/vllm/lora/ops/triton_ops/sgmv_expand.py @@ -6,6 +6,8 @@ https://arxiv.org/abs/2310.18547 """ +from typing import List + import torch import triton import triton.language as tl @@ -117,7 +119,7 @@ def _sgmv_expand_kernel( @torch.inference_mode() def _sgmv_expand( inputs: torch.Tensor, - lora_b_weights: list[torch.Tensor], + lora_b_weights: List[torch.Tensor], output_tensor: torch.Tensor, b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, @@ -131,7 +133,7 @@ def _sgmv_expand( """ Args: inputs (torch.Tensor): input tensor - lora_b_weights (list[torch.Tensor]): lora'b weight + lora_b_weights (List[torch.Tensor]): lora'b weight output_tensor (torch.Tensor): output tensor b_seq_start_loc (torch.Tensor): (batch_size,). 
The cumulative sequence lengths of the sequences in the batch, used to index @@ -220,7 +222,7 @@ def _sgmv_expand( def _sgmv_expand_fake( inputs: torch.Tensor, - lora_b_weights: list[torch.Tensor], + lora_b_weights: List[torch.Tensor], output_tensor: torch.Tensor, b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py index 04f5beffc7810..b8ed0b020f9ac 100644 --- a/vllm/lora/ops/triton_ops/sgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py @@ -6,6 +6,8 @@ https://arxiv.org/abs/2310.18547 """ +from typing import List + import torch import triton import triton.language as tl @@ -110,7 +112,7 @@ def _sgmv_shrink_kernel( @torch.inference_mode() def _sgmv_shrink( inputs: torch.Tensor, - lora_a_weights: list[torch.Tensor], + lora_a_weights: List[torch.Tensor], output_tensor: torch.Tensor, b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, @@ -123,7 +125,7 @@ def _sgmv_shrink( """ Args: inputs (torch.Tensor): input tensor - lora_a_weights (list[torch.Tensor]): lora'a weight + lora_a_weights (List[torch.Tensor]): lora'a weight output_tensor (torch.Tensor): output tensor b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative sequence lengths of the sequences in the batch, used to index @@ -196,7 +198,7 @@ def _sgmv_shrink( def sgmv_shrink_fake( inputs: torch.Tensor, - lora_a_weights: list[torch.Tensor], + lora_a_weights: List[torch.Tensor], output_tensor: torch.Tensor, b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py index 1f52f1bc8dd5f..78409b91a14e8 100644 --- a/vllm/lora/ops/triton_ops/utils.py +++ b/vllm/lora/ops/triton_ops/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import functools +from typing import Dict, List, Tuple import torch @@ -35,7 +36,7 @@ def _get_default_config(op_type: str, batch: int, hidden_size: int): def get_lora_op_configs(op_type: str, batch: int, - hidden_size: int) -> dict[str, int]: + hidden_size: int) -> Dict[str, int]: """Inspired by `fused_moe_kernel` The return value will be a dictionary mapping an irregular grid of batch sizes and hidden_size to configurations of the bgmv-related kernel. @@ -49,11 +50,11 @@ def get_lora_op_configs(op_type: str, batch: int, return config -_LORA_A_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {} -_LORA_B_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {} +_LORA_A_PTR_DICT: Dict[Tuple[int, ...], Tuple[torch.tensor, ...]] = {} +_LORA_B_PTR_DICT: Dict[Tuple[int, ...], Tuple[torch.tensor, ...]] = {} -def _get_lora_a_ptr(lora_a_weights: list[torch.Tensor], device: str): +def _get_lora_a_ptr(lora_a_weights: List[torch.Tensor], device: str): """ `_LORA_A_PTR_DICT` collects the required information during `profile_run`, After this, it remains constant and subsequent usage is through LUT. 
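The _LORA_A_PTR_DICT / _LORA_B_PTR_DICT docstrings describe a lookup table that is filled once during profile_run and reused afterwards. The following is only a rough sketch of that caching idea under simplifying assumptions (keying by tensor identity, storing pointer and size tensors); it is not the actual helper:

from typing import Dict, List, Tuple
import torch

_PTR_LUT: Dict[Tuple[int, ...], Tuple[torch.Tensor, ...]] = {}

def get_weight_metadata_sketch(
        weights: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]:
    # The same weight objects are passed on every step after profiling,
    # so their identities form a stable cache key.
    key = tuple(id(w) for w in weights)
    if key not in _PTR_LUT:
        sizes = torch.tensor([w.shape[-1] for w in weights])
        ptrs = torch.tensor([w.data_ptr() for w in weights])
        _PTR_LUT[key] = (ptrs, sizes)
    return _PTR_LUT[key]
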
@@ -98,7 +99,7 @@ def _get_lora_a_ptr(lora_a_weights: list[torch.Tensor], device: str): return _LORA_A_PTR_DICT.get(key) -def _get_lora_b_ptr(lora_weights: list[torch.Tensor], offset_start: int, +def _get_lora_b_ptr(lora_weights: List[torch.Tensor], offset_start: int, device: str): """ `_LORA_B_PTR_DICT` collects the required information during `profile_run`, diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index d5de63f5baade..f6944368b36ee 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -6,7 +6,7 @@ import math import os from dataclasses import MISSING, dataclass, field, fields -from typing import Literal, Optional, Union +from typing import List, Literal, Optional, Union from vllm.config import LoRAConfig from vllm.logger import init_logger @@ -40,7 +40,7 @@ class PEFTHelper: vllm_max_position_embeddings: Optional[int] = field(default=False) vllm_long_context_scaling_factor: Optional[float] = field(default=None) - def _validate_features(self) -> list[str]: + def _validate_features(self) -> List[str]: """ Check if there are any unsupported LoRA features. """ diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index 38d1ce6584d2b..94fa3f27ab604 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -7,7 +7,7 @@ """ from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union import torch @@ -28,7 +28,7 @@ class PunicaWrapperABC(ABC): def update_metadata( self, mapping: "LoRAMapping", - lora_index_to_id: list[Optional[int]], + lora_index_to_id: List[Optional[int]], max_loras: int, vocab_size: int, extra_vocab_size: int, @@ -43,9 +43,9 @@ def update_metadata( @abstractmethod def add_shrink( self, - y: Union[tuple[torch.Tensor, ...], torch.Tensor], + y: Union[Tuple[torch.Tensor, ...], torch.Tensor], x: torch.Tensor, - lora_a_stacked: tuple[torch.Tensor, ...], + lora_a_stacked: Tuple[torch.Tensor, ...], scale: float, **kwargs, ) -> None: @@ -59,10 +59,10 @@ def add_shrink( def add_expand( self, y: torch.Tensor, - x: Union[tuple[torch.Tensor, ...], torch.Tensor], - lora_b_stacked: tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], - output_slices: tuple[int, ...], + x: Union[Tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + output_slices: Tuple[int, ...], offset_start: int = 0, add_inputs=True, **kwargs, @@ -91,13 +91,13 @@ def add_lora_embedding( def add_lora_linear(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: tuple[torch.Tensor, ...], - lora_b_stacked: tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], scale: float, - output_slices: tuple[int, ...], + output_slices: Tuple[int, ...], *, - buffer: Optional[tuple[torch.Tensor, ...]] = None, + buffer: Optional[Tuple[torch.Tensor, ...]] = None, **kwargs) -> None: """ Applicable to linear-related lora. @@ -150,7 +150,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, # 5 is the number of indices tensors. 
# base_indices, sampler_indices, sampler_indices_padded, # embeddings_indices,long_lora_indices - self.indices_len: list[Optional[int]] = [None] * 5 + self.indices_len: List[Optional[int]] = [None] * 5 # these attributes are the information required for sgmv kernel self._seq_start_locs = torch.empty(max_batches, dtype=torch.long, @@ -171,7 +171,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, def _update_base_metadata( self, mapping: "LoRAMapping", - lora_index_to_id: list[Optional[int]], + lora_index_to_id: List[Optional[int]], max_loras: int, vocab_size: int, extra_vocab_size: int, @@ -227,8 +227,8 @@ def _apply_bias( self, indices: torch.Tensor, output: torch.Tensor, - output_slices: tuple[int, ...], - lora_bias_stacked: tuple[Optional[torch.Tensor], ...], + output_slices: Tuple[int, ...], + lora_bias_stacked: Tuple[Optional[torch.Tensor], ...], ): """Applies bias to output @@ -258,7 +258,7 @@ def _apply_bias( @property def prefill_metadata( self - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int]: + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int]: """ This property provides a convenient way to access the necessary metadata for prefill-related kernel computations. @@ -322,7 +322,7 @@ def long_lora_indices(self) -> torch.Tensor: def update_metadata( self, mapping: "LoRAMapping", - lora_index_to_id: list[Optional[int]], + lora_index_to_id: List[Optional[int]], max_loras: int, vocab_size: int, extra_vocab_size: int, @@ -340,8 +340,8 @@ def update_metadata( self.is_prefill = False @abstractmethod - def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor], - x: torch.Tensor, lora_a_stacked: tuple[torch.Tensor, ...], + def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], scale: float, **kwargs) -> None: """ Performs GEMM for multiple slices of lora_a. @@ -351,9 +351,9 @@ def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor], y[i] += (x @ lora_a_stacked[i]) * scale Args: - y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors x (torch.Tensor): Input tensor - lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights scale (float): Scaling factor for the operation """ @@ -363,10 +363,10 @@ def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor], @abstractmethod def add_expand(self, y: torch.Tensor, - x: Union[tuple[torch.Tensor, ...], torch.Tensor], - lora_b_stacked: tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], - output_slices: tuple[int, ...], + x: Union[Tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + output_slices: Tuple[int, ...], offset_start: int = 0, add_inputs=True, **kwargs) -> None: @@ -383,11 +383,11 @@ def add_expand(self, Args: y (torch.Tensor): Output tensor. 
- x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors - lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight - lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): + x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): bias's weight - output_slices (tuple[int, ...]): Every slice's size + output_slices (Tuple[int, ...]): Every slice's size offset_start (int): The starting position of y, defaults to 0 add_inputs (bool): Defaults to True. @@ -421,13 +421,13 @@ def add_lora_embedding(self, def add_lora_linear(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: tuple[torch.Tensor, ...], - lora_b_stacked: tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], scale: float, - output_slices: tuple[int, ...], + output_slices: Tuple[int, ...], *, - buffer: Optional[tuple[torch.Tensor, ...]] = None, + buffer: Optional[Tuple[torch.Tensor, ...]] = None, **kwargs) -> None: """ Applicable to linear-related lora. @@ -444,12 +444,12 @@ def add_lora_linear(self, Args: y (torch.Tensor): Output tensor. Will be changed in-place. x (torch.Tensor): Input tensor - lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight. - lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight. - lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): lora's bias. + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. scale (float): Scaling factor. - output_slices (tuple[int, ...]): Every slice's size. - buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None. + output_slices (Tuple[int, ...]): Every slice's size. + buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. """ # TODO: implement it based on torch ops raise NotImplementedError diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py index 8118a72d696a2..29428f4cfff31 100644 --- a/vllm/lora/punica_wrapper/punica_cpu.py +++ b/vllm/lora/punica_wrapper/punica_cpu.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional, Union +from typing import Callable, Optional, Tuple, Union import torch @@ -150,8 +150,8 @@ def _apply_shrink(self, y: torch.Tensor, x: torch.Tensor, shrink_fun(y, x, w_t_all, scale) y = y.view_as(y_org) - def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor], - x: torch.Tensor, lora_a_stacked: tuple[torch.Tensor, ...], + def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], scale: float, **kwargs): """ Performs GEMM for multiple slices of lora_a. 
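The shrink/expand docstrings in these hunks spell out the underlying math: y[i] += (x @ lora_a_stacked[i]) * scale, then each output slice accumulates x[i] @ lora_b_stacked[i]. A plain-PyTorch reference sketch of exactly that, with shapes simplified (no stacking over LoRA slots) and names that are illustrative rather than the Punica kernels:

from typing import Tuple
import torch

def add_shrink_ref(y: Tuple[torch.Tensor, ...], x: torch.Tensor,
                   lora_a_stacked: Tuple[torch.Tensor, ...],
                   scale: float) -> None:
    # y_i: (n, r), x: (n, d), a_i: (d, r)
    for y_i, a_i in zip(y, lora_a_stacked):
        y_i += (x @ a_i) * scale

def add_expand_ref(y: torch.Tensor, x: Tuple[torch.Tensor, ...],
                   lora_b_stacked: Tuple[torch.Tensor, ...],
                   output_slices: Tuple[int, ...]) -> None:
    # y: (n, D); each slice of width output_slices[i] gets x[i] @ b_i.
    offset = 0
    for x_i, b_i, size in zip(x, lora_b_stacked, output_slices):
        y[:, offset:offset + size] += x_i @ b_i
        offset += size
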
@@ -165,9 +165,9 @@ def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor], y[i] += (x @ lora_a_stacked[i]) * scale Args: - y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors x (torch.Tensor): Input tensor - lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights scale (float): Scaling factor for the operation """ @@ -179,10 +179,10 @@ def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor], def add_expand(self, y: torch.Tensor, - x: Union[tuple[torch.Tensor, ...], torch.Tensor], - lora_b_stacked: tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], - output_slices: tuple[int, ...], + x: Union[Tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + output_slices: Tuple[int, ...], offset_start: int = 0, add_inputs=True, **kwargs) -> None: @@ -198,11 +198,11 @@ def add_expand(self, Args: y (torch.Tensor): Output tensor. - x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors - lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight - lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): + x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): bias's weight - output_slices (tuple[int, ...]): Every slice's size + output_slices (Tuple[int, ...]): Every slice's size add_inputs (bool): Defaults to True. """ y_org = y @@ -250,13 +250,13 @@ def add_lora_embedding(self, def add_lora_linear(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: tuple[torch.Tensor, ...], - lora_b_stacked: tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], scale: float, - output_slices: tuple[int, ...], + output_slices: Tuple[int, ...], *, - buffer: Optional[tuple[torch.Tensor, ...]] = None, + buffer: Optional[Tuple[torch.Tensor, ...]] = None, **kwargs) -> None: """ Applicable to linear-related lora. @@ -273,12 +273,12 @@ def add_lora_linear(self, Args: y (torch.Tensor): Output tensor. Will be changed in-place. x (torch.Tensor): Input tensor - lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight. - lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight. - lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): lora's bias. + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. scale (float): Scaling factor. - output_slices (tuple[int, ...]): Every slice's size. - buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None. + output_slices (Tuple[int, ...]): Every slice's size. + buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. 
""" assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 9a54243a070ea..9ccd9c36a073e 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -6,7 +6,7 @@ https://arxiv.org/abs/2310.18547 """ -from typing import Optional, Union, final +from typing import Optional, Tuple, Union, final import torch @@ -39,7 +39,7 @@ def _apply_shrink_prefill( self, y: torch.Tensor, x: torch.Tensor, - w_t_all: tuple[torch.Tensor, ...], + w_t_all: Tuple[torch.Tensor, ...], scale: float, ): #No LoRA request, so return directly @@ -95,8 +95,8 @@ def _apply_expand_decode( bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_inputs) - def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor], - x: torch.Tensor, lora_a_stacked: tuple[torch.Tensor, ...], + def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], scale: float, **kwargs): """ Performs GEMM for multiple slices of lora_a. @@ -110,9 +110,9 @@ def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor], y[i] += (x @ lora_a_stacked[i]) * scale Args: - y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors x (torch.Tensor): Input tensor - lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights scale (float): Scaling factor for the operation """ @@ -129,10 +129,10 @@ def add_shrink(self, y: Union[tuple[torch.Tensor, ...], torch.Tensor], def add_expand(self, y: torch.Tensor, - x: Union[tuple[torch.Tensor, ...], torch.Tensor], - lora_b_stacked: tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], - output_slices: tuple[int, ...], + x: Union[Tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + output_slices: Tuple[int, ...], offset_start: int = 0, add_inputs=True, **kwargs) -> None: @@ -148,11 +148,11 @@ def add_expand(self, Args: y (torch.Tensor): Output tensor. - x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors - lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight - lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): + x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): bias's weight - output_slices (tuple[int, ...]): Every slice's size + output_slices (Tuple[int, ...]): Every slice's size add_inputs (bool): Defaults to True. """ y_org = y @@ -216,13 +216,13 @@ def add_lora_embedding(self, def add_lora_linear(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: tuple[torch.Tensor, ...], - lora_b_stacked: tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], scale: float, - output_slices: tuple[int, ...], + output_slices: Tuple[int, ...], *, - buffer: Optional[tuple[torch.Tensor, ...]] = None, + buffer: Optional[Tuple[torch.Tensor, ...]] = None, **kwargs) -> None: """ Applicable to linear-related lora. 
@@ -239,12 +239,12 @@ def add_lora_linear(self, Args: y (torch.Tensor): Output tensor. Will be changed in-place. x (torch.Tensor): Input tensor - lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight. - lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight. - lora_bias_stacked (Optional[tuple[torch.Tensor, ...]]): lora's bias. + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. scale (float): Scaling factor. - output_slices (tuple[int, ...]): Every slice's size. - buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None. + output_slices (Tuple[int, ...]): Every slice's size. + buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. """ assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py index 416c23e73bf85..3661a7214648a 100644 --- a/vllm/lora/punica_wrapper/punica_hpu.py +++ b/vllm/lora/punica_wrapper/punica_hpu.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import TYPE_CHECKING, Optional, Union, final +from typing import TYPE_CHECKING, List, Optional, Tuple, Union, final import torch from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, @@ -28,7 +28,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, def _update_base_metadata( self, mapping: "LoRAMapping", - lora_index_to_id: list[Optional[int]], + lora_index_to_id: List[Optional[int]], max_loras: int, vocab_size: int, extra_vocab_size: int, @@ -48,9 +48,9 @@ def _update_base_metadata( # graph accumulation. Hence HPU appends `lora_offset` to a list and # converts it to a tensor only after it is ready. 
if long_lora_context: - index_mapping_indices: list[int] = list( + index_mapping_indices: List[int] = list( mapping.index_mapping).copy() - long_lora_offsets: list[int] = [] + long_lora_offsets: List[int] = [] for i in range(len(index_mapping_indices)): lora_offset: int = long_lora_context.offsets_by_lora_id.get( index_mapping_indices[i], 0) @@ -85,13 +85,13 @@ def add_lora_embedding(self, def add_lora_linear(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: tuple[torch.Tensor, ...], - lora_b_stacked: tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], scale: float, - output_slices: tuple[int, ...], + output_slices: Tuple[int, ...], *, - buffer: Optional[tuple[torch.Tensor, ...]] = None, + buffer: Optional[Tuple[torch.Tensor, ...]] = None, **kwargs) -> None: y_org = y x = x.view(-1, x.shape[-1]) @@ -122,9 +122,9 @@ def add_lora_logits(self, def add_shrink( self, - y: Union[tuple[torch.Tensor, ...], torch.Tensor], + y: Union[Tuple[torch.Tensor, ...], torch.Tensor], x: torch.Tensor, - lora_a_stacked: tuple[torch.Tensor, ...], + lora_a_stacked: Tuple[torch.Tensor, ...], scale: float, **kwargs, ) -> None: @@ -133,10 +133,10 @@ def add_shrink( def add_expand( self, y: torch.Tensor, - x: Union[tuple[torch.Tensor, ...], torch.Tensor], - lora_b_stacked: tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], - output_slices: tuple[int, ...], + x: Union[Tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + output_slices: Tuple[int, ...], offset_start: int = 0, add_inputs=True, **kwargs, diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py index c37d2b2bddcb2..dbc2d27c597f2 100644 --- a/vllm/lora/punica_wrapper/utils.py +++ b/vllm/lora/punica_wrapper/utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union import torch @@ -12,7 +12,7 @@ def compute_meta( token_lora_tensor: torch.Tensor -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, bool]: +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, bool]: """ Get the information required for the sgmv kernel. With the features: 1. If consecutive requests in the batch use the same LoRA, this function @@ -43,19 +43,19 @@ def compute_meta( # TODO see if this can be vectorized def convert_mapping( mapping: "LoRAMapping", - lora_index_to_id: list[Optional[int]], + lora_index_to_id: List[Optional[int]], max_loras: int, vocab_size: int, extra_vocab_size: int, device: torch.device, long_lora_context: Optional["LongContextLoRAContext"] = None, -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor], list[int]]: +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, + Optional[torch.Tensor], List[int]]: """Converts LoRAMapping to index tensors. Args: mapping: LoRAMapping mapping rows in a batch to LoRA ids. - lora_index_to_id: list mapping LoRA ids to LoRA indices. + lora_index_to_id: List mapping LoRA ids to LoRA indices. max_loras: Maximum number of LoRAs. vocab_size: Model vocab size. extra_vocab_size: Extra vocab size each LoRA can have. 
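convert_mapping's docstring describes translating LoRA ids into slot indices via lora_index_to_id, with id 0 (no LoRA) mapping to -1. A minimal sketch of just that translation step, with a worked example; the helper name is hypothetical:

from typing import List, Optional

def map_prompt_lora_indices_sketch(
        prompt_mapping: List[int],
        lora_index_to_id: List[Optional[int]]) -> List[int]:
    # A request with lora_id 0 has no LoRA and maps to -1; otherwise the
    # id's position in lora_index_to_id is the slot index.
    return [
        lora_index_to_id.index(lora_id) if lora_id > 0 else -1
        for lora_id in prompt_mapping
    ]

# e.g. lora_index_to_id == [None, 7, 3] and prompt_mapping == [3, 0, 7]
# yields [2, -1, 1].
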
@@ -80,11 +80,11 @@ def convert_mapping( long_lora_indices: Tensor of shape [batch_size] mapping requests to RoPE offsets and rot dims for long LoRAs. None if long context lora doesn't exist. - indices_len: list of lengths of the above tensors. It contains + indices_len: List of lengths of the above tensors. It contains (base_indices, sampler_indices, sampler_indices_padded, embeddings_indices, long_lora_indices). """ - index_mapping_indices: list[int] = list(mapping.index_mapping).copy() + index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None @@ -92,7 +92,7 @@ def convert_mapping( long_lora_offsets = torch.zeros(len(index_mapping_indices), device=device, dtype=torch.long) - prompt_mapping: list[int] = [ + prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping ] @@ -109,7 +109,7 @@ def convert_mapping( index_mapping_indices[i], 0) long_lora_offsets[i] = lora_offset - indices_list: list[Union[list[int], torch.Tensor]] = [ + indices_list: List[Union[List[int], torch.Tensor]] = [ index_mapping_indices, lora_indices, embedding_indices, diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index dee1c94f3527a..63b465fdf7432 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -2,7 +2,7 @@ import os import re -from typing import Optional, Union +from typing import List, Optional, Set, Tuple, Type, Union import huggingface_hub from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, @@ -37,7 +37,7 @@ logger = init_logger(__name__) -_all_lora_classes: set[type[BaseLayerWithLoRA]] = { +_all_lora_classes: Set[Type[BaseLayerWithLoRA]] = { VocabParallelEmbeddingWithLoRA, ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, @@ -58,7 +58,7 @@ def from_layer(layer: nn.Module, max_loras: int, lora_config: LoRAConfig, - packed_modules_list: list, + packed_modules_list: List, model_config: Optional[PretrainedConfig] = None) -> nn.Module: for lora_cls in _all_lora_classes: # specifying kwargs so they can be easily accessed in decorator @@ -106,7 +106,7 @@ def replace_submodule(model: nn.Module, module_name: str, def parse_fine_tuned_lora_name( name: str, weights_mapper: Optional[WeightsMapper] = None -) -> tuple[str, bool, bool]: +) -> Tuple[str, bool, bool]: """Parse the name of lora weights. args: @@ -115,7 +115,7 @@ def parse_fine_tuned_lora_name( weights_mapper: maps the name of weight, e.g. `model.` -> `language_model.model.`, return: - tuple(module_name, is_lora_a): + Tuple(module_name, is_lora_a): module_name: the name of the module, e.g. model.dense1, is_lora_a whether the tensor is lora_a or lora_b. is_bias whether the tensor is lora bias. @@ -147,8 +147,8 @@ def parse_fine_tuned_lora_name( raise ValueError(f"{name} is unsupported LoRA weight") -def is_regex_target_modules(load_modules: Union[str, list[str]], - expected_lora_modules: list[str]) -> bool: +def is_regex_target_modules(load_modules: Union[str, List[str]], + expected_lora_modules: List[str]) -> bool: """ PEFT supports passing `target_modules` in the form of regular expressions, such as `model.*(q_proj|k_proj|v_proj)$`. This function is mainly used to @@ -179,11 +179,11 @@ def is_subset(sub_list, full_list): return False -def get_supported_lora_modules(model: nn.Module) -> list[str]: +def get_supported_lora_modules(model: nn.Module) -> List[str]: """ In vLLM, all linear layers support LoRA. 
""" - supported_lora_modules: set[str] = set() + supported_lora_modules: Set[str] = set() # step1: traverse the model to get all the linear subfixes. for name, module in model.named_modules(): if isinstance(module, (LinearBase, )): diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 8e5bc61066593..108beb34b244a 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from contextlib import contextmanager -from typing import Any, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Set, Type, Union import torch @@ -27,7 +27,7 @@ class WorkerLoRAManager(AbstractWorkerManager): Every request, the requested LoRAs will be loaded (unless they are already loaded), and every other LoRA will be unloaded.""" - _manager_cls: type[LoRAModelManager] = LoRAModelManager + _manager_cls: Type[LoRAModelManager] = LoRAModelManager def __init__( self, @@ -36,9 +36,9 @@ def __init__( vocab_size: int, lora_config: LoRAConfig, device: torch.device, - embedding_modules: dict[str, str], - embedding_padding_modules: list[str], - lora_model_cls: type[LoRAModel] = LoRAModel, + embedding_modules: Dict[str, str], + embedding_padding_modules: List[str], + lora_model_cls: Type[LoRAModel] = LoRAModel, max_position_embeddings: Optional[int] = None, ): self._lora_model_cls = lora_model_cls @@ -88,7 +88,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: self._adapter_manager.supported_lora_modules) packed_modules_mapping = ( self._adapter_manager.packed_modules_mapping) - expected_lora_modules: list[str] = [] + expected_lora_modules: List[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: expected_lora_modules.extend( @@ -162,12 +162,12 @@ def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: def pin_adapter(self, adapter_id: int) -> bool: return self._adapter_manager.pin_adapter(adapter_id) - def set_active_adapters(self, requests: set[Any], + def set_active_adapters(self, requests: Set[Any], mapping: Optional[Any]) -> None: set_active_adapters_worker(requests, mapping, self._apply_adapters, self._adapter_manager.set_adapter_mapping) - def _apply_adapters(self, adapter_requests: set[Any]) -> None: + def _apply_adapters(self, adapter_requests: Set[Any]) -> None: apply_adapters_worker(adapter_requests, self.list_adapters, self._adapter_manager.adapter_slots, self.remove_adapter, self.add_adapter) @@ -184,7 +184,7 @@ def remove_adapter(self, adapter_id: int) -> bool: def remove_all_adapters(self): self._adapter_manager.remove_all_adapters() - def list_adapters(self) -> set[int]: + def list_adapters(self) -> Set[int]: return list_adapters_worker(self._adapter_manager.list_adapters) @@ -195,7 +195,7 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager): (unless they are already loaded) and least recently used LoRAs will be unloaded if the cache is above capacity.""" - _manager_cls: type[LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager + _manager_cls: Type[LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager def create_lora_manager( self, @@ -213,7 +213,7 @@ def create_lora_manager( self._adapter_manager = lora_manager return lora_manager.model - def _apply_adapters(self, lora_requests: set[LoRARequest]) -> None: + def _apply_adapters(self, lora_requests: Set[LoRARequest]) -> None: loras_map = { lora_request.lora_int_id: lora_request for lora_request in lora_requests if lora_request diff --git a/vllm/model_executor/custom_op.py 
b/vllm/model_executor/custom_op.py index d0c3c0280428b..dfd052f625211 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import Dict, Type + import torch.nn as nn from vllm.config import get_current_vllm_config @@ -136,7 +138,7 @@ def default_on() -> bool: # Examples: # - MyOp.enabled() # - op_registry["my_op"].enabled() - op_registry: dict[str, type['CustomOp']] = {} + op_registry: Dict[str, Type['CustomOp']] = {} # Decorator to register custom ops. @classmethod diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py index 2b84cad0d1f5c..db4ce26806c1f 100644 --- a/vllm/model_executor/guided_decoding/guided_fields.py +++ b/vllm/model_executor/guided_decoding/guided_fields.py @@ -1,16 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Optional, TypedDict, Union +from typing import Dict, List, Optional, TypedDict, Union from pydantic import BaseModel # These classes are deprecated, see SamplingParams class LLMGuidedOptions(TypedDict, total=False): - guided_json: Union[dict, BaseModel, str] + guided_json: Union[Dict, BaseModel, str] guided_regex: str - guided_choice: list[str] + guided_choice: List[str] guided_grammar: str guided_decoding_backend: str guided_whitespace_pattern: str @@ -20,9 +20,9 @@ class LLMGuidedOptions(TypedDict, total=False): @dataclass class GuidedDecodingRequest: """One of the fields will be used to retrieve the logit processor.""" - guided_json: Optional[Union[dict, BaseModel, str]] = None + guided_json: Optional[Union[Dict, BaseModel, str]] = None guided_regex: Optional[str] = None - guided_choice: Optional[list[str]] = None + guided_choice: Optional[List[str]] = None guided_grammar: Optional[str] = None guided_decoding_backend: Optional[str] = None guided_whitespace_pattern: Optional[str] = None diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index a480fabf7d803..ba9c982903682 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -6,7 +6,7 @@ from enum import Enum from json import dumps as json_dumps from re import escape as regex_escape -from typing import Union +from typing import Tuple, Union from transformers import PreTrainedTokenizerBase @@ -105,7 +105,7 @@ def get_local_outlines_guided_decoding_logits_processor( def _get_guide_and_mode( guided_params: GuidedDecodingParams -) -> Union[tuple[str, GuidedDecodingMode], tuple[None, None]]: +) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]: if guided_params.json: if isinstance(guided_params.json, dict): # turn dict into hashable string diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index e5d926e82fc67..a05267d921d1a 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -19,7 +19,7 @@ import json from collections import defaultdict from functools import lru_cache -from typing import Callable, Union +from typing import Callable, DefaultDict, Dict, List, Union import numpy as np import torch @@ -40,10 +40,10 @@ class BaseLogitsProcessor: def __init__(self, guide: Guide): self._guide: Guide = guide # CFGState is used for the FSM state for 
CFGGuide - self._fsm_state: defaultdict[int, Union[int, + self._fsm_state: DefaultDict[int, Union[int, CFGState]] = defaultdict(int) - def __call__(self, input_ids: list[int], + def __call__(self, input_ids: List[int], scores: torch.Tensor) -> torch.Tensor: """Use the FSM to bias the logits before sampling the next token.""" seq_id = hash(tuple(input_ids)) @@ -130,7 +130,7 @@ def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase): class JSONLogitsProcessor(RegexLogitsProcessor): - def __init__(self, schema: Union[str, dict, BaseModel], + def __init__(self, schema: Union[str, Dict, BaseModel], tokenizer: PreTrainedTokenizerBase, whitespace_pattern: Union[str, None]): """Compile the FSM that drives the JSON-guided generation. @@ -150,7 +150,7 @@ def __init__(self, schema: Union[str, dict, BaseModel], """ if isinstance(schema, type(BaseModel)): schema_str = json.dumps(schema.model_json_schema()) - elif isinstance(schema, dict): + elif isinstance(schema, Dict): schema_str = json.dumps(schema) elif isinstance(schema, str): schema_str = schema @@ -219,11 +219,11 @@ def convert_token_to_string(token: str) -> str: return string def change_decoder( - decoder: Callable[[list[int]], - str]) -> Callable[[list[int]], list[str]]: + decoder: Callable[[List[int]], + str]) -> Callable[[List[int]], List[str]]: """Sync vLLM's decoder with the outlines by returning list.""" - def new_decoder(inp_tokens: list[int]) -> list[str]: + def new_decoder(inp_tokens: List[int]) -> List[str]: if (isinstance(inp_tokens, list) and len(inp_tokens) == 1 and isinstance(inp_tokens[0], list)): inp_tokens = inp_tokens[0] diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index 883d4e728abb0..eb9d83acb2867 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -6,7 +6,7 @@ import json import re from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, List import torch from transformers import PreTrainedTokenizerFast @@ -281,7 +281,7 @@ def escape_ebnf_string(s: str) -> str: return re.sub(r'(["\\])', r'\\\1', s) @staticmethod - def choice_as_grammar(choice: list[str] | None) -> str: + def choice_as_grammar(choice: List[str] | None) -> str: if choice is None: raise ValueError("Choice is not set") escaped_choices = (GrammarConfig.escape_ebnf_string(c) for c in choice) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 59adfe3d5c32f..6f933c3fa3c9f 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 from contextlib import contextmanager -from typing import Any, Optional +from typing import Any, Dict, Optional from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.triton_utils import HAS_TRITON -_config: Optional[dict[str, Any]] = None +_config: Optional[Dict[str, Any]] = None @contextmanager @@ -19,7 +19,7 @@ def override_config(config): _config = old_config -def get_config() -> Optional[dict[str, Any]]: +def get_config() -> Optional[Dict[str, Any]]: return _config diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 20fd415d2dd80..00260313e72eb 100644 --- 
a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -3,7 +3,7 @@ import functools import json import os -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, List, Optional, Tuple import torch import triton @@ -578,7 +578,7 @@ def moe_align_block_size( block_size: int, num_experts: int, expert_map: torch.Tensor = None -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Aligns the token distribution across experts to be compatible with block size for matrix multiplication. @@ -676,12 +676,12 @@ def invoke_fused_moe_kernel(A: torch.Tensor, num_tokens_post_padded: torch.Tensor, mul_routed_weight: bool, top_k: int, - config: dict[str, Any], + config: Dict[str, Any], compute_type: tl.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, use_int4_w4a16: bool, - block_shape: Optional[list[int]] = None) -> None: + block_shape: Optional[List[int]] = None) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 @@ -804,7 +804,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, def get_config_file_name(E: int, N: int, dtype: Optional[str], - block_shape: Optional[list[int]] = None) -> str: + block_shape: Optional[List[int]] = None) -> str: device_name = current_platform.get_device_name().replace(" ", "_") dtype_selector = "" if not dtype else f",dtype={dtype}" block_shape_selector = ("" if not block_shape or not all(block_shape) else @@ -820,7 +820,7 @@ def get_moe_configs( dtype: Optional[str], block_n: Optional[int] = None, block_k: Optional[int] = None, -) -> Optional[dict[int, Any]]: +) -> Optional[Dict[int, Any]]: """ Return optimized configurations for the fused MoE kernel. @@ -860,8 +860,8 @@ def get_default_config( topk: int, dtype: Optional[str], is_marlin: bool, - block_shape: Optional[list[int]] = None, -) -> dict[str, int]: + block_shape: Optional[List[int]] = None, +) -> Dict[str, int]: if dtype == "fp8_w8a8" and block_shape is not None: # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0] # BLOCK_SIZE_K must be divisible by block_shape[1] @@ -892,13 +892,13 @@ def get_default_config( def try_get_optimal_moe_config( - w1_shape: tuple[int, ...], - w2_shape: tuple[int, ...], + w1_shape: Tuple[int, ...], + w2_shape: Tuple[int, ...], top_k: int, dtype: Optional[str], M: int, is_marlin: bool = False, - block_shape: Optional[list[int]] = None, + block_shape: Optional[List[int]] = None, ): from vllm.model_executor.layers.fused_moe import get_config override_config = get_config() @@ -1052,7 +1052,7 @@ def inplace_fused_experts(hidden_states: torch.Tensor, w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None) -> None: + block_shape: Optional[List[int]] = None) -> None: fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, activation, use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts, expert_map, @@ -1078,7 +1078,7 @@ def inplace_fused_experts_fake( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None) -> None: + block_shape: Optional[List[int]] = None) -> None: pass @@ -1108,7 +1108,7 @@ def outplace_fused_experts( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: 
Optional[list[int]] = None) -> torch.Tensor: + block_shape: Optional[List[int]] = None) -> torch.Tensor: return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, False, activation, use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts, expert_map, @@ -1134,7 +1134,7 @@ def outplace_fused_experts_fake( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None) -> torch.Tensor: + block_shape: Optional[List[int]] = None) -> torch.Tensor: return torch.empty_like(hidden_states) @@ -1164,7 +1164,7 @@ def fused_experts(hidden_states: torch.Tensor, w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None) -> torch.Tensor: + block_shape: Optional[List[int]] = None) -> torch.Tensor: if inplace: torch.ops.vllm.inplace_fused_experts( @@ -1199,7 +1199,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None): + block_shape: Optional[List[int]] = None): # Check constraints. if use_int4_w4a16: assert hidden_states.shape[1] // 2 == w1.shape[ @@ -1370,7 +1370,7 @@ def fused_moe( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, + block_shape: Optional[List[int]] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -1413,7 +1413,7 @@ def fused_moe( a1. - a2_scale (Optional[torch.Tensor]): Optional scale to be used for a2. - - block_shape: (Optional[list[int]]): Optional block size for block-wise + - block_shape: (Optional[List[int]]): Optional block size for block-wise quantization. 
Returns: diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 99ce694327441..28a88571dab4b 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -2,7 +2,7 @@ from abc import abstractmethod from enum import Enum -from typing import Callable, Optional +from typing import Callable, List, Optional, Tuple import torch from torch.nn.parameter import UninitializedParameter @@ -698,7 +698,7 @@ def forward(self, hidden_states: torch.Tensor, def make_expert_params_mapping( cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str, ckpt_up_proj_name: str, - num_experts: int) -> list[tuple[str, str, int, str]]: + num_experts: int) -> List[Tuple[str, str, int, str]]: return [ # (param_name, weight_name, expert_id, shard_id) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 9d17b1e3044eb..b476fb0dbc7eb 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Custom normalization layers.""" -from typing import Optional, Union +from typing import Optional, Tuple, Union import torch import torch.nn as nn @@ -39,7 +39,7 @@ def forward_native( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" orig_dtype = x.dtype x = x.to(torch.float32) @@ -77,7 +77,7 @@ def forward_cuda( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if self.variance_size_override is not None: return self.forward_native(x, residual) @@ -104,7 +104,7 @@ def forward_hpu( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: from vllm_hpu_extension.ops import HPUFusedRMSNorm if HPUFusedRMSNorm is None: return self.forward_native(x, residual) @@ -123,7 +123,7 @@ def forward_xpu( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if self.variance_size_override is not None: return self.forward_native(x, residual) @@ -173,7 +173,7 @@ def forward_static( variance_epsilon: float, x: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" orig_dtype = x.dtype if residual is not None: @@ -193,7 +193,7 @@ def forward_native( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" return self.forward_static(self.weight.data, self.variance_epsilon, x, residual) @@ -202,7 +202,7 @@ def forward_cuda( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if torch.compiler.is_compiling(): 
return self.forward_native(x, residual) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index c1c582681bb96..b53a540ed6624 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Union +from typing import List, Optional, Tuple, Union import torch from torch import nn @@ -107,7 +107,7 @@ def forward_cuda( self, x: torch.Tensor, gate: torch.Tensor, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if self.tp_size > 1 or self.n_groups != 1: return self.forward_native(x, gate) @@ -139,7 +139,7 @@ def extra_groups_for_head_shards(ngroups: int, tp_size: int): def mamba_v2_sharded_weight_loader( - shard_spec: list[tuple[int, int, float]], + shard_spec: List[Tuple[int, int, float]], tp_size: int, tp_rank: int, ) -> LoaderFunction: diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 8785a5b4cb82d..0012636ef9ffc 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from enum import IntEnum -from typing import Optional, Union +from typing import List, Optional, Union import torch import torch.nn as nn @@ -46,7 +46,7 @@ def from_pooling_type( normalize: bool, softmax: bool, step_tag_id: Optional[int] = None, - returned_token_ids: Optional[list[int]] = None, + returned_token_ids: Optional[List[int]] = None, ) -> "SimplePooler": if pooling_type == PoolingType.LAST: assert step_tag_id is None and returned_token_ids is None @@ -174,7 +174,7 @@ def __init__( normalize: bool, softmax: bool, step_tag_id: Optional[int] = None, - returned_token_ids: Optional[list[int]] = None, + returned_token_ids: Optional[List[int]] = None, ): super().__init__(normalize=normalize, softmax=softmax) @@ -245,7 +245,7 @@ def from_config_with_defaults( normalize: bool, softmax: bool, step_tag_id: Optional[int] = None, - returned_token_ids: Optional[list[int]] = None, + returned_token_ids: Optional[List[int]] = None, ) -> SimplePooler: return SimplePooler.from_pooling_type( pooling_type=PoolingType[pooler_config.pooling_type] diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 93fb964eeea9b..6cd508d057a44 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,9 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import Dict, List, Type + from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -QUANTIZATION_METHODS: list[str] = [ +QUANTIZATION_METHODS: List[str] = [ "aqlm", "awq", "deepspeedfp", @@ -71,7 +73,7 @@ def _wrapper(quant_config_cls): return _wrapper -def get_quantization_config(quantization: str) -> type[QuantizationConfig]: +def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: if quantization not in QUANTIZATION_METHODS: raise ValueError(f"Invalid quantization method: {quantization}") @@ -102,7 +104,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .qqq import QQQConfig from .tpu_int8 import Int8TpuConfig - method_to_config: dict[str, type[QuantizationConfig]] = { + method_to_config: Dict[str, Type[QuantizationConfig]] = { "aqlm": AQLMConfig, "awq": AWQConfig, 
"deepspeedfp": DeepSpeedFPConfig, diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index a2c61e7d0862b..10f5241f9a717 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -4,7 +4,7 @@ # and https://arxiv.org/pdf/2401.06118.pdf import math -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch import torch.nn.functional as F @@ -97,7 +97,7 @@ def generic_dequantize_gemm( codebooks: torch. Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: list[int], + output_partition_sizes: List[int], bias: Optional[torch.Tensor], ) -> torch.Tensor: output_shape = input.shape[:-1] + (scales.shape[0], ) @@ -135,7 +135,7 @@ def optimized_dequantize_gemm( codebooks: torch. Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: list[int], + output_partition_sizes: List[int], bias: Optional[torch.Tensor], ) -> torch.Tensor: weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) @@ -190,7 +190,7 @@ def get_name(cls) -> str: return "aqlm" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half] @classmethod @@ -198,11 +198,11 @@ def get_min_capability(cls) -> int: return 60 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return [] # no extra configs. @classmethod - def from_config(cls, config: dict[str, Any]) -> "AQLMConfig": + def from_config(cls, config: Dict[str, Any]) -> "AQLMConfig": in_group_size = cls.get_from_keys(config, ["in_group_size"]) nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"]) num_code_books = cls.get_from_keys(config, ["num_codebooks"]) @@ -229,7 +229,7 @@ def __init__(self, quant_config: AQLMConfig): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], input_size: int, + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): del output_size # Unused. 
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 1f0f6f7074d43..227be1497d0ec 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch @@ -24,7 +24,7 @@ def __init__( weight_bits: int, group_size: int, zero_point: bool, - modules_to_not_convert: Optional[list[str]] = None, + modules_to_not_convert: Optional[List[str]] = None, ) -> None: super().__init__() self.weight_bits = weight_bits @@ -47,7 +47,7 @@ def __repr__(self) -> str: def get_name(self) -> str: return "awq" - def get_supported_act_dtypes(self) -> list[torch.dtype]: + def get_supported_act_dtypes(self) -> List[torch.dtype]: return [torch.half] @classmethod @@ -56,7 +56,7 @@ def get_min_capability(cls) -> int: return 75 @staticmethod - def get_config_filenames() -> list[str]: + def get_config_filenames() -> List[str]: return [ "quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq @@ -64,7 +64,7 @@ def get_config_filenames() -> list[str]: ] @classmethod - def from_config(cls, config: dict[str, Any]) -> "AWQConfig": + def from_config(cls, config: Dict[str, Any]) -> "AWQConfig": weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) group_size = cls.get_from_keys(config, ["q_group_size", "group_size"]) zero_point = cls.get_from_keys(config, ["zero_point"]) @@ -81,7 +81,7 @@ def get_quant_method(self, layer: torch.nn.Module, return None -def is_layer_skipped_awq(prefix: str, modules_to_not_convert: list[str]): +def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]): return any(module_name in prefix for module_name in modules_to_not_convert) @@ -97,7 +97,7 @@ def __init__(self, quant_config: AWQConfig): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], input_size: int, + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): if input_size_per_partition % self.quant_config.group_size != 0: diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 2f543cd9a765d..473816fcc3ecd 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, List, Optional import torch from torch.nn import Parameter @@ -45,8 +45,8 @@ class AWQMarlinConfig(QuantizationConfig): def __init__(self, weight_bits: int, group_size: int, zero_point: bool, lm_head_quantized: bool, - modules_to_not_convert: Optional[list[str]], - full_config: dict[str, Any]) -> None: + modules_to_not_convert: Optional[List[str]], + full_config: Dict[str, Any]) -> None: super().__init__() self.pack_factor = 32 // weight_bits # packed into int32 self.group_size = group_size @@ -78,7 +78,7 @@ def get_name(cls) -> str: return "awq_marlin" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half, torch.bfloat16] @classmethod @@ -86,11 +86,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> list[str]: + 
def get_config_filenames(cls) -> List[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: dict[str, Any]) -> "AWQMarlinConfig": + def from_config(cls, config: Dict[str, Any]) -> "AWQMarlinConfig": weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) zero_point = cls.get_from_keys(config, ["zero_point"]) @@ -145,7 +145,7 @@ def get_quant_method(self, layer: torch.nn.Module, return None @classmethod - def is_awq_marlin_compatible(cls, quant_config: dict[str, Any]): + def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]): # Extract data from quant config. quant_method = quant_config.get("quant_method", "").lower() num_bits = quant_config.get("bits") @@ -184,7 +184,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 5343e6ca0e773..5ef11546fd41b 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -2,7 +2,7 @@ import inspect from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Type import torch from torch import nn @@ -43,7 +43,7 @@ def process_weights_after_loading(self, layer: nn.Module) -> None: def method_has_implemented_embedding( - method_class: type[QuantizeMethodBase]) -> bool: + method_class: Type[QuantizeMethodBase]) -> bool: """ Not all quant methods have embedding implemented, so we need to check that it exists for our given method. 
We check this by making sure the function @@ -63,7 +63,7 @@ class QuantizationConfig(ABC): def __init__(self): super().__init__() # mapping is updated by models as they initialize - self.packed_modules_mapping: dict[str, list[str]] = dict() + self.packed_modules_mapping: Dict[str, List[str]] = dict() @abstractmethod def get_name(self) -> str: @@ -71,7 +71,7 @@ def get_name(self) -> str: raise NotImplementedError @abstractmethod - def get_supported_act_dtypes(self) -> list[torch.dtype]: + def get_supported_act_dtypes(self) -> List[torch.dtype]: """List of supported activation dtypes.""" raise NotImplementedError @@ -88,13 +88,13 @@ def get_min_capability(cls) -> int: @staticmethod @abstractmethod - def get_config_filenames() -> list[str]: + def get_config_filenames() -> List[str]: """List of filenames to search for in the model directory.""" raise NotImplementedError @classmethod @abstractmethod - def from_config(cls, config: dict[str, Any]) -> "QuantizationConfig": + def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig": """Create a config class from the model's quantization config.""" raise NotImplementedError @@ -110,7 +110,7 @@ def override_quantization_method(cls, hf_quant_cfg, return None @staticmethod - def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any: + def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any: """Get a value from the model's quantization config.""" for key in keys: if key in config: @@ -119,7 +119,7 @@ def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any: "quantization config.") @staticmethod - def get_from_keys_or(config: dict[str, Any], keys: list[str], + def get_from_keys_or(config: Dict[str, Any], keys: List[str], default: Any) -> Any: """Get a optional value from the model's quantization config.""" try: diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index b7d0f0564e660..33c2ca93ffa17 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch @@ -27,7 +27,7 @@ def __init__( bnb_4bit_use_double_quant: bool = False, llm_int8_enable_fp32_cpu_offload: bool = False, llm_int8_has_fp16_weight: bool = False, - llm_int8_skip_modules: Optional[list[str]] = None, + llm_int8_skip_modules: Optional[List[str]] = None, llm_int8_threshold: float = 6.0, ) -> None: super().__init__() @@ -59,7 +59,7 @@ def get_name(self) -> str: return "bitsandbytes" @classmethod - def get_supported_act_dtypes(self) -> list[torch.dtype]: + def get_supported_act_dtypes(self) -> List[torch.dtype]: return [torch.float32, torch.float16, torch.bfloat16] @classmethod @@ -67,13 +67,13 @@ def get_min_capability(cls) -> int: return 70 @staticmethod - def get_config_filenames() -> list[str]: + def get_config_filenames() -> List[str]: return [ "adapter_config.json", ] @classmethod - def from_config(cls, config: dict[str, Any]) -> "BitsAndBytesConfig": + def from_config(cls, config: Dict[str, Any]) -> "BitsAndBytesConfig": def get_safe_value(config, keys, default_value=None): try: @@ -128,7 +128,7 @@ def get_quant_method(self, layer: torch.nn.Module, return None -def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: list[str]): +def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: List[str]): # Split the prefix into its dot-separated components 
components = prefix.split('.') @@ -167,7 +167,7 @@ def __init__(self, quant_config: BitsAndBytesConfig): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], input_size: int, + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): from bitsandbytes.nn import Int8Params diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index d9f54839112da..ce6c706fe3d27 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from contextlib import suppress -from typing import Any, Literal, Optional, cast +from typing import Any, Dict, List, Literal, Optional, Tuple, cast import torch from compressed_tensors.config import (CompressionFormat, @@ -36,20 +36,20 @@ __all__ = ["CompressedTensorsLinearMethod"] SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config" -QUANTIZATION_SCHEME_MAP_TYPE = dict[str, Optional[dict[str, QuantizationArgs]]] +QUANTIZATION_SCHEME_MAP_TYPE = Dict[str, Optional[Dict[str, QuantizationArgs]]] class CompressedTensorsConfig(QuantizationConfig): def __init__( self, - target_scheme_map: dict[str, Any], - ignore: list[str], + target_scheme_map: Dict[str, Any], + ignore: List[str], quant_format: str, - sparsity_scheme_map: dict[str, SparsityCompressionConfig], - sparsity_ignore_list: list[str], - kv_cache_scheme: Optional[dict[str, Any]] = None, - config: Optional[dict[str, Any]] = None, + sparsity_scheme_map: Dict[str, SparsityCompressionConfig], + sparsity_ignore_list: List[str], + kv_cache_scheme: Optional[Dict[str, Any]] = None, + config: Optional[Dict[str, Any]] = None, ): super().__init__() self.ignore = ignore @@ -64,7 +64,7 @@ def __init__( def get_linear_method(self) -> "CompressedTensorsLinearMethod": return CompressedTensorsLinearMethod(self) - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.float16, torch.bfloat16] @classmethod @@ -100,8 +100,8 @@ def get_quant_method( return None @classmethod - def from_config(cls, config: dict[str, Any]) -> "CompressedTensorsConfig": - ignore: list[str] = cast(list[str], config.get("ignore", [])) + def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": + ignore: List[str] = cast(List[str], config.get("ignore", [])) quant_format = cast(str, config.get("format")) target_scheme_map = cls._quantization_scheme_map_from_config( config=config) @@ -119,8 +119,8 @@ def from_config(cls, config: dict[str, Any]) -> "CompressedTensorsConfig": @classmethod def _parse_sparsity_config( - cls, config: dict[str, Any] - ) -> tuple[dict[str, SparsityCompressionConfig], list[str]]: + cls, config: Dict[str, Any] + ) -> Tuple[Dict[str, SparsityCompressionConfig], List[str]]: """ :param config: The `quantization_config` dictionary from config.json :return: A tuple with two elements @@ -133,7 +133,7 @@ def _parse_sparsity_config( sparsity_config = SparsityCompressionConfig.model_validate( sparsity_config) - sparse_scheme_map: dict[str, SparsityCompressionConfig] = { + sparse_scheme_map: Dict[str, SparsityCompressionConfig] = { target: sparsity_config for target in sparsity_config.targets or list() } @@ -142,13 +142,13 @@ def 
_parse_sparsity_config( @classmethod def _quantization_scheme_map_from_config( - cls, config: dict[str, Any]) -> QUANTIZATION_SCHEME_MAP_TYPE: + cls, config: Dict[str, Any]) -> QUANTIZATION_SCHEME_MAP_TYPE: """ :param config: The `quantization_config` dictionary from config.json :return: A dictionary mapping target layer names to their corresponding quantization_args for weights and input activations """ - target_scheme_map: dict[str, Any] = dict() + target_scheme_map: Dict[str, Any] = dict() quant_format = cast(str, config.get("format")) # The quant_config has multiple config_groups, each containing @@ -186,7 +186,7 @@ def _quantization_scheme_map_from_config( return target_scheme_map @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return [] def _check_scheme_supported(self, @@ -531,7 +531,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], input_size: int, + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): """ @@ -577,7 +577,7 @@ def __init__(self, quant_config: CompressedTensorsConfig): super().__init__(quant_config) @staticmethod - def validate_kv_cache_scheme(kv_cache_scheme: Optional[dict[str, Any]]): + def validate_kv_cache_scheme(kv_cache_scheme: Optional[Dict[str, Any]]): """ Validator for the kv cache scheme. Useful for controlling the kv cache quantization schemes, that are being supported in vLLM diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index c9982b7a88e80..c9aa0ec285baf 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -2,7 +2,7 @@ import enum from enum import Enum -from typing import Callable, Optional +from typing import Callable, List, Optional import torch from compressed_tensors import CompressionFormat @@ -417,10 +417,10 @@ def replace_tensor(name, new_t): del new_t def get_scale_perms(num_bits: int): - scale_perm: list[int] = [] + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single: list[int] = [] + scale_perm_single: List[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py index f010bc03418c3..ec805c934e4ae 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, List, Optional, Tuple import torch from compressed_tensors import CompressionFormat, ModelCompressor @@ -31,7 +31,7 @@ def __init__( quantized: bool = False, weight_quant: Optional[QuantizationArgs] = None, input_quant: Optional[QuantizationArgs] = None, - model_compression_config: Optional[dict[str, Any]] = None, + model_compression_config: Optional[Dict[str, Any]] = None, 
): self.quantized = quantized self.weight_quant = weight_quant @@ -53,7 +53,7 @@ def create_weights( self, layer: torch.nn.Module, input_size: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, @@ -327,9 +327,9 @@ def _process_split( ) return sparsity_compressor.decompress_weight(weight_data) - split_weights: list[torch.Tensor] = [] - split_bitmask: list[torch.Tensor] = [] - split_shape: list[tuple[int, int]] = [] + split_weights: List[torch.Tensor] = [] + split_bitmask: List[torch.Tensor] = [] + split_shape: List[Tuple[int, int]] = [] if isinstance(layer, (QKVParallelLinear, MergedColumnParallelLinear)): split_weights = torch.split(compressed, layer.logical_widths) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index 6ea31e50caa72..535ea6b32cfbf 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional +from typing import Callable, List, Optional import torch from torch.nn import Parameter @@ -58,7 +58,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.meta = Parameter(layer.meta.data, requires_grad=False) def create_weights(self, layer: torch.nn.Module, input_size: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py index d5ff04ee3811b..5c8261908735f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional +from typing import Callable, List, Optional import torch from compressed_tensors.quantization import QuantizationStrategy @@ -58,7 +58,7 @@ def process_weights_after_loading(self, layer) -> None: prepare_fp8_layer_for_marlin(layer, strategy="channel") def create_weights(self, layer: torch.nn.Module, input_size: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 37cb2a4e99e02..32072e9fa570f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional +from typing import Callable, List, Optional import torch from compressed_tensors.quantization import QuantizationStrategy @@ -89,7 +89,7 @@ 
def process_weights_after_loading(self, layer) -> None: layer.input_scale = None def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 7792ce86553c6..08d86a4e5ddd2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional +from typing import Callable, List, Optional, Set import torch from compressed_tensors.quantization import QuantizationStrategy @@ -19,7 +19,7 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme): - _kernel_backends_being_used: set[str] = set() + _kernel_backends_being_used: Set[str] = set() def __init__(self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool): @@ -33,7 +33,7 @@ def get_min_capability(cls) -> int: return 75 def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index e19ea17361201..38df09ff39373 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional +from typing import Callable, List, Optional, Set import torch from compressed_tensors.quantization import ActivationOrdering @@ -30,7 +30,7 @@ class CompressedTensorsWNA16(CompressedTensorsScheme): - _kernel_backends_being_used: set[str] = set() + _kernel_backends_being_used: Set[str] = set() def __init__(self, strategy: str, @@ -61,7 +61,7 @@ def get_min_capability(cls) -> int: return 80 def create_weights(self, layer: torch.nn.Module, output_size: int, - input_size: int, output_partition_sizes: list[int], + input_size: int, output_partition_sizes: List[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py index 664697a037009..b69c5e7a02a72 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import Optional, Type import torch import triton @@ -126,7 +126,7 @@ def triton_scaled_mm(input: torch.Tensor, weight: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, - out_dtype: type[torch.dtype], + out_dtype: Type[torch.dtype], bias: Optional[torch.Tensor] = None, block_size_m: int = 32, block_size_n: int = 32, diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index ccd54281ceb7e..85ae1d5cb7878 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import re -from collections.abc import Iterable, Mapping from types import MappingProxyType -from typing import Optional +from typing import Iterable, List, Mapping, Optional from compressed_tensors import CompressionFormat from torch.nn import Module @@ -21,7 +20,7 @@ def is_activation_quantization_format(format: str) -> bool: def should_ignore_layer( layer_name: Optional[str], ignore: Iterable[str] = tuple(), - fused_mapping: Mapping[str, list[str]] = MappingProxyType({}) + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) ) -> bool: if layer_name is None: return False @@ -85,7 +84,7 @@ def find_matched_target( layer_name: Optional[str], module: Module, targets: Iterable[str], - fused_mapping: Mapping[str, list[str]] = MappingProxyType({}) + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) ) -> str: """ Helper function to look up which "target" in the compressed-tensors @@ -172,7 +171,7 @@ def _is_equal_or_regex_match(value: str, def _match_fused_layer( layer_name: str, target_layers: Iterable[str], - fused_mapping: Mapping[str, list[str]]) -> Optional[str]: + fused_mapping: Mapping[str, List[str]]) -> Optional[str]: """ Match a fused layer name to its corresponding individual layer in target_layers. Returns first value in fused_mapping which matches targets @@ -202,7 +201,7 @@ def _match_fused_layer( ] # for each unfused component, find a match in targets - unfused_matches: list[Optional[str]] = [] + unfused_matches: List[Optional[str]] = [] for unfused in unfused_paths: for target in target_layers: if _is_equal_or_regex_match(unfused, target): diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py index e44b25a91b0e4..67934d37284e1 100644 --- a/vllm/model_executor/layers/quantization/deepspeedfp.py +++ b/vllm/model_executor/layers/quantization/deepspeedfp.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch import torch.nn as nn @@ -45,7 +45,7 @@ def get_name(cls) -> str: return "DeepSpeedFP" @classmethod - def from_config(cls, config: dict[str, Any]) -> "DeepSpeedFPConfig": + def from_config(cls, config: Dict[str, Any]) -> "DeepSpeedFPConfig": weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) return cls(weight_bits=weight_bits, group_size=group_size) @@ -54,7 +54,7 @@ def get_linear_method(self) -> "DeepSpeedFPLinearMethod": return DeepSpeedFPLinearMethod(self) @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half, torch.bfloat16] @classmethod @@ -63,7 +63,7 @@ def get_min_capability(cls) -> int: return 60 @staticmethod - def get_config_filenames() -> list[str]: + def get_config_filenames() -> List[str]: return [ "quant_config.json", "quantize_config.json", @@ -90,7 +90,7 @@ def __init__(self, quant_config: DeepSpeedFPConfig): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], + output_partition_sizes: 
List[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index cfa8ae7c00644..d18ca55afebdb 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, List, Optional import torch @@ -24,7 +24,7 @@ def get_name(cls) -> str: return "experts_int8" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.bfloat16, torch.half] @classmethod @@ -32,11 +32,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return [] @classmethod - def from_config(cls, config: dict[str, Any]) -> "ExpertsInt8Config": + def from_config(cls, config: Dict[str, Any]) -> "ExpertsInt8Config": return cls() def get_quant_method(self, layer: torch.nn.Module, diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 3fc3f6c677be9..20f2c3da600d7 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch from torch.nn import Module @@ -29,7 +29,7 @@ class FBGEMMFp8Config(QuantizationConfig): """Config class for FBGEMM Fp8.""" - def __init__(self, ignore_list: list[str], input_scale_ub: float): + def __init__(self, ignore_list: List[str], input_scale_ub: float): super().__init__() self.ignore_list = ignore_list if ignore_list else [] self.input_scale_ub = input_scale_ub @@ -43,7 +43,7 @@ def get_name(cls) -> str: return "fbgemm_fp8" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.bfloat16, torch.float16] @classmethod @@ -51,11 +51,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return [] @classmethod - def from_config(cls, config: dict[str, Any]) -> "FBGEMMFp8Config": + def from_config(cls, config: Dict[str, Any]) -> "FBGEMMFp8Config": ignore_list = cls.get_from_keys(config, ["modules_to_not_convert"]) input_scale_ub = cls.get_from_keys(config, ["activation_scale_ub"]) return cls(ignore_list=ignore_list, input_scale_ub=input_scale_ub) @@ -79,7 +79,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e3fca559c397e..a705f63be4acb 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, List, Optional import torch import torch.nn.functional as F @@ -45,8 +45,8 @@ def __init__( self, is_checkpoint_fp8_serialized: bool = False, activation_scheme: 
str = "dynamic", - ignored_layers: Optional[list[str]] = None, - weight_block_size: Optional[list[int]] = None, + ignored_layers: Optional[List[str]] = None, + weight_block_size: Optional[List[int]] = None, ) -> None: super().__init__() self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized @@ -78,7 +78,7 @@ def get_name(cls) -> str: return "fp8" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.bfloat16, torch.half] @classmethod @@ -86,11 +86,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return [] @classmethod - def from_config(cls, config: dict[str, Any]) -> "Fp8Config": + def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": quant_method = cls.get_from_keys(config, ["quant_method"]) is_checkpoint_fp8_serialized = ("fp8" in quant_method) activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) @@ -157,7 +157,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 19f0fc9717a29..ba176e4a567cc 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, List, Optional import gguf import torch @@ -31,7 +31,7 @@ def __repr__(self) -> str: def get_name(self) -> str: return "gguf" - def get_supported_act_dtypes(self) -> list[torch.dtype]: + def get_supported_act_dtypes(self) -> List[torch.dtype]: return [torch.half] @classmethod @@ -39,11 +39,11 @@ def get_min_capability(cls) -> int: return 60 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return [] # no extra configs. 
@classmethod - def from_config(cls, config: dict[str, Any]) -> "GGUFConfig": + def from_config(cls, config: Dict[str, Any]) -> "GGUFConfig": return cls() def get_quant_method(self, layer: torch.nn.Module, @@ -131,7 +131,7 @@ def __init__(self, quant_config: GGUFConfig): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], input_size: int, + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): output_size_per_partition = sum(output_partition_sizes) @@ -332,7 +332,7 @@ def embedding(self, layer: torch.nn.Module, class GGUFUninitializedParameter(UninitializedParameter): cls_to_become = Parameter - data_container: list[torch.Tensor] + data_container: List[torch.Tensor] def materialize_nested(self) -> Parameter: dtype = {data.dtype for data in self.data_container} diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index c057a4bdabe7d..1c8d6cb1ea79a 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -3,7 +3,7 @@ import enum from enum import Enum from fractions import Fraction -from typing import Any, Optional, Union +from typing import Any, Dict, List, Optional, Union import torch from torch.nn.parameter import Parameter @@ -33,11 +33,11 @@ def __init__( group_size: int, desc_act: bool, lm_head_quantized: bool, - dynamic: dict[str, dict[str, Union[int, bool]]], + dynamic: Dict[str, Dict[str, Union[int, bool]]], ) -> None: # GPTQModel use `dynamic` config property to allow per module # quantization config so each module can be individually optimized. - # Format is dict[str, dict] where key is a regex string that can + # Format is Dict[str, Dict] where key is a regex string that can # perform both positive ("+:" prefixed) or negative ("-:" prefixed) # matching of a module. 
# Default to positive match, override base quant config mode, if no @@ -83,7 +83,7 @@ def get_name(cls) -> str: return "gptq" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half] @classmethod @@ -92,11 +92,11 @@ def get_min_capability(cls) -> int: return 60 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: dict[str, Any]) -> "GPTQConfig": + def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig": dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) dynamic = {} if dynamic is None else dynamic @@ -134,7 +134,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index a826a7b5be4b3..21db8ccba059c 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Set, Union import torch @@ -44,8 +44,8 @@ class GPTQMarlinConfig(QuantizationConfig): def __init__(self, weight_bits: int, group_size: int, desc_act: bool, is_sym: bool, lm_head_quantized: bool, - dynamic: dict[str, dict[str, Union[int, bool]]], - full_config: dict[str, Any]) -> None: + dynamic: Dict[str, Dict[str, Union[int, bool]]], + full_config: Dict[str, Any]) -> None: super().__init__() if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False @@ -54,7 +54,7 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, # GPTQModel use `dynamic` config property to allow per module # quantization config so each module can be individually optimized. - # Format is dict[str, dict] where key is a regex string that can + # Format is Dict[str, Dict] where key is a regex string that can # perform both positive ("+:" prefixed) or negative ("-:" prefixed) # matching of a module. 
# Default to positive match, override base quant config mode, if no @@ -104,7 +104,7 @@ def get_name(cls) -> str: return "gptq_marlin" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half, torch.bfloat16] @classmethod @@ -112,11 +112,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: dict[str, Any]) -> "GPTQMarlinConfig": + def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) dynamic = {} if dynamic is None else dynamic @@ -163,7 +163,7 @@ def get_quant_method(self, layer: torch.nn.Module, GPTQMarlinLinearMethod) @classmethod - def is_gptq_marlin_compatible(cls, quant_config: dict[str, Any]): + def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): quant_method = quant_config.get("quant_method", "").lower() num_bits = quant_config.get("bits") group_size = quant_config.get("group_size") @@ -195,7 +195,7 @@ class GPTQMarlinLinearMethod(LinearMethodBase): quant_config: The GPTQ Marlin quantization config. """ - _kernel_backends_being_used: set[str] = set() + _kernel_backends_being_used: Set[str] = set() def __init__(self, quant_config: GPTQMarlinConfig) -> None: self.quant_config = quant_config @@ -208,7 +208,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py index be97fb2f176fc..dd747e182e289 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch from torch.nn.parameter import Parameter @@ -89,7 +89,7 @@ def get_name(cls) -> str: return "gptq_marlin_24" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half] @classmethod @@ -98,11 +98,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: dict[str, Any]) -> "GPTQMarlin24Config": + def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlin24Config": weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) return cls(weight_bits, group_size) @@ -145,7 +145,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py index f42212b90387f..4edc9aa848a19 100644 --- a/vllm/model_executor/layers/quantization/hqq_marlin.py +++ b/vllm/model_executor/layers/quantization/hqq_marlin.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from 
typing import Any, Dict, List, Optional import torch @@ -31,7 +31,7 @@ def __init__( self, weight_bits: int, group_size: int, - skip_modules: Optional[list[str]] = None, + skip_modules: Optional[List[str]] = None, ) -> None: super().__init__() assert group_size == 64, ("The only supported HQQ group size is " @@ -54,7 +54,7 @@ def get_name(cls) -> str: return "hqq" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half, torch.bfloat16] @classmethod @@ -62,11 +62,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: dict[str, Any]) -> "HQQMarlinConfig": + def from_config(cls, config: Dict[str, Any]) -> "HQQMarlinConfig": wq_params = (config["quant_config"]["weight_quant_params"]) weight_bits = cls.get_from_keys(wq_params, ["nbits"]) group_size = cls.get_from_keys(wq_params, ["group_size"]) @@ -191,7 +191,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 8a39c6edb0e8a..c09cc13cb276b 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch @@ -31,7 +31,7 @@ def __init__( method: str, weight_bits: int, group_size: int, - modules_to_not_convert: Optional[list[str]] = None, + modules_to_not_convert: Optional[List[str]] = None, desc_act: Optional[bool] = None, lm_head_quantized: Optional[bool] = None, ) -> None: @@ -62,7 +62,7 @@ def get_name(cls) -> str: return "ipex" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.bfloat16, torch.float16] @classmethod @@ -70,14 +70,14 @@ def get_min_capability(cls) -> int: return -1 @staticmethod - def get_config_filenames() -> list[str]: + def get_config_filenames() -> List[str]: return [ "quant_config.json", "quantize_config.json", ] @classmethod - def from_config(cls, config: dict[str, Any]) -> "IPEXConfig": + def from_config(cls, config: Dict[str, Any]) -> "IPEXConfig": method = cls.get_from_keys(config, ["quant_method"]).lower() if method == "awq": weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index 55ad00b1cf461..c06befaf3b5ad 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Callable, Optional +from typing import Callable, Optional, Tuple import torch @@ -12,8 +12,8 @@ @dataclass class MPLinearLayerConfig: - full_weight_shape: tuple[int, int] # [in, out] - partition_weight_shape: tuple[int, int] + full_weight_shape: Tuple[int, int] # [in, out] + 
partition_weight_shape: Tuple[int, int] weight_type: ScalarType act_type: torch.dtype group_size: int @@ -31,7 +31,7 @@ def get_min_capability(cls) -> int: @classmethod @abstractmethod def can_implement(cls, - c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]: + c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]: raise NotImplementedError def __init__(self, @@ -75,7 +75,7 @@ def _transform_param(self, layer: torch.nn.Module, name: Optional[str], torch.nn.Parameter(new_param.data, requires_grad=False)) def _get_weight_params( - self, layer: torch.nn.Module) -> tuple[ + self, layer: torch.nn.Module) -> Tuple[ torch.Tensor, # w_q torch.Tensor, # w_s Optional[torch.Tensor], # w_zp, diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py index 537553e7d3d70..bcfdb16777166 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import List, Optional, Type import vllm.envs as envs from vllm.model_executor.layers.quantization.kernels.mixed_precision.exllama import ( # noqa: E501 @@ -14,7 +14,7 @@ from vllm.platforms import current_platform # in priority/performance order (when available) -_POSSIBLE_KERNELS: list[type[MPLinearKernel]] = [ +_POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [ MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel, @@ -23,7 +23,7 @@ def choose_mp_linear_kernel( config: MPLinearLayerConfig, - compute_capability: Optional[int] = None) -> type[MPLinearKernel]: + compute_capability: Optional[int] = None) -> Type[MPLinearKernel]: """ Choose an MPLinearKernel that can implement the given config for the given compute capability. Attempts to choose the best kernel in terms of @@ -40,7 +40,7 @@ def choose_mp_linear_kernel( ValueError: If no kernel can implement the given config. Returns: - type[MPLinearKernel]: Chosen kernel. + Type[MPLinearKernel]: Chosen kernel. 
""" if compute_capability is None: if current_platform is None: diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py index 50d293cf415bf..2706fbb539ab4 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import Optional, Tuple import torch @@ -25,7 +25,7 @@ def get_min_capability(cls) -> int: @classmethod def can_implement(cls, - c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]: + c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]: if c.has_g_idx and\ c.partition_weight_shape[0] != c.full_weight_shape[0]: return False, "Act reordering currently not supported by Exllama, "\ diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py index 2dec4ace92f09..3f0586f6e30d6 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from functools import partial -from typing import Optional +from typing import Optional, Tuple import torch @@ -25,7 +25,7 @@ def get_min_capability(cls) -> int: @classmethod def can_implement(cls, - c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]: + c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]: if c.has_g_idx and\ c.partition_weight_shape[0] != c.full_weight_shape[0]: return False, "Act reordering currently not supported by Machete, "\ diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py index ed8a31b318393..e21801cf6a785 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import Optional, Tuple import torch @@ -24,7 +24,7 @@ def get_min_capability(cls) -> int: @classmethod def can_implement(cls, - c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]: + c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]: if c.zero_points: return False, "Zero points currently not supported by "\ " MarlinLinearKernel. 
Will be added when AWQMarlin "\ diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index 2d92af74bbf9a..91e7654053f9d 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Optional +from typing import Optional, Tuple import torch @@ -24,7 +24,7 @@ def get_min_capability(cls) -> int: @classmethod @abstractmethod def can_implement( - cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: + cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: raise NotImplementedError def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str, @@ -50,7 +50,7 @@ def apply_weights(self, raise NotImplementedError def _get_weight_params( - self, layer: torch.nn.Module) -> tuple[ + self, layer: torch.nn.Module) -> Tuple[ torch.Tensor, # weight torch.Tensor, # weight_scale Optional[torch.Tensor], # input_scale, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 3ac2fd4ed9b43..a5967995ac88d 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Optional +from typing import Dict, List, Optional, Type from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import ( CutlassScaledMMLinearKernel) @@ -14,7 +14,7 @@ from vllm.platforms import PlatformEnum, current_platform # in priority/performance order (when available) -_POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = { +_POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = { PlatformEnum.CPU: [CutlassScaledMMLinearKernel], PlatformEnum.CUDA: [CutlassScaledMMLinearKernel], PlatformEnum.ROCM: [TritonScaledMMLinearKernel], @@ -25,7 +25,7 @@ def choose_scaled_mm_linear_kernel( config: ScaledMMLinearLayerConfig, compute_capability: Optional[int] = None -) -> type[ScaledMMLinearKernel]: +) -> Type[ScaledMMLinearKernel]: """ Choose an ScalledMMLinearKernel that can implement the given config for the given compute capability. Attempts to choose the best kernel in terms of @@ -42,7 +42,7 @@ def choose_scaled_mm_linear_kernel( ValueError: If no kernel can implement the given config. Returns: - type[ScaledMMLinearKernel]: Chosen kernel. + Type[ScaledMMLinearKernel]: Chosen kernel. 
""" if compute_capability is None: diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py index 245f6635cf85a..2bf21a05c46d9 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import Optional, Tuple import torch @@ -22,7 +22,7 @@ def get_min_capability(cls) -> int: @classmethod def can_implement( - cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: + cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: if (not current_platform.is_cuda() and not current_platform.is_cpu()): return False, "CutlassScaledMM requires running on CUDA or CPU." diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py index c09ca83d01cbb..5da5df8efaeb0 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import Optional, Tuple import torch @@ -18,7 +18,7 @@ def get_min_capability(cls) -> int: @classmethod def can_implement( - cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: + cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: if current_platform.is_cpu(): return ( False, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py index ab27f49115c26..0bf090d7fab3c 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import warnings -from typing import Optional +from typing import Optional, Tuple import torch from functorch.experimental.control_flow import cond # noqa: F401 @@ -25,7 +25,7 @@ def get_min_capability(cls) -> int: @classmethod def can_implement( - cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: + cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: if not current_platform.is_tpu(): return False, "ScaledMMXLA requires running on TPU." 
diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 284abeea912e6..4cf0c677c0794 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch from torch.nn.parameter import Parameter @@ -67,7 +67,7 @@ def get_name(cls) -> str: return "marlin" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half] @classmethod @@ -76,11 +76,11 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: dict[str, Any]) -> "MarlinConfig": + def from_config(cls, config: Dict[str, Any]) -> "MarlinConfig": group_size = cls.get_from_keys(config, ["group_size"]) lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) @@ -127,7 +127,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 4223bf3cb6378..36711a7a5098b 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch from torch.nn import Module @@ -39,7 +39,7 @@ def get_name(cls) -> str: return "modelopt" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.bfloat16, torch.half] @classmethod @@ -47,11 +47,11 @@ def get_min_capability(cls) -> int: return 89 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return ["hf_quant_config.json"] @classmethod - def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": + def from_config(cls, config: Dict[str, Any]) -> "ModelOptFp8Config": quant_config = cls.get_from_keys(config, ["quantization"]) quant_method = quant_config["quant_algo"] is_checkpoint_fp8_serialized = ("FP8" in quant_method) @@ -101,7 +101,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 99c46007ca6e7..a3adac1bb129b 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, List, Optional import torch @@ -22,8 +22,8 @@ class MoeWNA16Config(QuantizationConfig): def __init__(self, linear_quant_method: str, weight_bits: int, group_size: int, has_zp: bool, lm_head_quantized: bool, - modules_to_not_convert: Optional[list[str]], - full_config: dict[str, Any]) -> None: + 
modules_to_not_convert: Optional[List[str]], + full_config: Dict[str, Any]) -> None: super().__init__() self.weight_bits = weight_bits self.group_size = group_size @@ -68,7 +68,7 @@ def get_name(cls) -> str: return "moe_wna16" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.bfloat16, torch.half] @classmethod @@ -76,11 +76,11 @@ def get_min_capability(cls) -> int: return 70 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return ["quantize_config.json"] @classmethod - def from_config(cls, config: dict[str, Any]) -> "MoeWNA16Config": + def from_config(cls, config: Dict[str, Any]) -> "MoeWNA16Config": linear_quant_method = cls.get_from_keys(config, ["quant_method"]) weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) @@ -108,7 +108,7 @@ def override_quantization_method(cls, hf_quant_cfg, return None @classmethod - def is_moe_wna16_compatible(cls, quant_config: dict[str, Any]): + def is_moe_wna16_compatible(cls, quant_config: Dict[str, Any]): # Extract data from quant config. quant_method = quant_config.get("quant_method", "").lower() num_bits = quant_config.get("bits") @@ -162,7 +162,7 @@ def get_quant_method(self, layer: torch.nn.Module, return None -def is_layer_skipped_quant(prefix: str, modules_to_not_convert: list[str]): +def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]): return any(module_name in prefix for module_name in modules_to_not_convert) diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index 325ea71871f99..f6f66803f8169 100644 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -2,7 +2,7 @@ import os from importlib.util import find_spec -from typing import Any, Optional +from typing import Any, Dict, List, Optional from torch.nn import Module @@ -33,7 +33,7 @@ def __init__( def get_name(self) -> str: return "neuron_quant" - def get_supported_act_dtypes(self) -> list[str]: + def get_supported_act_dtypes(self) -> List[str]: return SUPPORTED_QUANT_DTYPE_LIST @classmethod @@ -42,11 +42,11 @@ def get_min_capability(cls) -> int: "This function should not be called with Neuron Backend") @staticmethod - def get_config_filenames() -> list[str]: + def get_config_filenames() -> List[str]: return [] @classmethod - def from_config(cls, config: dict[str, Any]) -> "NeuronQuantConfig": + def from_config(cls, config: Dict[str, Any]) -> "NeuronQuantConfig": quantize_method = cls.get_from_keys(config, ["quantize_method"]) dequant_dtype = cls.get_from_keys(config, ["dequant_dtype"]) return cls(dequant_dtype=dequant_dtype, diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index 95cfbca8d05c8..1ded5389e5f45 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch from torch.nn.parameter import Parameter @@ -31,7 +31,7 @@ class PTPCFp8Config(Fp8Config): def __init__( self, activation_scheme: str = "dynamic", - ignored_layers: Optional[list[str]] = None, + ignored_layers: Optional[List[str]] = None, ) -> None: if not current_platform.is_rocm(): raise 
ValueError( @@ -54,7 +54,7 @@ def get_name(cls) -> str: return "ptpc_fp8" @classmethod - def from_config(cls, config: dict[str, Any]) -> "PTPCFp8Config": + def from_config(cls, config: Dict[str, Any]) -> "PTPCFp8Config": activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) return cls(activation_scheme=activation_scheme, diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py index 58d2adbed35c9..1e05917a5187b 100644 --- a/vllm/model_executor/layers/quantization/qqq.py +++ b/vllm/model_executor/layers/quantization/qqq.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any, Dict, List, Optional import torch from torch.nn.parameter import Parameter @@ -88,7 +88,7 @@ def get_name(cls) -> str: return "qqq" @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half] @classmethod @@ -96,7 +96,7 @@ def get_min_capability(cls) -> int: return 80 @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: """List of filenames to search for in the model directory.""" return [ "quant_config.json", @@ -104,7 +104,7 @@ def get_config_filenames(cls) -> list[str]: ] @classmethod - def from_config(cls, config: dict[str, Any]) -> "QQQConfig": + def from_config(cls, config: Dict[str, Any]) -> "QQQConfig": weight_bits = cls.get_from_keys(config, ["wbits"]) group_size = cls.get_from_keys(config, ["group_size"]) return cls(weight_bits, group_size) @@ -130,7 +130,7 @@ def create_weights( self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index c512393774c59..ca71da8b736a5 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -2,7 +2,7 @@ import fnmatch import re -from typing import Any, Optional, cast +from typing import Any, Dict, List, Optional, cast import torch @@ -26,9 +26,9 @@ class QuarkConfig(QuantizationConfig): def __init__(self, - quant_config: dict[str, Any], - kv_cache_group: Optional[list[str]] = None, - kv_cache_config: Optional[dict[str, Any]] = None, + quant_config: Dict[str, Any], + kv_cache_group: Optional[List[str]] = None, + kv_cache_config: Optional[Dict[str, Any]] = None, pack_method: str = "reorder"): super().__init__() if kv_cache_group is None: @@ -41,7 +41,7 @@ def __init__(self, def get_linear_method(self) -> "QuarkLinearMethod": return QuarkLinearMethod(self) - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.float16, torch.bfloat16] @classmethod @@ -56,7 +56,7 @@ def get_quant_method(self, layer: torch.nn.Module, from vllm.attention.layer import Attention # Avoid circular import # Check if the layer is skipped for quantization. 
- exclude_layers = cast(list[str], self.quant_config.get("exclude")) + exclude_layers = cast(List[str], self.quant_config.get("exclude")) if should_ignore_layer(prefix, ignore=exclude_layers, fused_mapping=self.packed_modules_mapping): @@ -74,12 +74,12 @@ def get_quant_method(self, layer: torch.nn.Module, return None @classmethod - def from_config(cls, config: dict[str, Any]) -> "QuarkConfig": + def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig": export_config = config.get("export") if export_config is None: raise ValueError("The export key should be included in " "the configurations of Quark quantized model") - kv_cache_group = cast(list[str], export_config.get("kv_cache_group")) + kv_cache_group = cast(List[str], export_config.get("kv_cache_group")) pack_method = cast(str, export_config.get("pack_method")) # In the export model of quark, the quantization configuration @@ -91,7 +91,7 @@ def from_config(cls, config: dict[str, Any]) -> "QuarkConfig": kv_cache_config = None else: kv_cache_set = set(kv_cache_group) - layer_quant_config = cast(dict[str, Any], + layer_quant_config = cast(Dict[str, Any], config.get("layer_quant_config")) layer_quant_names = list(layer_quant_config.keys()) layer_quant_set = set(layer_quant_names) @@ -104,7 +104,7 @@ def from_config(cls, config: dict[str, Any]) -> "QuarkConfig": "configuration.") q_configs = [ - cast(dict[str, Any], layer_quant_config.get(name)) + cast(Dict[str, Any], layer_quant_config.get(name)) for name in kv_cache_group ] if not all( @@ -131,7 +131,7 @@ def from_config(cls, config: dict[str, Any]) -> "QuarkConfig": pack_method=pack_method) @classmethod - def get_config_filenames(cls) -> list[str]: + def get_config_filenames(cls) -> List[str]: return [] def _check_scheme_supported(self, @@ -151,8 +151,8 @@ def _check_scheme_supported(self, else: return False - def _is_fp8_w8a8(self, weight_quant: Optional[dict[str, Any]], - input_quant: Optional[dict[str, Any]]) -> bool: + def _is_fp8_w8a8(self, weight_quant: Optional[Dict[str, Any]], + input_quant: Optional[Dict[str, Any]]) -> bool: # Confirm weights and input quantized. if weight_quant is None or input_quant is None: return False @@ -176,8 +176,8 @@ def _is_fp8_w8a8(self, weight_quant: Optional[dict[str, Any]], is_per_tensor_activation = (input_quant.get("qscheme") == "per_tensor") return is_per_tensor_activation - def _is_static_tensor_w8a8(self, weight_quant: Optional[dict[str, Any]], - input_quant: Optional[dict[str, Any]]) -> bool: + def _is_static_tensor_w8a8(self, weight_quant: Optional[Dict[str, Any]], + input_quant: Optional[Dict[str, Any]]) -> bool: # Confirm weights and input quantized. 
if weight_quant is None or input_quant is None: return False @@ -199,7 +199,7 @@ def _is_static_tensor_w8a8(self, weight_quant: Optional[dict[str, Any]], return is_int8_dtype and is_tensor and is_weight_symmetric and is_static def _find_matched_config(self, layer_name: str, - module: torch.nn.Module) -> dict[str, Any]: + module: torch.nn.Module) -> Dict[str, Any]: proj_name = layer_name.split(".")[-1] if proj_name in self.packed_modules_mapping: @@ -224,29 +224,29 @@ def _find_matched_config(self, layer_name: str, return shard_configs[0] else: layer_quant_config = cast( - dict[str, Any], self.quant_config.get("layer_quant_config")) + Dict[str, Any], self.quant_config.get("layer_quant_config")) for name_pattern in layer_quant_config: if fnmatch.fnmatch(layer_name, name_pattern): return layer_quant_config[name_pattern] layer_type = cast(str, type(module)) layer_type_quant_config = cast( - dict[str, Any], + Dict[str, Any], self.quant_config.get("layer_type_quant_config")) if layer_type in layer_type_quant_config: return layer_type_quant_config[layer_type] global_quant_config = cast( - dict[str, Any], self.quant_config.get("global_quant_config")) + Dict[str, Any], self.quant_config.get("global_quant_config")) return global_quant_config - def _get_scheme_from_config(self, config: dict[str, Any]) -> "QuarkScheme": + def _get_scheme_from_config(self, config: Dict[str, Any]) -> "QuarkScheme": if config.get("output_tensors") or config.get("bias"): raise NotImplementedError( "Currently, Quark models with output_tensors " "and bias quantized are not supported") - weight_config = cast(dict[str, Any], config.get("weight")) - input_config = cast(dict[str, Any], config.get("input_tensors")) + weight_config = cast(Dict[str, Any], config.get("weight")) + input_config = cast(Dict[str, Any], config.get("input_tensors")) if self._is_fp8_w8a8(weight_config, input_config): is_fp8_w8a8_supported = self._check_scheme_supported( @@ -323,7 +323,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], input_size: int, + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): """ @@ -367,7 +367,7 @@ def __init__(self, quant_config: QuarkConfig): super().__init__(quant_config) @staticmethod - def validate_kv_cache_config(kv_cache_config: Optional[dict[str, Any]]): + def validate_kv_cache_config(kv_cache_config: Optional[Dict[str, Any]]): """ Validator for the kv cache configuration. 
Useful for controlling the kv cache quantization schemes, that are being supported in vLLM diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 1ae3fc937a28d..32dce5aaf5e07 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, Optional import torch @@ -45,7 +45,7 @@ def get_moe_method( class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): - def __init__(self, weight_config: dict[str, Any], input_config: dict[str, + def __init__(self, weight_config: Dict[str, Any], input_config: Dict[str, Any]): self.weight_quant = weight_config self.input_quant = input_config diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py index 221d3c93b5fb8..c885e98a4d66e 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional +from typing import Callable, List, Optional import torch from torch.nn import Parameter @@ -83,7 +83,7 @@ def process_weights_after_loading(self, layer) -> None: layer.input_scale = None def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py index f3dc4ab705764..1bf34b098938c 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Optional +from typing import Callable, List, Optional, Set import torch @@ -17,7 +17,7 @@ class QuarkW8A8Int8(QuarkScheme): - _kernel_backends_being_used: set[str] = set() + _kernel_backends_being_used: Set[str] = set() def __init__(self, qscheme: str, is_static_input_scheme: Optional[bool], input_symmetric: Optional[bool]): @@ -31,7 +31,7 @@ def get_min_capability(cls) -> int: return 75 def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: list[int], + output_partition_sizes: List[int], input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py index d1d293b017914..17e0df021085a 100644 --- a/vllm/model_executor/layers/quantization/quark/utils.py +++ b/vllm/model_executor/layers/quantization/quark/utils.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import re -from collections.abc import Iterable, Mapping from types import MappingProxyType -from typing import Any, Optional +from typing import Any, Iterable, List, Mapping, Optional def deep_compare(dict1: Any, dict2: Any) -> bool: @@ -22,7 +21,7 @@ def deep_compare(dict1: Any, dict2: Any) -> bool: def should_ignore_layer( layer_name: Optional[str], ignore: Iterable[str], - fused_mapping: Mapping[str, list[str]] = 
MappingProxyType({}) + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) ) -> bool: if layer_name is None: return False diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py index c0be40c16affc..026881f2dbaac 100644 --- a/vllm/model_executor/layers/quantization/schema.py +++ b/vllm/model_executor/layers/quantization/schema.py @@ -12,7 +12,7 @@ scaling factors. """ -from typing import Optional +from typing import Dict, Optional from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator @@ -23,7 +23,7 @@ class KVCacheQuantSchema(BaseModel): # layer indices to their per-tensor KV cache scaling factor. # TODO: Consider pulling this and its validation methods out into its # own schema class (tricky as its members are variable) - scaling_factor: dict[int, dict[int, float]] + scaling_factor: Dict[int, Dict[int, float]] @model_validator(mode="after") def check_is_fp8(self) -> "KVCacheQuantSchema": diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py index a7c2b623ddea1..14e5bcf6e5bbe 100644 --- a/vllm/model_executor/layers/quantization/tpu_int8.py +++ b/vllm/model_executor/layers/quantization/tpu_int8.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Tuple import torch from torch.nn import Module @@ -30,7 +30,7 @@ def __init__( def get_name(self) -> str: return "tpu_int8" - def get_supported_act_dtypes(self) -> list[torch.dtype]: + def get_supported_act_dtypes(self) -> List[torch.dtype]: return [torch.float16, torch.bfloat16] @classmethod @@ -39,11 +39,11 @@ def get_min_capability(cls) -> int: "This function should not be called with TPU Backend") @staticmethod - def get_config_filenames() -> list[str]: + def get_config_filenames() -> List[str]: return [] @classmethod - def from_config(cls, config: dict[str, Any]) -> "Int8TpuConfig": + def from_config(cls, config: Dict[str, Any]) -> "Int8TpuConfig": activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) return cls(activation_scheme=activation_scheme) @@ -61,7 +61,7 @@ def __init__(self, quant_config: Int8TpuConfig): self.quant_config = quant_config def create_weights(self, layer: Module, input_size_per_partition: int, - output_partition_sizes: list[int], input_size: int, + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): @@ -76,7 +76,7 @@ def create_weights(self, layer: Module, input_size_per_partition: int, layer.register_parameter("weight", weight) def _quantize_weight( - self, weight: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + self, weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: weight_dtype = weight.dtype weight = weight.cpu().to(torch.float32) n_bit = 8 diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 2aaee820988a0..7d91d2cf1c6e8 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -4,7 +4,7 @@ import functools import json import os -from typing import Any, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union import torch import triton @@ -35,7 +35,7 @@ def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool: def apply_w8a8_block_fp8_linear( input: torch.Tensor, weight: torch.Tensor, - block_size: 
list[int], + block_size: List[int], weight_scale: torch.Tensor, input_scale: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, @@ -85,7 +85,7 @@ def apply_w8a8_block_fp8_linear( def apply_w8a8_block_fp8_linear_fake( input: torch.Tensor, weight: torch.Tensor, - block_size: list[int], + block_size: List[int], weight_scale: torch.Tensor, input_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -108,8 +108,8 @@ def apply_fp8_linear_generic( input: torch.Tensor, weight: torch.Tensor, weight_scale: torch.Tensor, - input_group_shape: tuple[int, int], - weight_group_shape: tuple[int, int], + input_group_shape: Tuple[int, int], + weight_group_shape: Tuple[int, int], input_scale: Optional[torch.Tensor] = None, # static scale if one cutlass_fp8_supported: bool = CUTLASS_FP8_SUPPORTED, cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, @@ -146,7 +146,7 @@ def is_dim_blocked(dim, shape, group_shape): def input_to_float8( x: torch.Tensor, dtype: Optional[torch.dtype] = None -) -> tuple[torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor]: """This function quantizes input values to float8 values " "with tensor-wise quantization.""" if dtype is None: @@ -163,7 +163,7 @@ def input_to_float8( def block_quant_to_tensor_quant( x_q_block: torch.Tensor, x_s: torch.Tensor, -) -> tuple[torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor]: """This function converts block-wise quantization to tensor-wise quantization. The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale and the block size. @@ -281,7 +281,7 @@ def per_token_group_quant_fp8( eps: float = 1e-10, dtype: Optional[torch.dtype] = None, column_major_scales: bool = False, -) -> tuple[torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor]: """Function to perform per-token-group quantization on an input tensor `x`. It converts the tensor values into signed float8 values and returns the quantized tensor along with the scaling factor used for quantization. @@ -292,7 +292,7 @@ def per_token_group_quant_fp8( dtype: The dype of output tensor. Note that only `torch.float8_e4m3fn` is supported for now. Returns: - tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the + Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization. """ if dtype is None: @@ -448,7 +448,7 @@ def _w8a8_block_fp8_matmul( @functools.lru_cache def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int, - block_k: int) -> Optional[dict[int, Any]]: + block_k: int) -> Optional[Dict[int, Any]]: """ Return optimized configurations for the w8a8 block fp8 kernel. 
The return value will be a dictionary that maps an irregular grid of @@ -488,7 +488,7 @@ def w8a8_block_fp8_matmul( B: torch.Tensor, As: torch.Tensor, Bs: torch.Tensor, - block_size: list[int], + block_size: List[int], output_dtype: torch.dtype = torch.float16, ) -> torch.Tensor: """This function performs matrix multiplication with block-wise diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py index ff7a8169e6fbc..5b0e6299f4739 100644 --- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import re from copy import deepcopy -from typing import Optional, Union +from typing import Dict, Optional, Union import torch @@ -52,7 +52,7 @@ def get_dynamic_override( layer_name: str, key: Optional[str] = None, default_value: Union[int, bool, - None] = None) -> Union[dict, int, bool, None]: + None] = None) -> Union[Dict, int, bool, None]: for pattern, pattern_dict in config.dynamic.items(): # Negative match: matched modules are excluded from quantized init if pattern.startswith("-:"): diff --git a/vllm/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py index 6d840b5686123..cb7d49ed6f1ca 100644 --- a/vllm/model_executor/layers/quantization/utils/machete_utils.py +++ b/vllm/model_executor/layers/quantization/utils/machete_utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import List, Optional, Tuple import torch @@ -10,19 +10,19 @@ MACHETE_PREPACKED_BLOCK_SHAPE = [64, 128] -def query_machete_supported_quant_types(zero_points: bool) -> list[ScalarType]: +def query_machete_supported_quant_types(zero_points: bool) -> List[ScalarType]: if zero_points: return [scalar_types.uint4, scalar_types.uint8] else: return [scalar_types.uint4b8, scalar_types.uint8b128] -def query_machete_supported_act_types(zero_points: bool) -> list[ScalarType]: +def query_machete_supported_act_types(zero_points: bool) -> List[ScalarType]: return [torch.float16, torch.bfloat16] def check_machete_supports_shape(in_features: int, out_featrues: int) \ - -> tuple[bool, Optional[str]]: + -> Tuple[bool, Optional[str]]: if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0: return False, "Input features size must be divisible by "\ f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}" diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 57f6137bf4763..80416c1bc6ebc 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import List, Optional, Tuple import numpy import torch @@ -53,7 +53,7 @@ def _check_marlin_supported( quant_type: ScalarType, group_size: Optional[int], has_zp: bool, - device_capability: Optional[int] = None) -> tuple[bool, Optional[str]]: + device_capability: Optional[int] = None) -> Tuple[bool, Optional[str]]: if device_capability is None: capability_tuple = current_platform.get_device_capability() @@ -126,7 +126,7 @@ def verify_marlin_supports_shape(output_size_per_partition: int, def check_marlin_supports_shape(output_size_per_partition: int, input_size_per_partition: int, input_size: int, group_size: int) \ - -> tuple[bool, 
Optional[str]]: + -> Tuple[bool, Optional[str]]: try: verify_marlin_supports_shape(output_size_per_partition, input_size_per_partition, input_size, @@ -184,16 +184,16 @@ def marlin_make_empty_zp(device: torch.device) -> torch.Tensor: def marlin_sort_g_idx( - g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + g_idx: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: g_idx_sort_indices = torch.argsort(g_idx).to(torch.int) return g_idx[g_idx_sort_indices], g_idx_sort_indices def get_scale_perms(): - scale_perm: list[int] = [] + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single: list[int] = [] + scale_perm_single: List[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py index 81112b27f53a8..fb557a31393ca 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Utility functions used for tests and benchmarks""" -from typing import Optional +from typing import List, Optional import numpy as np import torch @@ -64,9 +64,9 @@ def marlin_weights(q_w, size_k, size_n, num_bits, perm): def get_weight_perm(num_bits: int): - perm_list: list[int] = [] + perm_list: List[int] = [] for i in range(32): - perm1: list[int] = [] + perm1: List[int] = [] col = i // 4 for block in [0, 1]: for row in [ diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py index 73feb4264a8bb..3654268e27af3 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py @@ -2,6 +2,7 @@ """Utility functions used for tests and benchmarks""" import random +from typing import List import numpy import torch @@ -372,19 +373,19 @@ def compress_quantized_24_weight(q_24, size_k, size_n, wtype: ScalarType): def get_scale_perms_24(): - scale_perm: list[int] = [] + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]]) - scale_perm_single: list[int] = [] + scale_perm_single: List[int] = [] for i in range(8): scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]]) return scale_perm, scale_perm_single def get_weight_perm_24(num_bits: int): - perm_list: list[int] = [] + perm_list: List[int] = [] for i in range(32): - perm1: list[int] = [] + perm1: List[int] = [] col = i // 4 col_o = col // 2 for block in [0, 1]: diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py index 0123540fc5ddd..176b2947ab09e 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import List + import numpy import torch @@ -32,10 +34,10 @@ def marlin_qqq_weights(q_w, size_k, size_n, num_bits, perm, group_size): def get_qqq_scale_perms(): - scale_perm: list[int] = [] + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single: list[int] = [] + scale_perm_single: 
List[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) @@ -44,9 +46,9 @@ def get_qqq_scale_perms(): # NOTE(HandH1998): QQQ employs different perms for per-group and per-channel weight quantization. # noqa: E501 def get_qqq_weight_perm(num_bits: int, quant_type: str): - perm_list: list[int] = [] + perm_list: List[int] = [] for i in range(32): - perm1: list[int] = [] + perm1: List[int] = [] col = i // 4 for block in [0, 1]: for row in [ diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 6ba327f3db7a4..c7ce3a42c81f9 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """This file is used for /tests and /benchmarks""" -from collections.abc import Mapping from types import MappingProxyType -from typing import Optional +from typing import List, Mapping, Optional, Tuple import numpy import torch @@ -16,7 +15,7 @@ # Normalize the group_shape to the full extent for any dims that are -1 -def _normalize_quant_group_shape(x: torch.Tensor, group_shape: tuple[int, +def _normalize_quant_group_shape(x: torch.Tensor, group_shape: Tuple[int, int]): # -1 means full extent return (group_shape[0] if group_shape[0] > 0 else x.shape[-2], @@ -57,9 +56,9 @@ def group_broadcast(t, shape): # (i.e. per-token-per-group) def scaled_quantize( x: torch.Tensor, - group_shape: tuple[int, int], + group_shape: Tuple[int, int], quant_dtype: torch.dtype, -) -> tuple[torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor]: group_shape = _normalize_quant_group_shape(x, group_shape) assert quant_dtype.is_floating_point, \ "currently `scaled_quantize` only supports floating point dtypes " \ @@ -98,9 +97,9 @@ def scaled_quantize( def scaled_dequantize( x_q: torch.Tensor, x_s: torch.Tensor, - group_shape: Optional[tuple[int, int]] = None, + group_shape: Optional[Tuple[int, int]] = None, out_dtype: torch.dtype = torch.float32, -) -> tuple[torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor]: if group_shape is not None: group_shape = _normalize_quant_group_shape(x_q, group_shape) @@ -174,8 +173,8 @@ def unpack_quantized_values_into_int32(w_q: torch.Tensor, def is_layer_skipped( prefix: str, - ignored_layers: list[str], - fused_mapping: Mapping[str, list[str]] = MappingProxyType({}) + ignored_layers: List[str], + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) ) -> bool: # prefix: model.layers.0.self_attn.q_proj # proj_name: q_proj diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 4cbf64a5a06bf..0f93b7f6c45ba 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Union +from typing import List, Optional, Tuple, Union import torch @@ -68,7 +68,7 @@ def all_close_1d(x: torch.Tensor) -> bool: def convert_to_channelwise( weight_scale: torch.Tensor, - logical_widths: list[int]) -> tuple[torch.Tensor, torch.Tensor]: + logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: # Create channelwise buffer weight_scale_channel = torch.empty((sum(logical_widths), 1), dtype=torch.float32, @@ -86,7 +86,7 @@ def convert_to_channelwise( def 
requantize_with_max_scale( weight: torch.Tensor, weight_scale: torch.Tensor, - logical_widths: list[int]) -> tuple[torch.Tensor, torch.Tensor]: + logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: # Max scale to be used for requanitzation. max_w_scale = weight_scale.max() @@ -250,7 +250,7 @@ def normalize_e4m3fn_to_e4m3fnuz( weight: torch.Tensor, weight_scale: torch.Tensor, input_scale: Optional[torch.Tensor] = None -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: assert weight.dtype == torch.float8_e4m3fn # The bits pattern 10000000(-128) represents zero in e4m3fn # but NaN in e4m3fnuz. So here we set it to 0. diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 78aa82285af2b..62e27b714866a 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -2,7 +2,7 @@ from functools import cached_property from importlib.util import find_spec -from typing import Optional +from typing import Dict, Optional, Tuple import torch import torch.jit @@ -65,7 +65,7 @@ def forward( bonus_token_ids: torch.Tensor, draft_probs: torch.Tensor, draft_token_ids: torch.Tensor, - seeded_seqs: Optional[dict[int, torch.Generator]] = None, + seeded_seqs: Optional[Dict[int, torch.Generator]] = None, ) -> torch.Tensor: """Sample token ids using rejection sampling. This accepts or rejects tokens proposed by the draft model using the probability of each token @@ -95,7 +95,7 @@ def forward( probabilities. shape = [batch_size, num_speculative_tokens] - seeded_seqs: dict of batch row index to torch generator, for + seeded_seqs: Dict of batch row index to torch generator, for sequences using seeded generation. Returns: @@ -161,8 +161,8 @@ def _batch_modified_rejection_sampling( target_probs: torch.Tensor, # [batch_size, k, vocab_size] draft_probs: torch.Tensor, # [batch_size, k, vocab_size] draft_token_ids: torch.Tensor, # [batch_size, k] - seeded_seqs: Optional[dict[int, torch.Generator]], - ) -> tuple[torch.Tensor, torch.Tensor]: + seeded_seqs: Optional[Dict[int, torch.Generator]], + ) -> Tuple[torch.Tensor, torch.Tensor]: """Perform modified rejection sampling on each sequence. Returns: @@ -194,7 +194,7 @@ def _batch_modified_rejection_sampling( return accepted, recovered_token_ids def _create_uniform_samples(self, - seeded_seqs: Optional[dict[int, + seeded_seqs: Optional[Dict[int, torch.Generator]], batch_size: int, k: int, device: torch.device) -> torch.Tensor: @@ -210,7 +210,7 @@ def _create_uniform_samples(self, a seed. Args: - seeded_seqs : Optional[dict[int, torch.Generator]] + seeded_seqs : Optional[Dict[int, torch.Generator]] A dictionary mapping indices in the batch to `torch.Generator` objects. If `None`, all samples are generated without a seed. @@ -255,7 +255,7 @@ def _get_accepted( target_probs: torch.Tensor, # [batch_size, k, vocab_size] draft_probs: torch.Tensor, # [batch_size, k, vocab_size] draft_token_ids: torch.Tensor, # [batch_size, k] - seeded_seqs: Optional[dict[int, torch.Generator]], + seeded_seqs: Optional[Dict[int, torch.Generator]], ) -> torch.Tensor: r"""Create bool matrix over the proposed draft tokens. 
If True, then a token can be accepted, else it should be @@ -376,7 +376,7 @@ def _multinomial( probs: torch.Tensor, num_samples: int, k: int, - seeded_seqs: dict[int, torch.Generator], + seeded_seqs: Dict[int, torch.Generator], ) -> torch.Tensor: if num_samples > 1: diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index 839688e313aae..4c9860006c328 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -33,7 +33,7 @@ """ import math from functools import partial -from typing import Callable, Optional, Union +from typing import Callable, Optional, Tuple, Union import numpy as np import torch @@ -69,7 +69,7 @@ def get_abs_pos(abs_pos: torch.Tensor, tgt_size: Union[torch.Tensor, # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 def get_1d_sincos_pos_embed_from_grid( embed_dim: int, pos: np.ndarray, - version: tuple[int, int] = (2, 0)) -> torch.Tensor: + version: Tuple[int, int] = (2, 0)) -> torch.Tensor: """ embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) / (H, W) @@ -96,7 +96,7 @@ def get_1d_sincos_pos_embed_from_grid( def get_2d_sincos_pos_embed_from_grid( embed_dim: int, grid: np.ndarray, - version: tuple[int, int] = (2, 0)) -> torch.Tensor: + version: Tuple[int, int] = (2, 0)) -> torch.Tensor: assert embed_dim % 2 == 0 # use half of dimensions to encode grid_h @@ -114,9 +114,9 @@ def get_2d_sincos_pos_embed_from_grid( def get_2d_sincos_pos_embed( embed_dim: int, - grid_size: Union[int, tuple[int, int]], + grid_size: Union[int, Tuple[int, int]], cls_token: bool = False, - version: tuple[int, int] = (2, 0), + version: Tuple[int, int] = (2, 0), ) -> torch.Tensor: """ grid_size: int of the grid height and width diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 384d09f55e321..64c2dac524f2b 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -23,7 +23,7 @@ # limitations under the License. 
"""Rotary Positional Embeddings.""" import math -from typing import Any, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union import torch import torch.nn as nn @@ -128,7 +128,7 @@ def forward_native( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: """A PyTorch-native implementation of forward().""" if offsets is not None: positions = positions + offsets @@ -158,7 +158,7 @@ def forward_cuda( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: from vllm import _custom_ops as ops self.cos_sin_cache = self.cos_sin_cache.to(query.device, @@ -181,7 +181,7 @@ def forward_xpu( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: from vllm._ipex_ops import ipex_ops as ops self.cos_sin_cache = self.cos_sin_cache.to(positions.device, @@ -204,7 +204,7 @@ def forward_hpu( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: from habana_frameworks.torch.hpex.kernels import ( RotaryPosEmbeddingMode, apply_rotary_pos_emb) if offsets is not None: @@ -260,7 +260,7 @@ def forward_neuron( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: def _apply_rotary_emb_neuron( x: torch.Tensor, @@ -373,23 +373,23 @@ def __init__( max_position_embeddings: int, base: int, is_neox_style: bool, - scaling_factors: Union[list[float], float], + scaling_factors: Union[List[float], float], dtype: torch.dtype, ) -> None: if isinstance(scaling_factors, float): scaling_factors = [scaling_factors] - self.scaling_factors: list[float] = scaling_factors # noqa + self.scaling_factors: List[float] = scaling_factors # noqa super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype) # Lazy initialized. - self._scaling_factor_to_offset: dict[float, int] + self._scaling_factor_to_offset: Dict[float, int] def _compute_cos_sin_cache(self) -> torch.Tensor: inv_freq = self._compute_inv_freq(self.base) - cache_list: list[torch.Tensor] = [] + cache_list: List[torch.Tensor] = [] # offsets to the next cache in a tensor. # Each offset corresponds to the same index in scaling_factors. - offsets: list[int] = [] + offsets: List[int] = [] for scaling_factor in self.scaling_factors: # NOTE(woosuk): self.max_position_embeddings is the original # maximum length before applying the rope scaling. 
@@ -419,7 +419,7 @@ def _compute_cos_sin_cache(self) -> torch.Tensor: return torch.cat(cache_list, dim=0) @property - def scaling_factor_to_offset(self) -> dict[float, int]: + def scaling_factor_to_offset(self) -> Dict[float, int]: return self._scaling_factor_to_offset @@ -479,7 +479,7 @@ def _yarn_find_correction_range( high_rot: int, dim: int, base: float = 10000, - max_position_embeddings: int = 2048) -> tuple[int, int]: + max_position_embeddings: int = 2048) -> Tuple[int, int]: low = math.floor( _yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)) high = math.ceil( @@ -580,8 +580,8 @@ def __init__( base: int, is_neox_style: bool, dtype: torch.dtype, - short_factor: list[float], - long_factor: list[float], + short_factor: List[float], + long_factor: List[float], short_mscale: Optional[float] = None, long_mscale: Optional[float] = None, ): @@ -629,7 +629,7 @@ def __init__( long_short_cache, persistent=False) - def _compute_inv_freq(self, rescale_factors: list[float]) -> torch.Tensor: + def _compute_inv_freq(self, rescale_factors: List[float]) -> torch.Tensor: rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32) inv_freq = 1.0 / (rescale_factors * (self.base**(torch.arange( 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))) @@ -638,7 +638,7 @@ def _compute_inv_freq(self, rescale_factors: list[float]) -> torch.Tensor: def _compute_cos_sin_cache( self, max_position_embeddings: int, - rescale_factors: list[float], + rescale_factors: List[float], mscale: float, ) -> torch.Tensor: inv_freq = self._compute_inv_freq(rescale_factors) @@ -655,7 +655,7 @@ def forward( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: query = query.view(*query.shape[:-1], -1, self.head_size) key = key.view(*key.shape[:-1], -1, self.head_size) @@ -765,7 +765,7 @@ def forward( query: torch.Tensor, key: torch.Tensor, offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: """PyTorch-native implementation equivalent to forward().""" query_rot = query[..., :self.rotary_dim] key_rot = key[..., :self.rotary_dim] @@ -857,7 +857,7 @@ def __init__( base: int, is_neox_style: bool, dtype: torch.dtype, - mrope_section: Optional[list[int]] = None, + mrope_section: Optional[List[int]] = None, ) -> None: # In Qwen2.5-VL, the maximum index value is related to the duration of # the input video. We enlarge max_position_embeddings to 4 times to get @@ -875,7 +875,7 @@ def forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: """PyTorch-native implementation equivalent to forward(). 
Args: @@ -921,14 +921,14 @@ def forward( @staticmethod def get_input_positions( - input_tokens: list[int], + input_tokens: List[int], hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - second_per_grid_ts: Optional[list[float]] = None, + image_grid_thw: Union[List[List[int]], torch.Tensor], + video_grid_thw: Union[List[List[int]], torch.Tensor], + second_per_grid_ts: Optional[List[float]] = None, context_len: int = 0, seq_len: Optional[int] = None, - ) -> tuple[list[list[int]], int]: + ) -> Tuple[List[List[int]], int]: """Get mrope input positions and delta value.""" llm_positions, mrope_position_delta = \ @@ -946,14 +946,14 @@ def get_input_positions( @staticmethod def get_input_positions_tensor( - input_tokens: list[int], + input_tokens: List[int], hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - second_per_grid_ts: Optional[list[float]] = None, + image_grid_thw: Union[List[List[int]], torch.Tensor], + video_grid_thw: Union[List[List[int]], torch.Tensor], + second_per_grid_ts: Optional[List[float]] = None, context_len: int = 0, seq_len: Optional[int] = None, - ) -> tuple[torch.Tensor, int]: + ) -> Tuple[torch.Tensor, int]: """Get mrope input positions and delta value.""" image_token_id = hf_config.image_token_id @@ -1052,7 +1052,7 @@ def get_next_input_positions( mrope_position_delta: int, context_len: int, seq_len: int, - ) -> list[list[int]]: + ) -> List[List[int]]: return [ list( range(context_len + mrope_position_delta, @@ -1071,7 +1071,7 @@ def get_next_input_positions_tensor( ).expand(3, -1) -_ROPE_DICT: dict[tuple, RotaryEmbedding] = {} +_ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {} def get_rope( @@ -1080,7 +1080,7 @@ def get_rope( max_position: int, base: int, is_neox_style: bool = True, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, dtype: Optional[torch.dtype] = None, partial_rotary_factor: float = 1.0, ) -> RotaryEmbedding: diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index b0669c117d416..07ee75593f7b7 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -2,11 +2,10 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools import warnings -from collections.abc import Iterator from dataclasses import dataclass from importlib.util import find_spec from math import inf -from typing import Optional, Union +from typing import Dict, Iterator, List, Optional, Tuple, Union import msgspec import torch @@ -43,14 +42,14 @@ def get_sampler() -> torch.nn.Module: # (num_token_ids, num_parent_ids) per sequence group. 
-SampleResultType = list[tuple[list[int], list[int]]] +SampleResultType = List[Tuple[List[int], List[int]]] # Types of temporary data structures used for # computing sample_result -SampleMetadataType = dict[SamplingType, tuple[list[int], - list[SequenceGroupToSample]]] -MultinomialSamplesType = dict[SamplingType, torch.Tensor] -SampleResultsDictType = dict[int, tuple[list[int], list[int]]] +SampleMetadataType = Dict[SamplingType, Tuple[List[int], + List[SequenceGroupToSample]]] +MultinomialSamplesType = Dict[SamplingType, torch.Tensor] +SampleResultsDictType = Dict[int, Tuple[List[int], List[int]]] # Encapsulates temporary data structures for computing @@ -77,7 +76,7 @@ class SampleResultArgsType: MaybeDeferredSampleResultType = Union[SampleResultType, SampleResultArgsType] # Abbreviation of the _sample() return type -SampleReturnType = tuple[MaybeDeferredSampleResultType, Optional[torch.Tensor]] +SampleReturnType = Tuple[MaybeDeferredSampleResultType, Optional[torch.Tensor]] class SamplerOutput( @@ -91,7 +90,7 @@ class SamplerOutput( also has optional fields for device tensors. """ - outputs: list[CompletionSequenceGroupOutput] + outputs: List[CompletionSequenceGroupOutput] # On-device tensor containing probabilities of each token. sampled_token_probs: Optional[torch.Tensor] = None @@ -345,8 +344,8 @@ def _apply_min_tokens_penalty( """Apply min_tokens penalty which sets stop tokens to -inf if min_tokens have not been generated yet """ - # List of indices in logits that will be set to -inf - logits_to_penalize: list[tuple[int, int]] = [] + # list of indices in logits that will be set to -inf + logits_to_penalize: List[Tuple[int, int]] = [] logits_applied = 0 for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids @@ -362,7 +361,7 @@ def _apply_min_tokens_penalty( min_tokens = sampling_params.min_tokens token_ids_to_penalize = sampling_params.all_stop_token_ids if min_tokens > 0 and token_ids_to_penalize: - seqs_to_penalize: list[int] = [] + seqs_to_penalize: List[int] = [] for j, seq_id in enumerate(seq_ids): seq_data = seq_group.seq_data[seq_id] if len(seq_data.output_token_ids_array) < min_tokens: @@ -432,7 +431,7 @@ def _apply_min_p( def _greedy_sample( - selected_seq_groups: list[SequenceGroupToSample], + selected_seq_groups: List[SequenceGroupToSample], samples: torch.Tensor, ) -> SampleResultType: """Run greedy sampling on a given samples. @@ -443,7 +442,7 @@ def _greedy_sample( samples could be smaller than selected_seq_groups if seq_group.do_sample is False. Returns: - tuple of (next_token_ids, parent_ids). The length of returned list is + Tuple of (next_token_ids, parent_ids). The length of returned list is same as the length of selected_seq_groups. If the corresponding seq_group has do_sample=False, tuple contains ([], []) """ @@ -467,7 +466,7 @@ def _greedy_sample( def _random_sample( - selected_seq_groups: list[SequenceGroupToSample], + selected_seq_groups: List[SequenceGroupToSample], random_samples: torch.Tensor, ) -> SampleResultType: """Run random sampling on a given samples. @@ -478,7 +477,7 @@ def _random_sample( length of samples could be smaller than selected_seq_groups if seq_group.do_sample is False. Returns: - tuple of (next_token_ids, parent_ids). The length of returned list is + Tuple of (next_token_ids, parent_ids). The length of returned list is same as the length of selected_seq_groups. 
If the corresponding seq_group has do_sample=False, tuple contains ([], []) """ @@ -518,7 +517,7 @@ def _random_sample( def _multinomial( probs: torch.Tensor, num_samples: int, - seq_groups: Optional[list[SequenceGroupToSample]] = None, + seq_groups: Optional[List[SequenceGroupToSample]] = None, ) -> torch.Tensor: if num_samples > 1: probs = probs.repeat_interleave(num_samples, dim=0) @@ -539,7 +538,7 @@ def _multinomial( def _top_k_top_p_multinomial_with_flashinfer( probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor, - num_samples: int, seq_groups: Optional[list[SequenceGroupToSample]]): + num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]]): max_top_k_round = 32 if num_samples > 1: probs = probs.repeat_interleave(num_samples, dim=0) @@ -644,7 +643,7 @@ def _sample_with_torch( tensors required for Pythonization ''' - categorized_seq_group_ids: dict[SamplingType, list[int]] = { + categorized_seq_group_ids: Dict[SamplingType, List[int]] = { t: [] for t in SamplingType } @@ -808,7 +807,7 @@ def get_logprobs( logprobs: torch.Tensor, sampling_metadata: SamplingMetadata, sample_results: SampleResultType, -) -> tuple[list[Optional[PromptLogprobs]], list[SampleLogprobs]]: +) -> Tuple[List[Optional[PromptLogprobs]], List[SampleLogprobs]]: """Return sample logprobs and prompt logprobs. The logic consists of 3 parts. @@ -837,9 +836,9 @@ def get_logprobs( """ # The index of query token to calculate logprobs. It includes both # prompt and sample logprob indices. - query_indices: list[int] = [] + query_indices: List[int] = [] # The next token ids to get the logprob value from. - next_token_ids: list[int] = [] + next_token_ids: List[int] = [] # The largest requested number of logprobs. We find logprobs as many as the # largest num logprobs in this API. If every logprobs is None, it will be # set to -1. @@ -921,8 +920,8 @@ def get_logprobs( ranks = ranks.to('cpu') # Find prompt/sample logprobs. - prompt_logprobs_per_seq_group: list[Optional[PromptLogprobs]] = [] - sample_logprobs_per_seq_group: list[SampleLogprobs] = [] + prompt_logprobs_per_seq_group: List[Optional[PromptLogprobs]] = [] + sample_logprobs_per_seq_group: List[SampleLogprobs] = [] top_logprob_idx = 0 selected_logprobs_idx = 0 @@ -973,7 +972,7 @@ def _get_prompt_logprob_if_needed( for idx, token_id in enumerate(next_prompt_tokens): # Calculate the prompt logprob of the real prompt tokens. 
# {token_id: (logprob, rank_from_vocab)} - prompt_logprobs_dict: dict[int, tuple[float, int]] = { + prompt_logprobs_dict: Dict[int, Tuple[float, int]] = { token_id: (selected_logprob_items[idx], rank_items[idx]) } @@ -1005,7 +1004,7 @@ def _get_prompt_logprob_if_needed( def _get_sampled_logprob_if_needed( seq_group: SequenceGroupToSample, - sample_result: tuple[list[int], list[int]], + sample_result: Tuple[List[int], List[int]], selected_logprobs: torch.Tensor, ranks: torch.Tensor, top_token_ids: torch.Tensor, @@ -1126,21 +1125,21 @@ def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor, def _build_sampler_output( maybe_deferred_sample_results: MaybeDeferredSampleResultType, sampling_metadata: SamplingMetadata, - prompt_logprobs: Optional[list[Optional[PromptLogprobs]]], - sample_logprobs: Optional[list[SampleLogprobs]], - on_device_tensors: Optional[tuple[torch.Tensor, torch.Tensor, + prompt_logprobs: Optional[List[Optional[PromptLogprobs]]], + sample_logprobs: Optional[List[SampleLogprobs]], + on_device_tensors: Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]], skip_sampler_cpu_output: bool = False, ) -> SamplerOutput: """Construct Python objects with the output of sampling. Args: - on_device_tensors: tuple containing on-device tensors with the + on_device_tensors: Tuple containing on-device tensors with the probabilities used in sampling and the sampled token ids. This allows post-processing without copies to CPU/serialization, e.g. in speculative decoding rejection sampling. """ - sampler_output: list[CompletionSequenceGroupOutput] = [] + sampler_output: List[CompletionSequenceGroupOutput] = [] if skip_sampler_cpu_output: assert isinstance(maybe_deferred_sample_results, SampleResultArgsType) @@ -1162,7 +1161,7 @@ def _build_sampler_output( prompt_logprobs, sample_logprobs): seq_ids = seq_group.seq_ids next_token_ids, parent_ids = sample_result - seq_outputs: list[SequenceOutput] = [] + seq_outputs: List[SequenceOutput] = [] for parent_id, next_token_id, logprobs in zip( parent_ids, next_token_ids, group_sample_logprobs): seq_outputs.append( @@ -1188,7 +1187,7 @@ def _build_sampler_output( deferred_sample_results_args=deferred_sample_results_args) -def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> list[int]: +def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> List[int]: """Get a list of next prompt tokens to compute logprob from a given sequence group. 
diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py index 969cd59b57ccc..54fd43fc6592c 100644 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import abstractmethod -from typing import Optional, Union +from typing import Dict, Optional, Union import torch import torch.jit @@ -253,6 +253,6 @@ def forward( bonus_token_ids: torch.Tensor, draft_probs: torch.Tensor, draft_token_ids: torch.Tensor, - seeded_seqs: Optional[dict[int, torch.Generator]] = None, + seeded_seqs: Optional[Dict[int, torch.Generator]] = None, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 917fb1d7a0f63..a9ef973917e19 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Utility methods for model layers.""" +from typing import Tuple import torch @@ -8,7 +9,7 @@ def get_token_bin_counts_and_mask( tokens: torch.Tensor, vocab_size: int, num_seqs: int, -) -> tuple[torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor]: # Compute the bin counts for the tokens. # vocab_size + 1 for padding. bin_counts = torch.zeros((num_seqs, vocab_size + 1), diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index d270f2c9d82dd..f65dfc3cb3294 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Sequence from dataclasses import dataclass -from typing import Optional +from typing import List, Optional, Sequence, Tuple import torch import torch.nn.functional as F @@ -25,7 +24,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: list[int], input_size: int, + output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): """Create weights for embedding layer.""" @@ -141,7 +140,7 @@ def get_masked_input_and_mask( input_: torch.Tensor, org_vocab_start_index: int, org_vocab_end_index: int, num_org_vocab_padding: int, added_vocab_start_index: int, - added_vocab_end_index: int) -> tuple[torch.Tensor, torch.Tensor]: + added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: # torch.compile will fuse all of the pointwise ops below # into a single kernel, making it very fast org_vocab_mask = (input_ >= org_vocab_start_index) & ( @@ -298,7 +297,7 @@ def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int, org_vocab_start_index, org_vocab_end_index, added_vocab_start_index, added_vocab_end_index) - def get_sharded_to_full_mapping(self) -> Optional[list[int]]: + def get_sharded_to_full_mapping(self) -> Optional[List[int]]: """Get a mapping that can be used to reindex the gathered logits for sampling. 
@@ -312,9 +311,9 @@ def get_sharded_to_full_mapping(self) -> Optional[list[int]]: if self.tp_size < 2: return None - base_embeddings: list[int] = [] - added_embeddings: list[int] = [] - padding: list[int] = [] + base_embeddings: List[int] = [] + added_embeddings: List[int] = [] + padding: List[int] = [] for tp_rank in range(self.tp_size): shard_indices = self._get_indices(self.num_embeddings_padded, self.org_vocab_size_padded, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index a63e893ae31d8..46247eaf2a60c 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -12,9 +12,9 @@ import os import warnings from abc import ABC, abstractmethod -from collections.abc import Generator, Iterable from contextlib import contextmanager -from typing import Any, Callable, Optional, cast +from typing import (Any, Callable, Dict, Generator, Iterable, List, Optional, + Tuple, cast) import gguf import huggingface_hub @@ -67,7 +67,7 @@ def device_loading_context(module: torch.nn.Module, yield module return - original_device_states: dict[str, torch.device] = {} + original_device_states: Dict[str, torch.device] = {} # Store original device states and move parameters to GPU if they're on CPU for name, p in module.named_parameters(): @@ -253,7 +253,7 @@ def _prepare_weights( revision: Optional[str], fall_back_to_pt: bool, allow_patterns_overrides: Optional[list[str]], - ) -> tuple[str, list[str], bool]: + ) -> Tuple[str, List[str], bool]: """Prepare weights for the model. If the model is not local, it will be downloaded.""" @@ -298,7 +298,7 @@ def _prepare_weights( else: hf_folder = model_name_or_path - hf_weights_files: list[str] = [] + hf_weights_files: List[str] = [] for pattern in allow_patterns: hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) if len(hf_weights_files) > 0: @@ -333,7 +333,7 @@ def _prepare_weights( def _get_weights_iterator( self, source: "Source" - ) -> Generator[tuple[str, torch.Tensor], None, None]: + ) -> Generator[Tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_folder, hf_weights_files, use_safetensors = self._prepare_weights( source.model_or_path, source.revision, source.fall_back_to_pt, @@ -372,7 +372,7 @@ def _get_all_weights( self, model_config: ModelConfig, model: nn.Module, - ) -> Generator[tuple[str, torch.Tensor], None, None]: + ) -> Generator[Tuple[str, torch.Tensor], None, None]: primary_weights = DefaultModelLoader.Source( model_config.model, model_config.revision, @@ -466,7 +466,7 @@ def _verify_config(self, model_config: ModelConfig, self.tensorizer_config.verify_with_parallel_config(parallel_config) def _get_weights_iterator( - self, ) -> Generator[tuple[str, torch.Tensor], None, None]: + self, ) -> Generator[Tuple[str, torch.Tensor], None, None]: tensorizer_args = self.tensorizer_config._construct_tensorizer_args() return tensorizer_weights_iterator(tensorizer_args) @@ -572,12 +572,12 @@ def __init__(self, load_config: LoadConfig): @staticmethod def _filter_subtensors( - tensors: dict[str, torch.Tensor], ) -> dict[str, torch.Tensor]: + tensors: Dict[str, torch.Tensor], ) -> Dict[str, torch.Tensor]: """ Filter out all tensors that share the same memory or a subset of the memory of another tensor. 
""" - same_storage_groups: dict[Any, list[tuple[str, torch.Tensor]]] = ( + same_storage_groups: Dict[Any, List[Tuple[str, torch.Tensor]]] = ( collections.defaultdict(list)) for key, tensor in tensors.items(): if tensor.numel(): @@ -587,7 +587,7 @@ def _filter_subtensors( def get_end_ptr(tensor: torch.Tensor) -> int: return tensor.view(-1)[-1].data_ptr() + tensor.element_size() - result: dict[str, torch.Tensor] = {} + result: Dict[str, torch.Tensor] = {} for group in same_storage_groups.values(): for k, t in group: a, b = t.data_ptr(), get_end_ptr(t) @@ -695,7 +695,7 @@ def save_model( part_idx = 0 total_size = 0 state_dict = ShardedStateLoader._filter_subtensors(model.state_dict()) - state_dict_part: dict[str, torch.Tensor] = {} + state_dict_part: Dict[str, torch.Tensor] = {} for key, tensor in state_dict.items(): param_size = tensor.nelement() * tensor.element_size() if max_size is not None and total_size + param_size > max_size: @@ -726,21 +726,21 @@ def __init__(self, load_config: LoadConfig): super().__init__(load_config) # Save the module names without sharding. - self.unsharded_weights_modules: list[str] = [] + self.unsharded_weights_modules: List[str] = [] # Save the module names that are sharded by column. - self.column_sharded_weights_modules: list[str] = [] + self.column_sharded_weights_modules: List[str] = [] # Store all module names (from transformers) that support # BNB quantization. - self.target_modules: list[str] = [] + self.target_modules: List[str] = [] # mapping weight names from transformers to vllm. self.weight_mapper: Callable = lambda name: name def _get_weight_files( self, model_name_or_path: str, - allowed_patterns: list[str], + allowed_patterns: List[str], revision: Optional[str] = None, - ) -> tuple[list[str], str]: + ) -> Tuple[List[str], str]: """Retrieve weight files. Download the files if necessary. Return the weight files and the file pattern.""" @@ -771,7 +771,7 @@ def _get_weight_files( f"No model weights found in: `{model_name_or_path}`") def _prepare_weights(self, model_name_or_path: str, - revision: Optional[str]) -> tuple[list[str], bool]: + revision: Optional[str]) -> Tuple[List[str], bool]: """Prepare weight files for the model.""" allowed_patterns = ["*.safetensors", "*.bin", "*.pt"] @@ -806,7 +806,7 @@ def _get_quantized_weights_iterator( revision: Optional[str], pre_quant: bool, load_8bit: bool, - ) -> tuple[Generator[tuple[str, torch.Tensor], None, None], dict[str, + ) -> Tuple[Generator[Tuple[str, torch.Tensor], None, None], Dict[str, Any]]: """Get an iterator to the model weights with bitsandbytes quantization, as well as the quantization state dictionary.""" @@ -826,7 +826,7 @@ def _get_quantized_weights_iterator( hf_weights_files, use_safetensors = self._prepare_weights( model_name_or_path, revision) - quant_state_dict: dict[str, Any] = {} + quant_state_dict: Dict[str, Any] = {} if pre_quant: if load_8bit: @@ -908,7 +908,7 @@ def _quantized_4bit_generator(self, hf_weights_files, use_safetensors, # Closure to parse quant_state for each prequant weight def _parse_quant_state(param_name: str, - temp_state_dict: dict) -> QuantState: + temp_state_dict: Dict) -> QuantState: quant_state = {} for k in temp_state_dict: if param_name + "." 
in k: @@ -1066,7 +1066,7 @@ def _load_weights(self, model_config: ModelConfig, # Modules whose weights might have fused on disk # we need their output_sizes to make shard in flight correctly with TP - self.maybe_fused_weights_modules: dict[str, list[int]] = {} + self.maybe_fused_weights_modules: Dict[str, List[int]] = {} self._get_bnb_target_modules(model) for name, module in model.named_modules(): # Some modules like `ReplicatedLinear` should not have their weights @@ -1131,7 +1131,7 @@ def _load_weights(self, model_config: ModelConfig, torch.cuda.empty_cache() param_dict = dict(model.named_parameters()) - stacked_quant_state_dict: dict[str, dict[int, Any]] = {} + stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {} # TODO: Change this lazy import to normal import # after the checks are updated to run on a new version from vllm.model_executor.models.utils import is_pp_missing_parameter @@ -1284,8 +1284,8 @@ def _get_gguf_weights_map(self, model_config: ModelConfig): return gguf_to_hf_name_map def _get_weights_iterator( - self, model_name_or_path: str, gguf_to_hf_name_map: dict[str, str] - ) -> Generator[tuple[str, torch.Tensor], None, None]: + self, model_name_or_path: str, gguf_to_hf_name_map: Dict[str, str] + ) -> Generator[Tuple[str, torch.Tensor], None, None]: return gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map) @@ -1339,7 +1339,7 @@ def __init__(self, load_config: LoadConfig): os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url def _prepare_weights(self, model_name_or_path: str, - revision: Optional[str]) -> list[str]: + revision: Optional[str]) -> List[str]: """Prepare weights for the model. If the model is not local, it will be downloaded.""" @@ -1378,7 +1378,7 @@ def _prepare_weights(self, model_name_or_path: str, def _get_weights_iterator( self, model_or_path: str, - revision: str) -> Generator[tuple[str, torch.Tensor], None, None]: + revision: str) -> Generator[Tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_weights_files = self._prepare_weights(model_or_path, revision) return runai_safetensors_weights_iterator(hf_weights_files) diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index 3d1109f3dfba4..d900fb3a7d397 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -3,7 +3,7 @@ import copy import importlib import os -from typing import Optional +from typing import Dict, List, Optional, Tuple import torch import torch.nn as nn @@ -30,7 +30,7 @@ } # Models supported by Neuron. 
-_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str, str]] = { +_NEURON_SUPPORTED_MODELS: Dict[str, Tuple[str, str, str]] = { "LlamaForCausalLM": ("transformers_neuronx.llama.model", "LlamaForSampling", "LlamaForCausalLM"), "MistralForCausalLM": ("transformers_neuronx.mistral.model", @@ -124,7 +124,7 @@ def _get_model_architecture(config: PretrainedConfig) -> str: f"{list(_NEURON_SUPPORTED_MODELS.keys())}") -def _get_buckets(env: str, default_value: list[int]) -> list[int]: +def _get_buckets(env: str, default_value: List[int]) -> List[int]: env_value = os.getenv(env) if env_value is None: return default_value diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index c54090c16a51b..805f0cfc585e3 100644 --- a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -2,7 +2,7 @@ # ruff: noqa: SIM117 from pathlib import Path -from typing import Optional +from typing import List, Optional, Tuple import openvino as ov import torch @@ -147,7 +147,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: list[tuple[ov.Tensor, ov.Tensor]], + kv_caches: List[Tuple[ov.Tensor, ov.Tensor]], attn_metadata: OpenVINOAttentionMetadata, ) -> torch.Tensor: flatten_kv_cache = _flattenize_inputs(kv_caches) diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 0ff35b3a6dca1..117251ccf05f1 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -6,10 +6,9 @@ import os import re import time -from collections.abc import Generator from dataclasses import dataclass from functools import partial -from typing import BinaryIO, Optional, Union +from typing import BinaryIO, Generator, Optional, Tuple, Type, Union import torch from torch import nn @@ -68,7 +67,7 @@ class TensorizerConfig: s3_access_key_id: Optional[str] = None s3_secret_access_key: Optional[str] = None s3_endpoint: Optional[str] = None - model_class: Optional[type[torch.nn.Module]] = None + model_class: Optional[Type[torch.nn.Module]] = None hf_config: Optional[PretrainedConfig] = None dtype: Optional[Union[str, torch.dtype]] = None _is_sharded: bool = False @@ -366,7 +365,7 @@ def deserialize(self): def tensorizer_weights_iterator( tensorizer_args: "TensorizerArgs" -) -> Generator[tuple[str, torch.Tensor], None, None]: +) -> Generator[Tuple[str, torch.Tensor], None, None]: logger.warning("Deserializing HuggingFace models is not optimized for " "loading on vLLM, as tensorizer is forced to load to CPU. " "Consider deserializing a vLLM model instead for faster " diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index d76304106eb48..9686231fb4bd1 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -2,7 +2,7 @@ """Utilities for selecting and loading models.""" import contextlib from dataclasses import dataclass, field -from typing import Optional +from typing import Dict, List, Optional, Tuple, Type import torch import transformers @@ -84,7 +84,7 @@ def resolve_transformers_fallback(model_config: ModelConfig, def get_model_architecture( - model_config: ModelConfig) -> tuple[type[nn.Module], str]: + model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) # Special handling for quantized Mixtral. 
@@ -128,8 +128,8 @@ class ParamMapping: It creates a bidirectional mapping between packed parameters and their constituent parts. """ - packed_mapping: dict[str, list[str]] - inverse_packed_mapping: dict[str, tuple[str, + packed_mapping: Dict[str, List[str]] + inverse_packed_mapping: Dict[str, Tuple[str, int]] = field(default_factory=dict) def __post_init__(self): @@ -144,7 +144,7 @@ def __post_init__(self): ) def get_sub_modules(self, - module_name: str) -> Optional[tuple[str, list[str]]]: + module_name: str) -> Optional[Tuple[str, List[str]]]: for key, value in self.packed_mapping.items(): if module_name.endswith(key): return key, value @@ -152,7 +152,7 @@ def get_sub_modules(self, def configure_quant_config(quant_config: QuantizationConfig, - model_class: type[nn.Module]): + model_class: Type[nn.Module]): """ Pass packed_modules_mapping by reference to quant_config so that quant_config can properly match fused modules diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 77df3884f27d5..245c199f75b18 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -8,8 +8,7 @@ import tempfile import time from collections import defaultdict -from collections.abc import Generator -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union import filelock import gguf @@ -218,9 +217,9 @@ def get_quant_config(model_config: ModelConfig, def download_weights_from_hf( model_name_or_path: str, cache_dir: Optional[str], - allow_patterns: list[str], + allow_patterns: List[str], revision: Optional[str] = None, - ignore_patterns: Optional[Union[str, list[str]]] = None, + ignore_patterns: Optional[Union[str, List[str]]] = None, ) -> str: """Download model weights from Hugging Face Hub. @@ -228,11 +227,11 @@ def download_weights_from_hf( model_name_or_path (str): The model name or path. cache_dir (Optional[str]): The cache directory to store the model weights. If None, will use HF defaults. - allow_patterns (list[str]): The allowed patterns for the + allow_patterns (List[str]): The allowed patterns for the weight files. Files matched by any of the patterns will be downloaded. revision (Optional[str]): The revision of the model. - ignore_patterns (Optional[Union[str, list[str]]]): The patterns to + ignore_patterns (Optional[Union[str, List[str]]]): The patterns to filter out the weight files. Files matched by any of the patterns will be ignored. @@ -312,9 +311,9 @@ def download_safetensors_index_file_from_hf( # Passing both of these to the weight loader functionality breaks. # So, we use the index_file to # look up which safetensors files should be used. -def filter_duplicate_safetensors_files(hf_weights_files: list[str], +def filter_duplicate_safetensors_files(hf_weights_files: List[str], hf_folder: str, - index_file: str) -> list[str]: + index_file: str) -> List[str]: # model.safetensors.index.json is a mapping from keys in the # torch state_dict to safetensors file holding that weight. index_file_name = os.path.join(hf_folder, index_file) @@ -337,7 +336,7 @@ def filter_duplicate_safetensors_files(hf_weights_files: list[str], def filter_files_not_needed_for_inference( - hf_weights_files: list[str]) -> list[str]: + hf_weights_files: List[str]) -> List[str]: """ Exclude files that are not needed for inference. 
@@ -366,8 +365,8 @@ def filter_files_not_needed_for_inference( def np_cache_weights_iterator( model_name_or_path: str, cache_dir: Optional[str], hf_folder: str, - hf_weights_files: list[str] -) -> Generator[tuple[str, torch.Tensor], None, None]: + hf_weights_files: List[str] +) -> Generator[Tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model np files. Will dump the model weights to numpy files if they are not already dumped. @@ -383,7 +382,7 @@ def np_cache_weights_iterator( # dumping the same model weights to numpy at the same time. with get_lock(model_name_or_path, cache_dir): if not os.path.exists(weight_names_file): - weight_names: list[str] = [] + weight_names: List[str] = [] for bin_file in tqdm( hf_weights_files, desc="Loading np_cache checkpoint shards", @@ -412,8 +411,8 @@ def np_cache_weights_iterator( def safetensors_weights_iterator( - hf_weights_files: list[str] -) -> Generator[tuple[str, torch.Tensor], None, None]: + hf_weights_files: List[str] +) -> Generator[Tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" enable_tqdm = not torch.distributed.is_initialized( ) or torch.distributed.get_rank() == 0 @@ -430,8 +429,8 @@ def safetensors_weights_iterator( def runai_safetensors_weights_iterator( - hf_weights_files: list[str] -) -> Generator[tuple[str, torch.Tensor], None, None]: + hf_weights_files: List[str] +) -> Generator[Tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" enable_tqdm = not torch.distributed.is_initialized( ) or torch.distributed.get_rank() == 0 @@ -447,8 +446,8 @@ def runai_safetensors_weights_iterator( def pt_weights_iterator( - hf_weights_files: list[str] -) -> Generator[tuple[str, torch.Tensor], None, None]: + hf_weights_files: List[str] +) -> Generator[Tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model bin/pt files.""" enable_tqdm = not torch.distributed.is_initialized( ) or torch.distributed.get_rank() == 0 @@ -464,7 +463,7 @@ def pt_weights_iterator( def get_gguf_extra_tensor_names( - gguf_file: str, gguf_to_hf_name_map: dict[str, str]) -> list[str]: + gguf_file: str, gguf_to_hf_name_map: Dict[str, str]) -> List[str]: reader = gguf.GGUFReader(gguf_file) expected_gguf_keys = set(gguf_to_hf_name_map.keys()) exact_gguf_keys = set([tensor.name for tensor in reader.tensors]) @@ -473,8 +472,8 @@ def get_gguf_extra_tensor_names( def gguf_quant_weights_iterator( - gguf_file: str, gguf_to_hf_name_map: dict[str, str] -) -> Generator[tuple[str, torch.Tensor], None, None]: + gguf_file: str, gguf_to_hf_name_map: Dict[str, str] +) -> Generator[Tuple[str, torch.Tensor], None, None]: """ Iterate over the quant weights in the model gguf files and convert them to torch tensors diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index ed244541aefb4..e2d4a8de605b9 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Snowflake Arctic model.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -472,8 +471,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> 
Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -481,8 +480,8 @@ def load_weights(self, weights: Iterable[tuple[str, ("qkv_proj", "v_proj", "v"), ] - mlp_params_mapping: list[tuple[str, str, int]] = [] - expert_params_mapping: list[tuple[str, str, int]] = [] + mlp_params_mapping: List[Tuple[str, str, int]] = [] + expert_params_mapping: List[Tuple[str, str, int]] = [] num_layers = self.config.num_hidden_layers for layer in range(num_layers): @@ -511,7 +510,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("ws", f"experts.{expert_id}.w3.weight", expert_id)) params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() logger.info( "It will take ~10 minutes loading from the 16-bit weights. " diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index ffa931e054723..656e9b037d969 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Iterable, Mapping -from typing import Optional, TypedDict, Union +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import torch import torch.nn as nn @@ -67,8 +67,8 @@ def __init__( # Identity layer self.post_layernorm = nn.Identity() - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -76,7 +76,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: # NOTE: post_layernorm is not used in Aria @@ -323,8 +323,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Adapted from LlamaModel.load_weights with the modification of adding # the expert weights mapping to `stacked_params_mapping` - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -336,7 +336,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("experts.w2_weight", "experts.fc2.weight", 'w2'), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -533,7 +533,7 @@ def __init__( self.sampler = get_sampler() def _validate_image_sizes( - self, images: list[torch.Tensor]) -> list[torch.Tensor]: + self, images: List[torch.Tensor]) -> List[torch.Tensor]: if not all(img.shape == images[0].shape for img in images): raise ValueError("All images must be the same size") return images @@ -583,7 +583,7 @@ def _create_patch_attention_mask( def _process_image_input( self, image_input: AriaImagePixelInputs - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: assert self.vision_tower is not None pixel_values = image_input['pixel_values'] @@ -660,6 +660,6 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, 
weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 8ec42c5c62da1..4fb68e7b48da9 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -20,8 +20,7 @@ # limitations under the License. """Inference-only BaiChuan model compatible with HuggingFace weights.""" import math -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -232,7 +231,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -393,15 +392,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index bf192823b8a20..69da05884ded8 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Bamba model.""" # Added by the IBM Team, 2024 -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, Optional, Set, Tuple import torch from torch import nn @@ -38,7 +37,7 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor] class BambaMLP(nn.Module): @@ -454,7 +453,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> tuple[tuple[int, int], tuple[int, int]]: + self) -> Tuple[Tuple[int, int], Tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size @@ -502,8 +501,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -514,7 +513,7 @@ def load_weights(self, weights: Iterable[tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 780eb18726613..93452696dca55 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -19,8 +19,7 @@ # limitations under the License. 
"""PyTorch BART model.""" import math -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, Optional, Tuple import torch from torch import nn @@ -882,14 +881,14 @@ def _rename_key(self, key: str): def _rename_stacked_param( self, name: str, - ) -> tuple[str, Optional[str]]: + ) -> Tuple[str, Optional[str]]: for key, mapping in self.stacked_params_mapping.items(): if key in name: name = name.replace(key, mapping["param_name"]) return name, mapping["shard_id"] return name, None - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): model_params_dict = dict(self.model.named_parameters()) top_params_dict = dict(self.named_parameters()) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index ced2ad9e966ff..4ff69527653d8 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, Optional, Set, Tuple import torch from torch import nn @@ -349,8 +348,8 @@ def forward( token_type_ids=token_type_ids) return self.encoder(hidden_states) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "query", "q"), @@ -359,7 +358,7 @@ def load_weights(self, weights: Iterable[tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if self.pooler is None and "pooler" in name: continue @@ -424,7 +423,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) @@ -471,7 +470,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._pooler = CrossEncodingPooler(config, self.classifier, self.bert.pooler) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): self_weights = [] diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index ec836732379d3..bedbdceb7721d 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -294,8 +293,8 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return self.post_layernorm(hidden_states) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -303,7 +302,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("qkv_proj", "v_proj", 
"v"), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() layer_count = len(self.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index f755bcd59c43d..23bb3cd07f1d4 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Iterable, Mapping from functools import cached_property -from typing import Literal, Optional, TypedDict, Union +from typing import (Iterable, Literal, Mapping, Optional, Set, Tuple, + TypedDict, Union) import torch import torch.nn as nn @@ -184,7 +184,7 @@ def forward( self, hidden_states: torch.Tensor, encoder_hidden_states: Optional[torch.FloatTensor] = None, - ) -> tuple[torch.Tensor]: + ) -> Tuple[torch.Tensor]: self_output = self.attention( hidden_states, encoder_hidden_states=encoder_hidden_states, @@ -725,7 +725,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 9d5bb7f77d1aa..84b79613abc47 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -18,8 +18,7 @@ # limitations under the License. """Inference-only BLOOM model compatible with HuggingFace weights.""" import math -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -333,10 +332,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if name == "lm_head.weight": continue diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index ac0a1d102a106..e91399b2674df 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Iterable, Mapping from functools import cached_property -from typing import Any, Literal, Optional, TypedDict, Union +from typing import (Any, Dict, Iterable, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) import torch import torch.nn as nn @@ -233,7 +233,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 4096, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -296,7 +296,7 @@ def __init__( prefix=f"{prefix}.attn") def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # reshape for layernorm q = q.reshape(-1, self.num_heads, self.head_dim) k = k.reshape(-1, self.num_kv_heads, 
self.head_dim) @@ -371,7 +371,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: if residual is None: residual = hidden_states @@ -442,7 +442,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: residual = hidden_states hidden_states = self.self_attn( @@ -777,7 +777,7 @@ def __init__(self, config: ChameleonVQVAEConfig): def encode( self, pixel_values: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: hidden_states = self.encoder(pixel_values) hidden_states = self.quant_conv(hidden_states) quant, emb_loss, indices = self.quantize(hidden_states) @@ -790,7 +790,7 @@ class ChameleonImageVocabularyMapping: A class for mapping discrete image tokens from VQGAN to BPE tokens. """ - def __init__(self, vocab_map: dict[str, int]): + def __init__(self, vocab_map: Dict[str, int]): self.vocab_map = vocab_map self.image_token_id = vocab_map.get("") @@ -1059,8 +1059,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1070,7 +1070,7 @@ def load_weights(self, weights: Iterable[tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index e135dcda91ccc..6eca25212ee66 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -2,8 +2,7 @@ # Adapted from # https://github.com/THUDM/ChatGLM2-6B """Inference-only ChatGLM model compatible with THUDM weights.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -353,15 +352,15 @@ def forward( return hidden_states - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("linear_proj.merged_proj", "linear_proj.gate_proj", 0), ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -445,7 +444,7 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index c1226d12ec2a9..dc3aa9cbe86b7 100644 --- a/vllm/model_executor/models/clip.py +++ 
b/vllm/model_executor/models/clip.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -372,8 +371,8 @@ def device(self): # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -381,7 +380,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 3eb16762bc8fd..b0cb4a62333a4 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -21,8 +21,7 @@ # This file is based on the LLama model definition file in transformers """PyTorch Cohere model.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch import torch.utils.checkpoint @@ -255,7 +254,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states, residual = self.input_layernorm(hidden_states, residual) @@ -409,8 +408,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -420,7 +419,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 492768b2bac3c..7830dd4ce2ec3 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -422,14 +421,14 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: expert_params_mapping = [( "w13" if weight_name in ["w1", "v1"] else "w2", f"mlp.{weight_name}", ) for weight_name in ["w1", "v1", "w2"]] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() 
+ loaded_params: Set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index b008905cd5c91..b239b642f752b 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -24,7 +24,7 @@ # limitations under the License. """Inference-only DeciLM model compatible with HuggingFace weights.""" -from collections.abc import Iterable +from typing import Iterable, Set, Tuple import torch @@ -59,8 +59,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): delattr(config, "num_key_value_heads_per_layer") super().__init__(vllm_config=vllm_config) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -70,7 +70,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 2eaa8e39916fc..c04e7a02bae23 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -22,8 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Deepseek model.""" -from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -185,7 +184,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -439,8 +438,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -451,7 +450,7 @@ def load_weights(self, weights: Iterable[tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 74cedd0031973..cac1b2b3b11cc 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, Optional, Set, Tuple import torch import torch.nn as nn @@ -184,8 +183,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ 
("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), @@ -198,7 +197,7 @@ def load_weights(self, weights: Iterable[tuple[str, num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 3c23a5f9f84ab..6ff3ef129a74b 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -22,8 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only DeepseekV2/DeepseekV3 model.""" -from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -188,7 +187,7 @@ def __init__( q_lora_rank: int, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -340,7 +339,7 @@ def __init__( q_lora_rank: Optional[int], kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -693,8 +692,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), @@ -710,7 +709,7 @@ def load_weights(self, weights: Iterable[tuple[str, num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 8550ad82640bb..ea217e2444040 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -3,9 +3,9 @@ # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" import math -from collections.abc import Iterable, Mapping from functools import cached_property -from typing import Literal, Optional, TypedDict, Union +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, + TypedDict, Union) import torch import torch.nn as nn @@ -50,7 +50,7 @@ class DeepseekVL2ImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, list[torch.Tensor]] + data: Union[torch.Tensor, List[torch.Tensor]] """ Shape: `(batch_size * num_images, num_channels, height, width)` """ @@ -62,7 +62,7 @@ class DeepseekVL2ImagePixelInputs(TypedDict): class DeepseekVL2VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, list[torch.Tensor]] + data: Union[torch.Tensor, List[torch.Tensor]] 
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. @@ -416,8 +416,8 @@ def sampler(self): return get_sampler() def _validate_pixel_values( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: h = w = self.vision_config.image_size expected_dims = (3, h, w) @@ -437,8 +437,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_images_spatial_crop( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: expected_dims = 2 def _validate_shape(d: torch.Tensor): @@ -665,8 +665,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) autoloaded_weights = loader.load_weights(weights, diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 31821ba36d403..f2a2935e6c694 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, Optional, Tuple import torch import torch.nn as nn @@ -166,7 +165,7 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B # due to missing lm_head weights and its config being that of a # Llama model. Here's a compatible version with the same weights: diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 41b6208116e2c..79939f6f40e4e 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -24,8 +24,7 @@ # limitations under the License. 
"""Inference-only Exaone model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -104,7 +103,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -198,7 +197,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -284,7 +283,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -481,8 +480,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -492,7 +491,7 @@ def load_weights(self, weights: Iterable[tuple[str, (".gate_up_proj", ".c_fc_1", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py index 00dbbebb120e8..310aca999bc2d 100644 --- a/vllm/model_executor/models/fairseq2_llama.py +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -16,7 +16,7 @@ # limitations under the License. 
"""Llama model for fairseq2 weights.""" -from collections.abc import Iterable +from typing import Iterable, Set, Tuple import torch from torch.nn import Parameter @@ -44,8 +44,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): f"model.{self.tp_rank}.pt", ] - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: # fairseq2's serialization adds a wrapper to usual .pt state_dict's: # { "model_key": my_model_name, "my_model_name": state_dict } # which we first need to unpack @@ -102,7 +102,7 @@ def reshape_fairseq2_weights( name: str, loaded_weight: torch.Tensor, params: dict[str, Parameter], - ) -> tuple[str, torch.Tensor]: + ) -> Tuple[str, torch.Tensor]: """Reshape fairseq2's weights.""" def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor: diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 21a1038adc83c..7154ac2e6a5af 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -20,8 +20,7 @@ """PyTorch Falcon model.""" import math -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -461,8 +460,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: total_num_heads = self.config.num_attention_heads if self.config.new_decoder_architecture: total_num_kv_heads = self.config.num_kv_heads @@ -472,7 +471,7 @@ def load_weights(self, weights: Iterable[tuple[str, total_num_kv_heads = total_num_heads num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if name == "lm_head.weight" and self.tie_word_embeddings: # Falcon uses tied embeddings except Falcon-11b. 
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 92fb39c74ffe4..b71d0de8d707d 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import math -from collections import OrderedDict -from collections.abc import Iterable, Mapping from functools import cached_property -from typing import Literal, Optional, TypedDict, Union +from typing import (Iterable, List, Literal, Mapping, Optional, OrderedDict, + Set, Tuple, TypedDict, Union) import torch import torch.nn as nn @@ -720,8 +719,8 @@ def sample(self, logits: torch.Tensor, next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -730,7 +729,7 @@ def load_weights(self, weights: Iterable[tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -935,8 +934,8 @@ def sampler(self): return get_sampler() def _validate_pixel_values( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: size = self.processor_config["size"] h, w = size["height"], size["width"] @@ -957,12 +956,12 @@ def _validate_shape(d: torch.Tensor): return data def _parse_and_validate_image_input(self, **kwargs: object): - pixel_values: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], + pixel_values: Optional[Union[List[List[torch.Tensor]], + List[torch.Tensor], torch.Tensor]] = kwargs.pop( "pixel_values", None) - image_embeds: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], + image_embeds: Optional[Union[List[List[torch.Tensor]], + List[torch.Tensor], torch.Tensor]] = kwargs.pop( "image_embeds", None) @@ -1112,7 +1111,7 @@ def sample( ) -> SamplerOutput: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index f68c73b624579..7e4cc6bac5e61 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -17,8 +17,8 @@ # limitations under the License. """ PyTorch Fuyu model.""" import math -from collections.abc import Iterable, Mapping -from typing import Literal, Optional, TypedDict +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, + TypedDict) import torch import torch.nn as nn @@ -58,7 +58,7 @@ class FuyuImagePatchInputs(TypedDict): `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)` """ - patches_per_image: list[int] + patches_per_image: List[int] """ List of number of total patches for each image in the batch. This is used to restore the first two dimensions of `flat_data`. 
@@ -390,7 +390,7 @@ def sample( next_tokens = self.language_model.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 4af0a7e4c13f7..da17646c540fd 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -15,9 +15,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Gemma model compatible with HuggingFace weights.""" -from collections.abc import Iterable from functools import cache -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -233,7 +232,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -384,8 +383,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -395,7 +394,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 4c359dddfee80..cf744fc2b9d12 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -15,8 +15,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -220,7 +219,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -307,8 +306,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -318,7 +317,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -429,8 +428,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 4b95b044f12b5..48543c5642ea4 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -4,8 +4,7 @@ # https://github.com/THUDM/CogAgent """Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace -from collections.abc import Mapping -from typing import Literal, Optional, TypedDict, Union +from typing import Literal, Mapping, Optional, TypedDict, Union import torch from torch import nn diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 9c88675031b73..776c03f652bdc 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -18,8 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-2 model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -291,10 +290,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if ".attn.bias" in name or ".attn.masked_bias" in name: # Skip attention mask. diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 7bb67a6fc94b3..43f3d4f6dc9cc 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -19,8 +19,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GPTBigCode model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -314,10 +313,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: continue diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 1fb91b17fce46..752aec0b223dd 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -17,8 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-J model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -284,8 +283,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -295,7 +294,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "attn.bias" in name or "attn.masked_bias" in name: continue diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 79780a8a02aa9..4b30c7bb30359 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -17,8 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-NeoX model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -296,10 +295,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if ("attention.bias" in name or "attention.masked_bias" in name or "rotary_emb.inv_freq" in name): diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 6f892fc5c4eee..201e15d3a30f8 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -22,8 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only IBM Granite model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -99,7 +98,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -232,7 +231,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -428,8 +427,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -439,7 +438,7 @@ def load_weights(self, weights: Iterable[tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index a978bdc142549..9b56874a8add8 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -22,8 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GraniteMoe model.""" -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, Optional, Set, Tuple import torch from torch import nn @@ -403,8 +402,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: new_weights = {} for n, p in weights: if n.endswith('.block_sparse_moe.input_linear.weight'): diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 846569ac0a284..f2e82017f6530 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -21,8 +21,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Grok1 model.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch import torch.nn.functional as F @@ -265,7 +264,7 @@ def forward( kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -340,7 +339,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], inputs_embeds: Optional[torch.Tensor] = None, @@ -432,7 +431,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -459,8 +458,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -478,7 +477,7 @@ def load_weights(self, weights: Iterable[tuple[str, num_experts=num_experts) params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 7fb0c6cc130ec..bab9c256b9aa0 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -7,8 +7,7 @@ # Copyright (c) 2024 H2O.AI # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -from collections.abc import Mapping -from typing import Optional +from typing import Mapping, Optional import torch from PIL import Image diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index e26c7d2b816a5..f9c2175b29881 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -17,8 +17,7 @@ # limitations under the License. 
"""PyTorch Idefics2 model.""" -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, Optional, Set, Tuple import torch from torch import nn @@ -319,8 +318,8 @@ def forward( last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -328,7 +327,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index a22211c40abbb..0a8763cf910ca 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -16,8 +16,8 @@ """Inference-only Idefics3 model compatible with HuggingFace weights.""" import math -from collections.abc import Iterable, Mapping -from typing import Literal, Optional, TypedDict, Union +from typing import (Dict, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) import torch import torch.utils.checkpoint @@ -84,7 +84,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): def get_hf_processor( self, *, - size: Optional[dict[str, int]] = None, + size: Optional[Dict[str, int]] = None, **kwargs: object, ) -> Idefics3Processor: if size is not None: @@ -426,8 +426,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.image_token_id = self.config.image_token_id def _validate_pixel_values( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -676,8 +676,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index b7fee00694659..47bd05f140c81 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, - Union, overload, runtime_checkable) +from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, + Protocol, Type, Union, overload, runtime_checkable) import torch from typing_extensions import TypeIs, TypeVar @@ -88,7 +88,7 @@ class _SupportsMultiModalType(Protocol): @overload def supports_multimodal( - model: type[object]) -> TypeIs[type[SupportsMultiModal]]: + model: Type[object]) -> TypeIs[Type[SupportsMultiModal]]: ... 
@@ -98,8 +98,8 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: def supports_multimodal( - model: Union[type[object], object], -) -> Union[TypeIs[type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]: + model: Union[Type[object], object], +) -> Union[TypeIs[Type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]: if isinstance(model, type): return isinstance(model, _SupportsMultiModalType) @@ -120,9 +120,9 @@ class SupportsLoRA(Protocol): """ # The `embedding_module` and `embedding_padding_modules` # are empty by default. - embedding_modules: ClassVar[dict[str, str]] = {} - embedding_padding_modules: ClassVar[list[str]] = [] - packed_modules_mapping: ClassVar[dict[str, list[str]]] = {} + embedding_modules: ClassVar[Dict[str, str]] = {} + embedding_padding_modules: ClassVar[List[str]] = [] + packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {} # We can't use runtime_checkable with ClassVar for issubclass checks @@ -131,13 +131,13 @@ class SupportsLoRA(Protocol): class _SupportsLoRAType(Protocol): supports_lora: Literal[True] - packed_modules_mapping: dict[str, list[str]] - embedding_modules: dict[str, str] - embedding_padding_modules: list[str] + packed_modules_mapping: Dict[str, List[str]] + embedding_modules: Dict[str, str] + embedding_padding_modules: List[str] @overload -def supports_lora(model: type[object]) -> TypeIs[type[SupportsLoRA]]: +def supports_lora(model: Type[object]) -> TypeIs[Type[SupportsLoRA]]: ... @@ -147,8 +147,8 @@ def supports_lora(model: object) -> TypeIs[SupportsLoRA]: def supports_lora( - model: Union[type[object], object], -) -> Union[TypeIs[type[SupportsLoRA]], TypeIs[SupportsLoRA]]: + model: Union[Type[object], object], +) -> Union[TypeIs[Type[SupportsLoRA]], TypeIs[SupportsLoRA]]: result = _supports_lora(model) if not result: @@ -177,7 +177,7 @@ def supports_lora( return result -def _supports_lora(model: Union[type[object], object]) -> bool: +def _supports_lora(model: Union[Type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsLoRAType) @@ -242,7 +242,7 @@ def forward( @overload -def supports_pp(model: type[object]) -> TypeIs[type[SupportsPP]]: +def supports_pp(model: Type[object]) -> TypeIs[Type[SupportsPP]]: ... @@ -252,8 +252,8 @@ def supports_pp(model: object) -> TypeIs[SupportsPP]: def supports_pp( - model: Union[type[object], object], -) -> Union[bool, TypeIs[type[SupportsPP]], TypeIs[SupportsPP]]: + model: Union[Type[object], object], +) -> Union[bool, TypeIs[Type[SupportsPP]], TypeIs[SupportsPP]]: supports_attributes = _supports_pp_attributes(model) supports_inspect = _supports_pp_inspect(model) @@ -284,14 +284,14 @@ def supports_pp( return supports_attributes and supports_inspect -def _supports_pp_attributes(model: Union[type[object], object]) -> bool: +def _supports_pp_attributes(model: Union[Type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsPPType) return isinstance(model, SupportsPP) -def _supports_pp_inspect(model: Union[type[object], object]) -> bool: +def _supports_pp_inspect(model: Union[Type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): return False @@ -322,13 +322,13 @@ def has_inner_state(model: object) -> TypeIs[HasInnerState]: @overload -def has_inner_state(model: type[object]) -> TypeIs[type[HasInnerState]]: +def has_inner_state(model: Type[object]) -> TypeIs[Type[HasInnerState]]: ... 
def has_inner_state( - model: Union[type[object], object] -) -> Union[TypeIs[type[HasInnerState]], TypeIs[HasInnerState]]: + model: Union[Type[object], object] +) -> Union[TypeIs[Type[HasInnerState]], TypeIs[HasInnerState]]: if isinstance(model, type): return isinstance(model, _HasInnerStateType) @@ -359,13 +359,13 @@ def is_attention_free(model: object) -> TypeIs[IsAttentionFree]: @overload -def is_attention_free(model: type[object]) -> TypeIs[type[IsAttentionFree]]: +def is_attention_free(model: Type[object]) -> TypeIs[Type[IsAttentionFree]]: ... def is_attention_free( - model: Union[type[object], object] -) -> Union[TypeIs[type[IsAttentionFree]], TypeIs[IsAttentionFree]]: + model: Union[Type[object], object] +) -> Union[TypeIs[Type[IsAttentionFree]], TypeIs[IsAttentionFree]]: if isinstance(model, type): return isinstance(model, _IsAttentionFreeType) @@ -396,13 +396,13 @@ def is_hybrid(model: object) -> TypeIs[IsHybrid]: @overload -def is_hybrid(model: type[object]) -> TypeIs[type[IsHybrid]]: +def is_hybrid(model: Type[object]) -> TypeIs[Type[IsHybrid]]: ... def is_hybrid( - model: Union[type[object], object] -) -> Union[TypeIs[type[IsHybrid]], TypeIs[IsHybrid]]: + model: Union[Type[object], object] +) -> Union[TypeIs[Type[IsHybrid]], TypeIs[IsHybrid]]: if isinstance(model, type): return isinstance(model, _IsHybridType) @@ -418,7 +418,7 @@ class SupportsCrossEncoding(Protocol): @overload def supports_cross_encoding( - model: type[object]) -> TypeIs[type[SupportsCrossEncoding]]: + model: Type[object]) -> TypeIs[Type[SupportsCrossEncoding]]: ... @@ -428,8 +428,8 @@ def supports_cross_encoding(model: object) -> TypeIs[SupportsCrossEncoding]: def _supports_cross_encoding( - model: Union[type[object], object], -) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: + model: Union[Type[object], object], +) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: if isinstance(model, type): return isinstance(model, SupportsCrossEncoding) @@ -438,15 +438,15 @@ def _supports_cross_encoding( def supports_cross_encoding( - model: Union[type[object], object], -) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: + model: Union[Type[object], object], +) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: return is_pooling_model(model) and _supports_cross_encoding(model) class SupportsQuant: """The interface required for all models that support quantization.""" - packed_modules_mapping: ClassVar[dict[str, list[str]]] = {} + packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {} quant_config: Optional[QuantizationConfig] = None def __new__(cls, *args, **kwargs) -> "SupportsQuant": @@ -482,7 +482,7 @@ class SupportsTranscription(Protocol): @overload def supports_transcription( - model: type[object]) -> TypeIs[type[SupportsTranscription]]: + model: Type[object]) -> TypeIs[Type[SupportsTranscription]]: ... 
@@ -492,8 +492,8 @@ def supports_transcription(model: object) -> TypeIs[SupportsTranscription]: def supports_transcription( - model: Union[type[object], object], -) -> Union[TypeIs[type[SupportsTranscription]], TypeIs[SupportsTranscription]]: + model: Union[Type[object], object], +) -> Union[TypeIs[Type[SupportsTranscription]], TypeIs[SupportsTranscription]]: if isinstance(model, type): return isinstance(model, SupportsTranscription) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 55e31803903bc..22c9287509ed7 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import (TYPE_CHECKING, Optional, Protocol, Union, overload, +from typing import (TYPE_CHECKING, Optional, Protocol, Type, Union, overload, runtime_checkable) import torch @@ -21,7 +21,7 @@ # The type of hidden states # Currently, T = torch.Tensor for all models except for Medusa -# which has T = list[torch.Tensor] +# which has T = List[torch.Tensor] T = TypeVar("T", default=torch.Tensor) T_co = TypeVar("T_co", default=torch.Tensor, covariant=True) @@ -49,12 +49,12 @@ def forward( ... -def _check_vllm_model_init(model: Union[type[object], object]) -> bool: +def _check_vllm_model_init(model: Union[Type[object], object]) -> bool: model_init = model.__init__ return supports_kw(model_init, "vllm_config") -def _check_vllm_model_forward(model: Union[type[object], object]) -> bool: +def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): return False @@ -76,7 +76,7 @@ def _check_vllm_model_forward(model: Union[type[object], object]) -> bool: @overload -def is_vllm_model(model: type[object]) -> TypeIs[type[VllmModel]]: +def is_vllm_model(model: Type[object]) -> TypeIs[Type[VllmModel]]: ... @@ -86,8 +86,8 @@ def is_vllm_model(model: object) -> TypeIs[VllmModel]: def is_vllm_model( - model: Union[type[object], object], -) -> Union[TypeIs[type[VllmModel]], TypeIs[VllmModel]]: + model: Union[Type[object], object], +) -> Union[TypeIs[Type[VllmModel]], TypeIs[VllmModel]]: return _check_vllm_model_init(model) and _check_vllm_model_forward(model) @@ -114,7 +114,7 @@ def sample( @overload def is_text_generation_model( - model: type[object]) -> TypeIs[type[VllmModelForTextGeneration]]: + model: Type[object]) -> TypeIs[Type[VllmModelForTextGeneration]]: ... @@ -125,8 +125,8 @@ def is_text_generation_model( def is_text_generation_model( - model: Union[type[object], object], -) -> Union[TypeIs[type[VllmModelForTextGeneration]], + model: Union[Type[object], object], +) -> Union[TypeIs[Type[VllmModelForTextGeneration]], TypeIs[VllmModelForTextGeneration]]: if not is_vllm_model(model): return False @@ -151,7 +151,7 @@ def pooler( @overload -def is_pooling_model(model: type[object]) -> TypeIs[type[VllmModelForPooling]]: +def is_pooling_model(model: Type[object]) -> TypeIs[Type[VllmModelForPooling]]: ... 
@@ -161,8 +161,8 @@ def is_pooling_model(model: object) -> TypeIs[VllmModelForPooling]: def is_pooling_model( - model: Union[type[object], object], -) -> Union[TypeIs[type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]: + model: Union[Type[object], object], +) -> Union[TypeIs[Type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]: if not is_vllm_model(model): return False diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index bb467f40118ef..0499f339b2465 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -6,9 +6,8 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- -from collections.abc import Iterable from functools import partial -from typing import Optional +from typing import Iterable, Optional, Set, Tuple import torch import torch.nn as nn @@ -464,10 +463,10 @@ def forward( return encoder_outputs - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index f31cc2d9ec842..41ca399b9efbc 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Iterable from functools import partial -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, Optional, Set, Tuple, Type, Union import torch from torch import nn @@ -83,7 +82,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -227,7 +226,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -254,7 +253,7 @@ def __init__( *, vllm_config: VllmConfig, prefix: str = "", - layer_type: type[InternLMDecoderLayer] = InternLMDecoderLayer): + layer_type: Type[InternLMDecoderLayer] = InternLMDecoderLayer): super().__init__() config = vllm_config.model_config.hf_config @@ -319,7 +318,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", - model_type: type[InternLM2Model] = InternLM2Model): + model_type: Type[InternLM2Model] = InternLM2Model): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config @@ -373,15 +372,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w1", 0), ("gate_up_proj", "w3", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + 
loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -419,7 +418,7 @@ def __init__( *, vllm_config: VllmConfig, prefix: str = "", - model_type: type[InternLM2Model] = InternLM2Model, + model_type: Type[InternLM2Model] = InternLM2Model, ): super().__init__(vllm_config=vllm_config, prefix=prefix, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 6893d0239121d..69b0caab8f8ec 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Union +from typing import Optional, Tuple, Union import torch from torch import nn @@ -66,7 +66,7 @@ def forward( hidden_states: torch.Tensor, residual: Optional[torch.Tensor], visual_token_mask: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index a47265afe71f2..52ddb279cca39 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -7,9 +7,9 @@ # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from abc import ABC, abstractmethod -from collections.abc import Iterable, Mapping from functools import cached_property -from typing import Literal, Optional, TypedDict, TypeVar, Union +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, + TypedDict, TypeVar, Union) import torch import torch.nn as nn @@ -55,7 +55,7 @@ class InternVLImagePixelInputs(TypedDict): Shape: `(batch_size * num_images * (1 + num_patches), num_channels, height, width)` """ - patches_per_image: list[int] + patches_per_image: List[int] """ List of number of total patches for each image in the batch. 
""" @@ -976,7 +976,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 4ccc5c9605eba..78fe6588eddce 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -21,8 +21,7 @@ """Inference-only Jais model compatible with HuggingFace weights.""" import math -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -344,10 +343,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: # GPT-2 ties the weights of the embedding layer and the final diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 707e8a3571e0c..14e56df6cadf8 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Jamba model.""" -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, Optional, Set, Tuple import torch from torch import nn @@ -36,7 +35,7 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor] class JambaMoE(nn.Module): @@ -438,7 +437,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> tuple[tuple[int, int], tuple[int, int]]: + self) -> Tuple[Tuple[int, int], Tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size conv_state_shape = ( @@ -468,8 +467,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -486,7 +485,7 @@ def load_weights(self, weights: Iterable[tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -587,7 +586,7 @@ def pooler( logits = self.score(hidden_states) return self._pooler(logits, pooling_metadata) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # TODO: The reward weights themselves have float32 accuracy data, we # would like to load them in fp32 to get that extra precision. 
super().load_weights(weights) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 9fe844ec36b12..a0aff9e609d9e 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -22,8 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only LLaMA model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, Optional, Set, Tuple, Type, Union import torch from torch import nn @@ -102,7 +101,7 @@ def __init__(self, num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -268,7 +267,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -293,7 +292,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", - layer_type: type[LlamaDecoderLayer] = LlamaDecoderLayer): + layer_type: Type[LlamaDecoderLayer] = LlamaDecoderLayer): super().__init__() config = vllm_config.model_config.hf_config @@ -368,8 +367,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -379,7 +378,7 @@ def load_weights(self, weights: Iterable[tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -546,8 +545,8 @@ def sample(self, logits: torch.Tensor, next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -563,7 +562,7 @@ def maybe_remap_mistral( self, name: str, loaded_weight: torch.Tensor, - ) -> tuple[str, torch.Tensor]: + ) -> Tuple[str, torch.Tensor]: def permute(w: torch.Tensor, n_heads: int): attn_in = self.config.head_dim * n_heads diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 00cad32caed6b..72b1591306f26 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 from abc import abstractmethod -from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, - Union) +from typing import (Final, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, TypedDict, TypeVar, Union) import torch import torch.nn as nn @@ -48,7 +47,7 @@ class LlavaImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, list[torch.Tensor]] + data: Union[torch.Tensor, List[torch.Tensor]] 
""" Shape: `(batch_size * num_images, num_channels, height, width)` @@ -730,8 +729,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 052f07afd83ba..6a050d7798a20 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 from abc import abstractmethod -from collections.abc import Iterable, Mapping from functools import cached_property -from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, - Union) +from typing import (Final, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, TypedDict, TypeVar, Union) import torch import torch.nn as nn @@ -33,7 +32,7 @@ class LlavaNextImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, list[torch.Tensor]] + data: Union[torch.Tensor, List[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -276,8 +275,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_pixel_values( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -459,7 +458,7 @@ def _process_image_pixels( def _process_image_input( self, image_input: LlavaNextImageInputs, - ) -> Union[torch.Tensor, list[torch.Tensor]]: + ) -> Union[torch.Tensor, List[torch.Tensor]]: if image_input["type"] == "image_embeds": return [image_input["data"]] @@ -588,7 +587,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 731c9ba09c883..807d6977ed409 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import math -from collections.abc import Iterable, Mapping from functools import cached_property -from typing import Literal, Optional, TypedDict, Union +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, + TypedDict, Union) import torch import torch.nn as nn @@ -36,7 +36,7 @@ class LlavaNextVideoPixelInputs(TypedDict): type: Literal["pixel_values_videos"] - data: Union[torch.Tensor, list[torch.Tensor]] + data: Union[torch.Tensor, List[torch.Tensor]] """ Shape: `(batch_size, num_frames, num_channels, height, width)` @@ -320,8 +320,8 @@ def sampler(self): return get_sampler() def _validate_video_pixel_values( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: h = w = 
self.config.vision_config.image_size expected_dims = (3, h, w) @@ -346,7 +346,7 @@ def _parse_and_validate_video_input( A legal video input should have the following dimensions: { "pixel_values_videos" : - list[b, Tensor(nb_frames, nb_channels, height, width)] + List[b, Tensor(nb_frames, nb_channels, height, width)] } """ pixel_values = kwargs.pop("pixel_values_videos", None) @@ -485,8 +485,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, # This model doesn't support images for now diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index faeeae7d42a6c..e57eea4286e94 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import math -from collections.abc import Iterable, Mapping from functools import cached_property -from typing import Final, Literal, Optional, Protocol, TypedDict, Union +from typing import (Final, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, TypedDict, Union) import torch import torch.nn as nn @@ -42,7 +42,7 @@ class LlavaOnevisionVideoPixelInputs(TypedDict): type: Literal["pixel_values_videos"] - data: Union[torch.Tensor, list[torch.Tensor]] + data: Union[torch.Tensor, List[torch.Tensor]] """ Shape: `(batch_size, num_videos, num_frames, num_channels, height, width)` @@ -54,7 +54,7 @@ class LlavaOnevisionVideoPixelInputs(TypedDict): class LlavaOnevisionImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, list[torch.Tensor]] + data: Union[torch.Tensor, List[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -480,8 +480,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_image_pixel_values( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -539,8 +539,8 @@ def _parse_and_validate_image_input( raise AssertionError("This line should be unreachable.") def _validate_video_pixel_values( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -566,7 +566,7 @@ def _parse_and_validate_video_input( A legal video input should have the following dimensions: { "pixel_values_videos" : - list[b, Tensor(nb_frames, nb_channels, height, width)] + List[b, Tensor(nb_frames, nb_channels, height, width)] } """ pixel_values = kwargs.pop("pixel_values_videos", None) @@ -719,7 +719,7 @@ def _merge_image_patch_embeddings(self, def _process_image_pixels( self, inputs: LlavaOnevisionImagePixelInputs, - ) -> Union[torch.Tensor, list[torch.Tensor]]: + ) -> Union[torch.Tensor, List[torch.Tensor]]: assert self.vision_tower is not None pixel_values = inputs["data"] @@ -748,7 +748,7 @@ def _process_image_pixels( def _process_image_input( self, image_input: LlavaOnevisionImageInputs, - ) -> Union[torch.Tensor, 
list[torch.Tensor]]: + ) -> Union[torch.Tensor, List[torch.Tensor]]: if image_input["type"] == "image_embeds": return [image_input["data"]] @@ -972,7 +972,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 28a770abec6ae..9f1cd8c29a5a0 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """PyTorch MAMBA model.""" -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, Optional, Set, Tuple import torch from torch import nn @@ -31,7 +30,7 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor] class MambaDecoderLayer(nn.Module): @@ -229,7 +228,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> tuple[tuple[int, int], tuple[int, int]]: + self) -> Tuple[Tuple[int, int], Tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() conv_state_shape = ( self.config.intermediate_size // world_size, @@ -255,10 +254,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index fd37c45f6b872..266cdc243ac44 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """PyTorch MAMBA2 model.""" -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, Optional, Set, Tuple import torch from torch import nn @@ -34,7 +33,7 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor] class Mamba2DecoderLayer(nn.Module): @@ -249,7 +248,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> tuple[tuple[int, int], tuple[int, int]]: + self) -> Tuple[Tuple[int, int], Tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() conv_state_shape, temporal_state_shape = None, None @@ -295,10 +294,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = 
name.replace("A_log", "A") diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py index ae1fc937b989c..d529833093cea 100644 --- a/vllm/model_executor/models/mamba_cache.py +++ b/vllm/model_executor/models/mamba_cache.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass +from typing import Dict, List, Tuple import torch @@ -23,8 +24,8 @@ def at_layer_idx(self, layer_idx): class MambaCacheManager: def __init__(self, vllm_config: VllmConfig, dtype: torch.dtype, - num_mamba_layers: int, conv_state_shape: tuple[int, int], - temporal_state_shape: tuple[int, int]): + num_mamba_layers: int, conv_state_shape: Tuple[int, int], + temporal_state_shape: Tuple[int, int]): # Determine max batch size to set size of MambaCache max_batch_size = vllm_config.scheduler_config.max_num_seqs @@ -44,7 +45,7 @@ def __init__(self, vllm_config: VllmConfig, dtype: torch.dtype, # Maps between the request id and a dict that maps between the seq_id # and its index inside the self.mamba_cache - self.mamba_cache_indices_mapping: dict[str, dict[int, int]] = {} + self.mamba_cache_indices_mapping: Dict[str, Dict[int, int]] = {} self.free_cache_indices = list(range(max_batch_size)) def current_run_tensors(self, **kwargs) -> MambaCacheParams: @@ -146,8 +147,8 @@ def _assign_seq_id_to_cache_index(self, cur_rid: str, seq_id: int, return self.mamba_cache_indices_mapping[cur_rid][seq_id] def _prepare_current_run_mamba_cache( - self, request_ids_to_seq_ids: dict[str, list[int]], - finished_requests_ids: list[str]) -> list[int]: + self, request_ids_to_seq_ids: Dict[str, list[int]], + finished_requests_ids: List[str]) -> List[int]: return [ self._assign_seq_id_to_cache_index(req_id, seq_id, finished_requests_ids) @@ -156,7 +157,7 @@ def _prepare_current_run_mamba_cache( ] def _release_finished_requests(self, - finished_seq_groups_req_ids: list[str]): + finished_seq_groups_req_ids: List[str]): for req_id in finished_seq_groups_req_ids: if req_id in self.mamba_cache_indices_mapping: for seq_id in self.mamba_cache_indices_mapping[req_id]: diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index ac0b281f359c3..a19d7da5654b6 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, List, Optional, Set, Tuple import torch import torch.nn as nn @@ -97,13 +96,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: # checkpoint file has token_map tensor. 
self.token_map = None - def forward(self, hidden_states: torch.Tensor) -> list[torch.Tensor]: + def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]: return [block(hidden_states) for block in self.blocks] def compute_logits( - self, hidden_states: list[torch.Tensor], - sampling_metadata: SamplingMetadata) -> list[torch.Tensor]: - logits_lst: list[torch.Tensor] = [] + self, hidden_states: List[torch.Tensor], + sampling_metadata: SamplingMetadata) -> List[torch.Tensor]: + logits_lst: List[torch.Tensor] = [] for hs, lm_head in zip(hidden_states, self.lm_heads): _logits = self.logits_processor(lm_head, hs, sampling_metadata) @@ -128,9 +127,9 @@ def compute_logits( def sample( self, - logits: list[torch.Tensor], + logits: List[torch.Tensor], sampling_metadata: SamplingMetadata, - ) -> list[SamplerOutput]: + ) -> List[SamplerOutput]: logits = torch.stack(logits, dim=0).float() logprobs = torch.log_softmax(logits, dim=-1) token_ids = logits.argmax(-1) # support only top-1 for now @@ -145,7 +144,7 @@ def sample( token_prob_list.append(probs[:, seq_group.sample_indices]) token_logprob_list.append(logprobs[:, seq_group.sample_indices]) - outputs: list[Optional[SamplerOutput]] = [] + outputs: List[Optional[SamplerOutput]] = [] for idx in range(len(sampling_metadata.seq_groups)): outputs.append( SamplerOutput( @@ -161,7 +160,7 @@ def generate_proposals( self, previous_hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> list[SamplerOutput]: + ) -> List[SamplerOutput]: return self.sample( logits=self.compute_logits( hidden_states=self.forward(previous_hidden_states), @@ -170,10 +169,10 @@ def generate_proposals( sampling_metadata=sampling_metadata, ) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() weights_map = {} diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 938b1e40899d8..34e1f3927a9af 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -23,8 +23,7 @@ # limitations under the License. 
"""Inference-only MiniCPM model compatible with HuggingFace weights.""" import math -from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -192,7 +191,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -331,7 +330,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -431,8 +430,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -449,7 +448,7 @@ def load_weights(self, weights: Iterable[tuple[str, for weight_name in ["w1", "w2", "w3"] ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -594,8 +593,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 2a6867d12d993..1b24c38cef1b0 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -23,7 +23,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only MiniCPM3 model compatible with HuggingFace weights.""" -from typing import Any, Optional +from typing import Any, Dict, Optional import torch from torch import nn @@ -58,7 +58,7 @@ def __init__( q_lora_rank: int, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index 981ed7ab95c0e..e354e5323327f 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -22,9 +22,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" -from collections.abc import Iterable, Mapping from functools import partial -from typing import Any, Callable, Literal, Optional, TypedDict, Union +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Set, Tuple, TypedDict, Union) import torch from torch import nn @@ -80,7 +80,7 @@ class MiniCPMOAudioFeatureInputs(TypedDict): class MiniCPMOAudioEmbeddingInputs(TypedDict): type: Literal["audio_embeds"] - data: list[torch.Tensor] + data: List[torch.Tensor] """ Shape: `(batch_size * num_images * num_slices, hidden_size)` @@ -152,7 +152,7 @@ def _parse_audio_data( class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo): audio_pattern = "()" - def get_supported_mm_modalities(self) -> list[str]: + def get_supported_mm_modalities(self) -> List[str]: return ["image", "video", "audio"] def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: @@ -261,7 +261,7 @@ def get_audio_prompt_texts(self, return self.info.get_hf_processor().get_audio_placeholder( audio_lens, chunk_input, chunk_length) - def get_special_tokens(self) -> dict[str, torch.Tensor]: + def get_special_tokens(self) -> Dict[str, torch.Tensor]: tokenizer = self.info.get_tokenizer() special_tokens = super().get_special_tokens() if hasattr(tokenizer, "audio_start_id"): @@ -272,7 +272,7 @@ def get_special_tokens(self) -> dict[str, torch.Tensor]: return special_tokens def process_audios(self, mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object]) -> dict[str, object]: + mm_kwargs: Mapping[str, object]) -> Dict[str, object]: audios = mm_data.pop("audios", []) audio_embeds = mm_data.pop("audio_embeds", []) if isinstance(audios, (list, torch.Tensor)) and len(audios) > 0: @@ -343,13 +343,13 @@ def get_modality_num_counter(self, modality: str) -> str: return "audio_lens" return super().get_modality_num_counter(modality) - def get_num_slices_by_modality(self, inputs: dict[str, object], + def get_num_slices_by_modality(self, inputs: Dict[str, object], modality: str, index: int) -> int: if modality == "audio": return inputs["audio"]["audio_num_segments"][index] return super().get_num_slices_by_modality(inputs, modality, index) - def get_prompt_texts_by_modality(self, inputs: dict[str, object], + def get_prompt_texts_by_modality(self, inputs: Dict[str, object], modality: str, index: int) -> str: if modality == "audio": return self.get_audio_prompt_texts( @@ -359,7 +359,7 @@ def get_prompt_texts_by_modality(self, inputs: dict[str, object], def _get_prompt_replacements( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs) -> list[PromptReplacement]: + out_mm_kwargs: MultiModalKwargs) -> List[PromptReplacement]: placeholder = { "image": self.info.image_pattern, "video": self.info.video_pattern, @@ -579,8 +579,8 @@ def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""): self.audio_encoder_layer = -1 return model - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["tts"]) return loader.load_weights(weights) @@ -742,7 +742,7 @@ def _get_audio_bounds(self, input_ids: torch.Tensor, def _parse_and_validate_audio_inputs( self, input_ids: torch.Tensor, - **kwargs: object) -> tuple[MiniCPMOAudioInputs]: + **kwargs: object) -> Tuple[MiniCPMOAudioInputs]: audio_features = 
kwargs.pop("audio_features", []) audio_feature_lens = kwargs.pop("audio_feature_lens", []) audio_embeds = kwargs.pop("audio_embeds", None) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 154da20d33e98..2699958331f3d 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -25,9 +25,9 @@ import math import re from collections import Counter -from collections.abc import Iterable, Mapping from functools import cached_property, partial -from typing import Any, Callable, Literal, Optional, TypedDict, Union +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Set, Tuple, TypedDict, Union) import numpy as np import torch @@ -72,7 +72,7 @@ class MiniCPMVImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: list[torch.Tensor] + data: List[torch.Tensor] """ Shape: `(batch_size * num_images * num_slices, num_channels, height, width)` @@ -128,7 +128,7 @@ def __init__(self, num_heads: int, kv_dim: Optional[int] = None, norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, - max_size: tuple[int, int] = (70, 70), + max_size: Tuple[int, int] = (70, 70), quant_config: Optional[QuantizationConfig] = None, prefix: str = "") -> None: super().__init__(num_queries, @@ -143,7 +143,7 @@ def __init__(self, self._set_2d_pos_cache(self.max_size) def _set_2d_pos_cache(self, - max_size: tuple[int, int], + max_size: Tuple[int, int], device: torch.types.Device = "cpu") -> None: pos_embed_arr = get_2d_sincos_pos_embed(self.embed_dim, max_size, @@ -213,7 +213,7 @@ def forward(self, x: torch.Tensor, return x -def get_version_by_config(config: PretrainedConfig) -> tuple[int, ...]: +def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]: version_float = getattr(config, "version", None) # The old configs do not include version number @@ -352,7 +352,7 @@ def get_image_processor(self): def get_model_version(self): return get_version_by_config(self.get_hf_config()) - def get_supported_mm_modalities(self) -> list[str]: + def get_supported_mm_modalities(self) -> List[str]: if self.get_model_version() == (2, 6): return ["image", "video"] else: @@ -394,7 +394,7 @@ def get_max_slice_num(self) -> int: return max_slice_num def get_sliced_grid(self, image_size: ImageSize, - max_slice_num: int) -> tuple[int, int]: + max_slice_num: int) -> Tuple[int, int]: if self.get_model_version() == (2, 6): slice_grid = self.get_image_processor().get_sliced_grid( image_size, max_slice_num) @@ -536,7 +536,7 @@ def get_video_prompt_texts(self, image_size: ImageSize, use_image_id=False) for image_idx in range(num_frames)) return prompt_texts - def get_special_tokens(self) -> dict[str, torch.Tensor]: + def get_special_tokens(self) -> Dict[str, torch.Tensor]: tokenizer = self.info.get_tokenizer() special_tokens = { "im_start_id": torch.tensor(tokenizer.im_start_id), @@ -556,7 +556,7 @@ def repack_processor_outputs(outputs: Any) -> BatchFeature: return outputs def process_images(self, mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object]) -> dict[str, object]: + mm_kwargs: Mapping[str, object]) -> Dict[str, object]: images = mm_data.pop("images", []) image_embeds = mm_data.pop("image_embeds", []) if isinstance(images, Image.Image): @@ -579,7 +579,7 @@ def process_images(self, mm_data: Mapping[str, object], return image_outputs def process_videos(self, mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object]) -> dict[str, object]: + mm_kwargs: Mapping[str, object]) -> Dict[str, 
object]: videos = mm_data.pop("videos", []) video_embeds = mm_data.pop("video_embeds", []) if len(videos) > 0 and isinstance(videos[0], Image.Image): @@ -639,7 +639,7 @@ def process_mm_inputs(self, mm_data, mm_kwargs) -> object: "video": self.process_videos(mm_data, mm_kwargs) } - def get_input_modalities(self, mm_data) -> list[str]: + def get_input_modalities(self, mm_data) -> List[str]: supported_mm_modalities = self.info.get_supported_mm_modalities() input_modalities = [] for modality in supported_mm_modalities: @@ -653,7 +653,7 @@ def get_modality_num_counter(self, modality: str) -> str: elif modality == "video": return "video_image_sizes" - def get_num_slices_by_modality(self, inputs: dict[str, object], + def get_num_slices_by_modality(self, inputs: Dict[str, object], modality: str, index: int) -> int: if modality == "image": return self.info.get_image_slice_nums( @@ -667,8 +667,8 @@ def get_num_slices_by_modality(self, inputs: dict[str, object], else: raise ValueError(f"Unexpected modality: {modality}") - def check_mm_inputs(self, inputs: dict[str, object], - matches: list[str]) -> None: + def check_mm_inputs(self, inputs: Dict[str, object], + matches: List[str]) -> None: counts = Counter(matches) for modality, count in counts.items(): if modality not in inputs or not inputs[modality]: @@ -680,7 +680,7 @@ def check_mm_inputs(self, inputs: dict[str, object], f"{modality} inputs while you pass " f"{len(inputs[modality][counter_key])}") - def get_prompt_texts_by_modality(self, inputs: dict[str, object], + def get_prompt_texts_by_modality(self, inputs: Dict[str, object], modality: str, index: int) -> str: if modality == "image": return self.get_image_prompt_texts( @@ -743,7 +743,7 @@ def _hf_processor_applies_repl( def _get_prompt_replacements( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs) -> list[PromptReplacement]: + out_mm_kwargs: MultiModalKwargs) -> List[PromptReplacement]: placeholder = { "image": self.info.image_pattern, "video": self.info.video_pattern, @@ -775,7 +775,7 @@ def _get_mm_fields_config( def apply( self, - prompt: Union[str, list[int]], + prompt: Union[str, List[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputs: @@ -851,7 +851,7 @@ def get_embedding_with_vision( self, input_ids: torch.Tensor, image_inputs: Optional[MiniCPMVImageInputs], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: vlm_embedding: torch.Tensor = self.llm.get_input_embeddings(input_ids) if image_inputs is None: # No image @@ -977,8 +977,8 @@ def _parse_and_validate_image_inputs( f"{len(modality_mm_data['pixel_values'])} vs. 
" f"{len(modality_mm_data['tgt_sizes'])}") - pixel_values_flat: list[torch.Tensor] = [] - tgt_sizes_flat: list[torch.Tensor] = [] + pixel_values_flat: List[torch.Tensor] = [] + tgt_sizes_flat: List[torch.Tensor] = [] for b in range(batch_size): mm_counts = {"image": 0, "video": 0} if self.version == (2, 6) \ else {"image": 0} @@ -1068,8 +1068,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) @@ -1105,7 +1105,7 @@ def init_resampler(self, def get_vision_embedding( self, - pixel_values: list[torch.Tensor], + pixel_values: List[torch.Tensor], patch_attn_mask: Optional[torch.Tensor] = None, tgt_sizes: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -1185,7 +1185,7 @@ def init_resampler(self, def get_vision_embedding( self, - pixel_values: list[torch.Tensor], + pixel_values: List[torch.Tensor], patch_attn_mask: Optional[torch.Tensor] = None, tgt_sizes: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -1268,7 +1268,7 @@ def init_resampler(self, def get_vision_embedding( self, - pixel_values: list[torch.Tensor], + pixel_values: List[torch.Tensor], patch_attn_mask: Optional[torch.Tensor] = None, tgt_sizes: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -1363,7 +1363,7 @@ def init_resampler(self, def get_vision_embedding( self, - pixel_values: list[torch.Tensor], + pixel_values: List[torch.Tensor], patch_attn_mask: Optional[torch.Tensor] = None, tgt_sizes: Optional[torch.Tensor] = None, ) -> torch.Tensor: diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index ef8c3e60357ad..c8dea557e5715 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -22,8 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Mixtral model.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -391,8 +390,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -409,7 +408,7 @@ def load_weights(self, weights: Iterable[tuple[str, num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index ac441cc79bac8..21b52d9f54c76 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -22,8 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Mixtral model.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import numpy as np import torch @@ -404,8 +403,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -414,7 +413,7 @@ def load_weights(self, weights: Iterable[tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 491abcfb1cbb5..459928fe3fb0e 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -15,8 +15,8 @@ # limitations under the License. """PyTorch Mllama model.""" import math -from collections.abc import Iterable, Mapping -from typing import Literal, Optional, TypedDict, Union +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, + TypedDict, Union) import numpy as np import torch @@ -318,8 +318,8 @@ def __init__( self, in_channels: int, out_channels: int, - kernel_size: Union[int, tuple[int, int]], - stride: Union[int, tuple[int, int]], + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]], bias: bool = False, ) -> None: super().__init__() @@ -551,7 +551,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - ) -> Union[tuple, BaseModelOutput]: + ) -> Union[Tuple, BaseModelOutput]: encoder_states = () for i, encoder_layer in enumerate(self.layers): @@ -824,7 +824,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor], - kv_range_for_decode: Optional[list[tuple[int, int]]], + kv_range_for_decode: Optional[List[Tuple[int, int]]], cross_attention_states: Optional[torch.Tensor], ) -> torch.Tensor: qkv_dec, _ = self.qkv_proj(hidden_states) @@ -860,7 +860,7 @@ def _attention_with_mask( k: torch.Tensor, v: torch.Tensor, attention_mask: torch.Tensor, - kv_range_for_decode: list[tuple[int, int]], + kv_range_for_decode: List[Tuple[int, int]], ) -> torch.Tensor: kv_cache = self.attn.kv_cache[self.pipeline_parallel_rank] attn_metadata: AttentionMetadata = get_forward_context().attn_metadata @@ -973,7 +973,7 @@ def forward( hidden_states: torch.Tensor, cross_attention_states: torch.Tensor, cross_attention_mask: torch.Tensor, - kv_range_for_decode: Optional[list[tuple[int, int]]], + kv_range_for_decode: Optional[List[Tuple[int, int]]], full_text_row_masked_out_mask: torch.Tensor, ) -> torch.Tensor: residual = hidden_states @@ -1044,8 +1044,8 @@ def forward( positions: Optional[torch.LongTensor], cross_attention_states: Optional[torch.LongTensor], cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[list[tuple[int, int]]], - full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, + kv_range_for_decode: Optional[List[Tuple[int, int]]], + full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]], skip_cross_attention: bool, ) -> torch.Tensor: @@ -1108,8 +1108,8 @@ def forward( positions: Optional[torch.LongTensor], cross_attention_states: Optional[torch.LongTensor], 
cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[list[tuple[int, int]]], - full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, + kv_range_for_decode: Optional[List[Tuple[int, int]]], + full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]], skip_cross_attention: bool, ) -> torch.Tensor: @@ -1188,26 +1188,26 @@ def sample( def _parse_and_validate_image_input(self, **kwargs: object): # tensor with the same shape will be batched together by # MultiModalKwargs.batch, so pixel_values here can be: - # - list[list[torch.Tensor]]: + # - List[List[torch.Tensor]]: # with shape (num_tiles, 3, image_res, image_res) - # - list[torch.Tensor]: + # - List[torch.Tensor]: # with shape (num_image, num_tiles, 3, image_res, image_res) # - torch.Tensor: # with shape (bs, num_image, num_tiles, 3, image_res, image_res) - pixel_values: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], + pixel_values: Optional[Union[List[List[torch.Tensor]], + List[torch.Tensor], torch.Tensor]] = kwargs.pop( "pixel_values", None) - image_embeds: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], + image_embeds: Optional[Union[List[List[torch.Tensor]], + List[torch.Tensor], torch.Tensor]] = kwargs.pop( "image_embeds", None) - aspect_ratio_ids: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], + aspect_ratio_ids: Optional[Union[List[List[torch.Tensor]], + List[torch.Tensor], torch.Tensor]] = kwargs.pop( "aspect_ratio_ids", None) - aspect_ratio_mask: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], + aspect_ratio_mask: Optional[Union[List[List[torch.Tensor]], + List[torch.Tensor], torch.Tensor]] = kwargs.pop( "aspect_ratio_mask", None) @@ -1236,7 +1236,7 @@ def _parse_and_validate_image_input(self, **kwargs: object): def flat_encoder_result(self, cross_attention_states: torch.Tensor, attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: list[int]): + actual_encoder_seq_lens: List[int]): cross_attention_states_flat = torch.zeros( sum(actual_encoder_seq_lens), @@ -1257,8 +1257,8 @@ def get_cross_attention_states( self, image_inputs: MllamaImagePixelInputs, attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: list[int], - ) -> tuple[torch.Tensor]: + actual_encoder_seq_lens: List[int], + ) -> Tuple[torch.Tensor]: # NOTE: llama's reference implementation runs vision model on CPU pixel_values = image_inputs['data'] aspect_ratio_ids = image_inputs['aspect_ratio_ids'] @@ -1282,10 +1282,10 @@ def get_cross_attention_mask( self, input_ids: torch.Tensor, attn_metadata: AttentionMetadata, - num_tiles: list[list[int]], + num_tiles: List[List[int]], num_tokens_per_tile: int, dtype: torch.dtype, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: token_ids = input_ids.tolist() start = 0 batch_token_ids = [] @@ -1337,7 +1337,7 @@ def forward( input_ids: torch.Tensor, positions: torch.Tensor, **kwargs: object, - ) -> Union[tuple, CausalLMOutputWithPast]: + ) -> Union[Tuple, CausalLMOutputWithPast]: attn_metadata = get_forward_context().attn_metadata if attn_metadata.num_prefill_tokens > 0 and \ attn_metadata.num_decode_tokens > 0: @@ -1397,8 +1397,8 @@ def forward( return outputs - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1408,7 +1408,7 @@ def load_weights(self, weights: 
Iterable[tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - updated_params: set[str] = set() + updated_params: Set[str] = set() for name, loaded_weight in weights: if 'patch_embedding.weight' in name: name = name.replace('patch_embedding.weight', @@ -1450,7 +1450,7 @@ def load_weights(self, weights: Iterable[tuple[str, return updated_params -def skip_attention_mask(sparse_mask: list[list[int]]) -> bool: +def skip_attention_mask(sparse_mask: List[List[int]]) -> bool: for mask in sparse_mask: # Skip text-only samples. if len(mask) == 0: @@ -1468,10 +1468,10 @@ def skip_attention_mask(sparse_mask: list[list[int]]) -> bool: def convert_sparse_cross_attention_mask_to_dense( - sparse_mask: list[list[list[int]]], - num_tiles: list[list[int]], - lengths: list[int], -) -> tuple[np.ndarray, list[tuple[int, int]]]: + sparse_mask: List[List[List[int]]], + num_tiles: List[List[int]], + lengths: List[int], +) -> Tuple[np.ndarray, List[Tuple[int, int]]]: total_length = sum(lengths) total_tiles = sum([sum(tiles) for tiles in num_tiles]) dense_mask = np.zeros(shape=(total_length, total_tiles), dtype=np.int64) diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index a7d7aa7d44ef2..2920427f94f7b 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math -from collections.abc import Iterable +from typing import Iterable, List, Set, Tuple import torch import torch.nn as nn @@ -148,7 +148,7 @@ def generate_proposals( previous_hidden_states: torch.Tensor, num_predict_tokens: int, sampling_metadata: SamplingMetadata, - ) -> list[SamplerOutput]: + ) -> List[SamplerOutput]: if num_predict_tokens > self.max_speculative_tokens: raise ValueError(f"Max speculative tokens for model is " f"{self.max_speculative_tokens}, but " @@ -190,10 +190,10 @@ def generate_proposals( return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: name = name.replace("speculator.", "") param = params_dict.get(name) diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index 25e6f594069ef..23814e6322d2e 100644 --- a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -4,7 +4,7 @@ # https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py from dataclasses import dataclass, field -from typing import Union +from typing import List, Union @dataclass @@ -46,17 +46,17 @@ class ModelKeys: @dataclass class MultiModelKeys(ModelKeys): - language_model: list[str] = field(default_factory=list) - connector: list[str] = field(default_factory=list) + language_model: List[str] = field(default_factory=list) + connector: List[str] = field(default_factory=list) # vision tower and audio tower - tower_model: list[str] = field(default_factory=list) - generator: list[str] = field(default_factory=list) + tower_model: List[str] = field(default_factory=list) + generator: List[str] = field(default_factory=list) @staticmethod - def from_string_field(language_model: Union[str, list[str]] = None, - connector: Union[str, list[str]] = None, - tower_model: Union[str, 
list[str]] = None, - generator: Union[str, list[str]] = None, + def from_string_field(language_model: Union[str, List[str]] = None, + connector: Union[str, List[str]] = None, + tower_model: Union[str, List[str]] = None, + generator: Union[str, List[str]] = None, **kwargs) -> 'MultiModelKeys': def to_list(value): diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ec0a239abd662..cc4d38d8740b2 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import math -from collections.abc import Iterable, Mapping from dataclasses import dataclass from functools import cached_property, partial -from typing import Optional, TypedDict, Union, cast +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union, cast) import numpy as np import torch @@ -71,13 +71,13 @@ class MolmoImageInputs(TypedDict): - images: Union[torch.Tensor, list[torch.Tensor]] + images: Union[torch.Tensor, List[torch.Tensor]] """Shape: `(batch_size, num_crops, num_patch, patch_dim)`""" - image_masks: Optional[Union[torch.Tensor, list[torch.Tensor]]] + image_masks: Optional[Union[torch.Tensor, List[torch.Tensor]]] """Shape: `(batch_size, num_crops, num_patch)`""" - feat_is_patch: Union[torch.Tensor, list[torch.Tensor]] + feat_is_patch: Union[torch.Tensor, List[torch.Tensor]] """ A boolean mask indicating which image features correspond to patch tokens. @@ -85,7 +85,7 @@ class MolmoImageInputs(TypedDict): Shape: `(batch_size, num_crops, num_patch)` """ - embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] + embed_is_patch: Union[torch.Tensor, List[torch.Tensor]] """ A boolean mask indicating which image embeddings correspond to patch tokens. 
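# Note (illustrative sketch, not part of the patch): the multimodal input
# classes in this patch are TypedDicts whose fields mix a single batched tensor
# with a list of per-item tensors. The class and instance below are
# hypothetical and only show that pattern with typing-module generics.
from typing import List, Literal, TypedDict, Union

import torch

class ExampleImagePixelInputs(TypedDict):
    type: Literal["pixel_values"]
    data: Union[torch.Tensor, List[torch.Tensor]]  # batched or per-image

example_inputs: ExampleImagePixelInputs = {
    "type": "pixel_values",
    "data": [torch.zeros(3, 4, 4)],
}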
@@ -99,7 +99,7 @@ class MolmoImageInputs(TypedDict): @dataclass class VisionBackboneConfig: - image_default_input_size: tuple[int, int] = (336, 336) + image_default_input_size: Tuple[int, int] = (336, 336) image_patch_size: int = 14 image_pos_patch_size: int = 14 image_emb_dim: int = 1024 @@ -276,7 +276,7 @@ def __init__( for _ in range(config.image_num_layers) ]) - def forward(self, x: torch.Tensor) -> list[torch.Tensor]: + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: hidden_states = [] for r in self.resblocks: x = r(x) @@ -343,7 +343,7 @@ def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor: def forward(self, x: torch.Tensor, - patch_num: Optional[int] = None) -> list[torch.Tensor]: + patch_num: Optional[int] = None) -> List[torch.Tensor]: """ : param x: (batch_size, num_patch, n_pixels) """ @@ -443,7 +443,7 @@ def __init__( ) def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: if self.tp_size > 1: q = tensor_model_parallel_all_gather(q.contiguous()) k = tensor_model_parallel_all_gather(k.contiguous()) @@ -579,7 +579,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: + ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: # Self Attention if residual is None: residual = hidden_states @@ -605,7 +605,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: + ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: # Self Attention residual = hidden_states hidden_states = self.self_attn( @@ -697,7 +697,7 @@ def encode_image(self, images: torch.Tensor) -> torch.Tensor: def forward( self, images: torch.Tensor, image_masks: torch.Tensor - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: # image_features: (batch_size, num_crops(=num_image), num_patch, nximage_emb_dim) # noqa: E501 batch_size, num_image = images.shape[:2] @@ -748,15 +748,15 @@ def forward( # image_features: (batch_size, num_image, num_patch, d_model) return image_features - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("merged_linear", "gate_proj", 0), ("merged_linear", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -863,10 +863,10 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if name.endswith(".bias") and name not in params_dict: @@ -1508,7 +1508,7 @@ def _parse_and_validate_image_input( def _process_image_input( self, image_input: MolmoImageInputs, - ) -> Union[torch.Tensor, list[torch.Tensor]]: + ) -> Union[torch.Tensor, List[torch.Tensor]]: if 
isinstance(image_input["images"], list): # Call the vision backbone on the whole batch at once images_flat = flatten_bn(image_input["images"], concat=True) @@ -1665,7 +1665,7 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) weights = _get_weights_with_merged_embedding(weights) @@ -1683,8 +1683,8 @@ def get_mm_mapping(self) -> MultiModelKeys: def _get_weights_with_merged_embedding( - weights: Iterable[tuple[str, torch.Tensor]] -) -> Iterable[tuple[str, torch.Tensor]]: + weights: Iterable[Tuple[str, torch.Tensor]] +) -> Iterable[Tuple[str, torch.Tensor]]: embedding_weights = {} for name, weight in weights: if "wte.embedding" in name: diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index aa63897d22c6b..d716818f31c03 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -2,8 +2,7 @@ # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -317,10 +316,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index b882392720813..3b86b91465ca8 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -22,8 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Nemotron model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -71,7 +70,7 @@ def _cast_if_autocast_enabled(*args): class NemotronLayerNorm1P(nn.LayerNorm): def __init__(self, - normalized_shape: Union[int, list[int], torch.Size], + normalized_shape: Union[int, List[int], torch.Size], eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True, @@ -135,7 +134,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -269,7 +268,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -454,8 +453,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -463,7 +462,7 @@ def load_weights(self, weights: Iterable[tuple[str, (".qkv_proj", ".v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index dafb0e45ea921..5de8eeb3fffed 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -6,8 +6,7 @@ # Copyright (c) 2024 NVIDIA # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -from collections.abc import Mapping -from typing import Optional +from typing import Mapping, Optional import torch import torch.nn as nn diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index e77f344097bd8..4a341c97d6cdf 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -22,8 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OLMo model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -211,7 +210,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: + ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: # Attention block. 
residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -349,8 +348,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -360,7 +359,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index fe8bb677aea60..54cc851de9347 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -23,9 +23,8 @@ # limitations under the License. """Inference-only OLMo2 model compatible with HuggingFace weights.""" -from collections.abc import Iterable from functools import partial -from typing import Optional, Union +from typing import Iterable, Optional, Tuple, Union import torch from torch import nn @@ -137,7 +136,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: if self.tp_size > 1: q = tensor_model_parallel_all_gather(q.contiguous()) k = tensor_model_parallel_all_gather(k.contiguous()) @@ -372,7 +371,7 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index b53bed453d0eb..e27ff5deace29 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
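# Note (illustrative sketch, not part of the patch): several decoder-layer
# forward methods above return a (hidden_states, residual) pair typed with
# typing.Tuple and typing.Optional. The function below is hypothetical and
# only mirrors that return shape.
from typing import Optional, Tuple

import torch

def decoder_step(hidden_states: torch.Tensor,
                 residual: Optional[torch.Tensor]
                 ) -> Tuple[torch.Tensor, torch.Tensor]:
    # For the first layer the residual branch starts from the incoming states.
    if residual is None:
        residual = hidden_states
    return hidden_states + residual, residual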
"""Inference-only OLMoE model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -103,7 +102,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 4096, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -358,8 +357,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -378,7 +377,7 @@ def load_weights(self, weights: Iterable[tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 921039d2b5222..e4775478a54d1 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -18,8 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OPT model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -372,8 +371,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -381,7 +380,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name and self.config.tie_word_embeddings: continue diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index a8155c63c0ef3..6668ede91eecb 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -5,8 +5,7 @@ # Copyright (c) OrionStar Inc. 
# LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE """Inference-only Orion-14B model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -74,7 +73,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -188,7 +187,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -314,8 +313,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -325,7 +324,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index caa895a5adf68..02d1861b8027c 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Iterable, Mapping -from typing import Literal, Optional, TypedDict, Union +from typing import (Iterable, Literal, Mapping, Optional, Set, Tuple, + TypedDict, Union) import torch from torch import nn @@ -323,7 +323,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 6906720e623a3..db8d170a8c91b 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -21,8 +21,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only persimmon model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -314,10 +313,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index cf53dc39c58f2..6ee80210c2b4d 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -36,8 +36,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """Inference-only Phi-1.5 model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -316,8 +315,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -325,7 +324,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("qkv_proj", "v_proj", "v") ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index e092fb06eb120..33984f54ae271 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -232,8 +231,8 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], - Optional[tuple[torch.Tensor]]]: + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], + Optional[Tuple[torch.Tensor]]]: qkv, _ = self.query_key_value(hidden_states) qkv = qkv.view(qkv.shape[:-1] + @@ -447,11 +446,11 @@ def sample( sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 389ae88bd0483..0f45f131065a8 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -16,7 +16,7 @@ # limitations under the License. 
from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import Any, Literal, Optional, TypedDict, Union +from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union import torch import torch.nn as nn @@ -96,7 +96,7 @@ def _init_img_processor(hf_config: PretrainedConfig, class Phi3VImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, list[torch.Tensor]] + data: Union[torch.Tensor, List[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -115,7 +115,7 @@ class Phi3VImagePixelInputs(TypedDict): class Phi3VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, list[torch.Tensor]] + data: Union[torch.Tensor, List[torch.Tensor]] """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. @@ -569,8 +569,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_pixel_values( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: h = w = CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size expected_dims = (3, h, w) @@ -708,8 +708,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) autoloaded_weights = loader.load_weights(weights, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 1104dd705c693..c35c7e9fcce74 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -22,8 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only PhiMoE model.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -582,8 +581,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -598,7 +597,7 @@ def load_weights(self, weights: Iterable[tuple[str, num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index c0ad3e122db3e..87b1d50749a2c 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import math -from collections.abc import Iterable, Mapping from dataclasses import dataclass, fields from functools import cached_property -from typing import Optional, Union +from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -296,10 +295,10 @@ def forward( def _parse_and_validate_image_input( self, - images: Optional[Union[list[list[torch.Tensor]], list[torch.Tensor], + images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor], torch.Tensor]] = None, image_tokens: Optional[torch.Tensor] = None, - ) -> tuple[Optional[list[torch.Tensor]], Optional[torch.Tensor]]: + ) -> Tuple[Optional[List[torch.Tensor]], Optional[torch.Tensor]]: if images is None: return None, None @@ -332,7 +331,7 @@ def _parse_and_validate_image_input( return images, image_tokens def _process_image_input(self, - image_input: list[torch.Tensor]) -> torch.Tensor: + image_input: List[torch.Tensor]) -> torch.Tensor: return self.vision_language_adapter(self.vision_encoder(image_input)) def compute_logits( @@ -350,12 +349,12 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - def is_vision_encoder_weights(weight: tuple[str, torch.Tensor]): + def is_vision_encoder_weights(weight: Tuple[str, torch.Tensor]): return weight[0].startswith("vision_encoder") - def is_vision_lang_adapter_weights(weight: tuple[str, torch.Tensor]): + def is_vision_lang_adapter_weights(weight: Tuple[str, torch.Tensor]): return weight[0].startswith("vision_language_adapter") # Get references to parameters for direct loading @@ -454,7 +453,7 @@ def apply_rotary_emb_vit( xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor, -) -> tuple[torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor]: xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) assert freqs_cis.dtype == torch.complex64 @@ -559,7 +558,7 @@ def forward( return x -def position_meshgrid(patch_embeds_list: list[torch.Tensor], ) -> torch.Tensor: +def position_meshgrid(patch_embeds_list: List[torch.Tensor], ) -> torch.Tensor: positions = torch.cat([ torch.stack( torch.meshgrid( @@ -621,7 +620,7 @@ def 
freqs_cis(self) -> torch.Tensor: def forward( self, - images: list[torch.Tensor], + images: List[torch.Tensor], ) -> torch.Tensor: """ Args: @@ -857,7 +856,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, position_embeddings: torch.Tensor, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: batch, patches, _ = hidden_states.size() qkv_states, _ = self.qkv_proj(hidden_states) @@ -1018,7 +1017,7 @@ def __init__( def forward( self, - pixel_values: list[torch.Tensor], + pixel_values: List[torch.Tensor], feature_sample_layers: Optional[list[int]] = None, ) -> torch.Tensor: """ @@ -1078,8 +1077,8 @@ def forward( # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1089,7 +1088,7 @@ def load_weights(self, weights: Iterable[tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() layer_count = len(self.transformer.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index a60fe05333cb6..3d95e949e71da 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -15,8 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only IBM/NASA Prithvi Geospatial model.""" -from collections.abc import Iterable, Mapping -from typing import Optional, Union +from typing import Iterable, Mapping, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -159,7 +158,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): "by PrithviGeospatialMAE.") def _parse_and_validate_multimodal_data( - self, **kwargs) -> tuple[torch.Tensor, torch.Tensor | None]: + self, **kwargs) -> Tuple[torch.Tensor, torch.Tensor | None]: pixel_values = kwargs.pop("pixel_values", None) if not isinstance(pixel_values, torch.Tensor): @@ -200,8 +199,8 @@ def pooler( ) -> Optional[PoolerOutput]: return PoolerOutput([PoolingSequenceGroupOutput(hidden_states)]) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_list = [] model_buffers = dict(self.named_buffers()) loaded_buffers = [] diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index b5f5079b6d416..96abfb9d1096c 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -6,8 +6,7 @@ # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE """Inference-only QWen model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -78,7 +77,7 @@ def __init__( num_heads: int, max_position_embeddings: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -168,7 +167,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -295,15 +294,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w2", 0), ("gate_up_proj", "w1", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 491053ae296a5..fe615c41aeaa1 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -23,8 +23,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2 model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -109,7 +108,7 @@ def __init__(self, rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[tuple] = None, + rope_scaling: Optional[Tuple] = None, prefix: str = "", attn_type: str = AttentionType.DECODER) -> None: super().__init__() @@ -233,7 +232,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -350,8 +349,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -361,7 +360,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -482,8 +481,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -551,7 +550,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 4b1ff026c1d37..858cf28d2b873 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -24,9 +24,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" -from collections.abc import Iterable, Mapping from functools import cached_property, partial -from typing import Callable, Literal, Optional, TypedDict, Union +from typing import (Callable, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) import torch import torch.nn as nn @@ -90,7 +90,7 @@ class Qwen2_5_VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] image_embeds: torch.Tensor """Supported types: - - list[`torch.Tensor`]: A list of tensors holding all images' features. + - List[`torch.Tensor`]: A list of tensors holding all images' features. Each tensor holds an image's features. - `torch.Tensor`: A tensor holding all images' features (concatenation of all images' feature tensors). 
@@ -136,7 +136,7 @@ class Qwen2_5_VLVideoEmbeddingInputs(TypedDict): type: Literal["video_embeds"] video_embeds: torch.Tensor """Supported types: - - list[`torch.Tensor`]: A list of tensors holding all videos' features. + - List[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features (concatenation of all videos' feature tensors). @@ -647,8 +647,8 @@ def forward( hidden_states = hidden_states[reverse_indices, :] return hidden_states - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -656,7 +656,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -688,7 +688,7 @@ def get_hf_processor( min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, list[float]]] = None, + fps: Optional[Union[float, List[float]]] = None, **kwargs: object, ) -> Qwen2_5_VLProcessor: if fps is not None: @@ -1064,8 +1064,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index a8aafe3a39883..f0dc8573ee14e 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -21,9 +21,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" -from collections.abc import Iterable, Mapping from functools import cached_property -from typing import Any, Optional, TypedDict, Union +from typing import (Any, Iterable, Mapping, Optional, Set, Tuple, TypedDict, + Union) import torch import torch.nn as nn @@ -416,7 +416,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 8011ebb1df87a..41536b34b2f2d 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -23,8 +23,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2MoE model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union import torch import torch.nn.functional as F @@ -169,7 +168,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -430,8 +429,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -450,7 +449,7 @@ def load_weights(self, weights: Iterable[tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 4ab6c75b639a0..21cc9e8ed1c6b 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -5,8 +5,7 @@ # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. """Inference-only Qwen2-RM model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -95,8 +94,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index a0dfbe68292b9..849ef7293bb7f 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -23,9 +23,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" -from collections.abc import Iterable, Mapping from functools import cached_property, partial -from typing import Any, Callable, Literal, Optional, TypedDict, Union +from typing import (Any, Callable, Iterable, Literal, Mapping, Optional, Set, + Tuple, Type, TypedDict, Union) import torch import torch.nn as nn @@ -100,7 +100,7 @@ class Qwen2VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] image_embeds: torch.Tensor """Supported types: - - list[`torch.Tensor`]: A list of tensors holding all images' features. + - List[`torch.Tensor`]: A list of tensors holding all images' features. Each tensor holds an image's features. - `torch.Tensor`: A tensor holding all images' features (concatenation of all images' feature tensors). 
@@ -140,7 +140,7 @@ class Qwen2VLVideoEmbeddingInputs(TypedDict): type: Literal["video_embeds"] video_embeds: torch.Tensor """Supported types: - - list[`torch.Tensor`]: A list of tensors holding all videos' features. + - List[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features (concatenation of all videos' feature tensors). @@ -169,7 +169,7 @@ def __init__( self, in_features: int, hidden_features: int, - act_layer: type[nn.Module] = QuickGELU, + act_layer: Type[nn.Module] = QuickGELU, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): @@ -383,7 +383,7 @@ def __init__( dim: int, num_heads: int, mlp_ratio: float, - act_layer: type[nn.Module] = QuickGELU, + act_layer: Type[nn.Module] = QuickGELU, norm_layer: Optional[Callable[[int], nn.Module]] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -629,8 +629,8 @@ def forward( return x - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -638,7 +638,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -1371,8 +1371,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index f662db2225c84..e0d8bf2fa3d25 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -9,9 +9,9 @@ import math import re import unicodedata -from collections.abc import Collection, Mapping, Set from functools import lru_cache, partial -from typing import Callable, Literal, Optional, TypedDict, Union +from typing import (AbstractSet, Callable, Collection, List, Literal, Mapping, + Optional, TypedDict, Union) import torch from torch import nn @@ -393,7 +393,7 @@ class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore def tokenize( self, text: str, - allowed_special: Union[Set[str], str] = "all", + allowed_special: Union[AbstractSet[str], str] = "all", disallowed_special: Union[Collection[str], str] = (), **kwargs, ) -> list[Union[bytes, str]]: @@ -409,7 +409,7 @@ def tokenize( def _decode( self, - token_ids: Union[int, list[int]], + token_ids: Union[int, List[int]], skip_special_tokens: bool = False, errors: Optional[str] = None, **kwargs, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index bd0e1ced63632..75e31d557dd10 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -10,10 +10,10 @@ import sys import tempfile from abc import ABC, abstractmethod -from collections.abc import Set from dataclasses import dataclass, field from functools import lru_cache -from typing import Callable, Optional, 
TypeVar, Union +from typing import (AbstractSet, Callable, Dict, List, Optional, Tuple, Type, + TypeVar, Union) import cloudpickle import torch.nn as nn @@ -230,7 +230,7 @@ class _ModelInfo: supports_transcription: bool @staticmethod - def from_model_cls(model: type[nn.Module]) -> "_ModelInfo": + def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": return _ModelInfo( architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), @@ -251,7 +251,7 @@ def inspect_model_cls(self) -> _ModelInfo: raise NotImplementedError @abstractmethod - def load_model_cls(self) -> type[nn.Module]: + def load_model_cls(self) -> Type[nn.Module]: raise NotImplementedError @@ -262,10 +262,10 @@ class _RegisteredModel(_BaseRegisteredModel): """ interfaces: _ModelInfo - model_cls: type[nn.Module] + model_cls: Type[nn.Module] @staticmethod - def from_model_cls(model_cls: type[nn.Module]): + def from_model_cls(model_cls: Type[nn.Module]): return _RegisteredModel( interfaces=_ModelInfo.from_model_cls(model_cls), model_cls=model_cls, @@ -274,7 +274,7 @@ def from_model_cls(model_cls: type[nn.Module]): def inspect_model_cls(self) -> _ModelInfo: return self.interfaces - def load_model_cls(self) -> type[nn.Module]: + def load_model_cls(self) -> Type[nn.Module]: return self.model_cls @@ -291,7 +291,7 @@ def inspect_model_cls(self) -> _ModelInfo: return _run_in_subprocess( lambda: _ModelInfo.from_model_cls(self.load_model_cls())) - def load_model_cls(self) -> type[nn.Module]: + def load_model_cls(self) -> Type[nn.Module]: mod = importlib.import_module(self.module_name) return getattr(mod, self.class_name) @@ -300,7 +300,7 @@ def load_model_cls(self) -> type[nn.Module]: def _try_load_model_cls( model_arch: str, model: _BaseRegisteredModel, -) -> Optional[type[nn.Module]]: +) -> Optional[Type[nn.Module]]: from vllm.platforms import current_platform current_platform.verify_model_arch(model_arch) try: @@ -327,15 +327,15 @@ def _try_inspect_model_cls( @dataclass class _ModelRegistry: # Keyed by model_arch - models: dict[str, _BaseRegisteredModel] = field(default_factory=dict) + models: Dict[str, _BaseRegisteredModel] = field(default_factory=dict) - def get_supported_archs(self) -> Set[str]: + def get_supported_archs(self) -> AbstractSet[str]: return self.models.keys() def register_model( self, model_arch: str, - model_cls: Union[type[nn.Module], str], + model_cls: Union[Type[nn.Module], str], ) -> None: """ Register an external model to be used in vLLM. 
@@ -374,7 +374,7 @@ def register_model( self.models[model_arch] = model - def _raise_for_unsupported(self, architectures: list[str]): + def _raise_for_unsupported(self, architectures: List[str]): all_supported_archs = self.get_supported_archs() if any(arch in all_supported_archs for arch in architectures): @@ -387,7 +387,7 @@ def _raise_for_unsupported(self, architectures: list[str]): f"Supported architectures: {all_supported_archs}") def _try_load_model_cls(self, - model_arch: str) -> Optional[type[nn.Module]]: + model_arch: str) -> Optional[Type[nn.Module]]: if model_arch not in self.models: return None @@ -401,8 +401,8 @@ def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]: def _normalize_archs( self, - architectures: Union[str, list[str]], - ) -> list[str]: + architectures: Union[str, List[str]], + ) -> List[str]: if isinstance(architectures, str): architectures = [architectures] if not architectures: @@ -417,8 +417,8 @@ def _normalize_archs( def inspect_model_cls( self, - architectures: Union[str, list[str]], - ) -> tuple[_ModelInfo, str]: + architectures: Union[str, List[str]], + ) -> Tuple[_ModelInfo, str]: architectures = self._normalize_archs(architectures) for arch in architectures: @@ -430,8 +430,8 @@ def inspect_model_cls( def resolve_model_cls( self, - architectures: Union[str, list[str]], - ) -> tuple[type[nn.Module], str]: + architectures: Union[str, List[str]], + ) -> Tuple[Type[nn.Module], str]: architectures = self._normalize_archs(architectures) for arch in architectures: @@ -443,63 +443,63 @@ def resolve_model_cls( def is_text_generation_model( self, - architectures: Union[str, list[str]], + architectures: Union[str, List[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_text_generation_model def is_pooling_model( self, - architectures: Union[str, list[str]], + architectures: Union[str, List[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_pooling_model def is_cross_encoder_model( self, - architectures: Union[str, list[str]], + architectures: Union[str, List[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_cross_encoding def is_multimodal_model( self, - architectures: Union[str, list[str]], + architectures: Union[str, List[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_multimodal def is_pp_supported_model( self, - architectures: Union[str, list[str]], + architectures: Union[str, List[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_pp def model_has_inner_state( self, - architectures: Union[str, list[str]], + architectures: Union[str, List[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.has_inner_state def is_attention_free_model( self, - architectures: Union[str, list[str]], + architectures: Union[str, List[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_attention_free def is_hybrid_model( self, - architectures: Union[str, list[str]], + architectures: Union[str, List[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_hybrid def is_transcription_model( self, - architectures: Union[str, list[str]], + architectures: Union[str, List[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_transcription diff --git a/vllm/model_executor/models/roberta.py 
b/vllm/model_executor/models/roberta.py index 97e1bb3eb913f..f86fa268072db 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import itertools -from collections.abc import Iterable -from typing import Optional +from typing import Iterable, Optional, Tuple import torch from torch import nn @@ -24,8 +23,8 @@ def roberta_task_weights_filter( - all_weights: Iterable[tuple[str, torch.Tensor]] -) -> tuple[Iterable[tuple[str, torch.Tensor]], Iterable[tuple[str, + all_weights: Iterable[Tuple[str, torch.Tensor]] +) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[str, torch.Tensor]]]: """ Separate task-specific weights that are applied on top @@ -179,7 +178,7 @@ def _build_model(self, prefix=prefix, embedding_class=RobertaEmbedding) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) # Separate weights in "roberta"-prefixed and all else (not in memory). # For use with models like FacebookAI/roberta-base. @@ -218,7 +217,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.classifier = RobertaClassificationHead(config) self._pooler = CrossEncodingPooler(config, self.classifier) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): bert_weights, task_weights = roberta_task_weights_filter(weights) self.roberta.load_weights(bert_weights) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index eecae4175561f..2892f696107be 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -3,8 +3,7 @@ within a vision language model.""" import math -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from PIL import Image @@ -334,7 +333,7 @@ def __init__( def forward( self, hidden_states: torch.Tensor, - ) -> tuple[torch.Tensor, None]: + ) -> Tuple[torch.Tensor, None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -549,8 +548,8 @@ def forward( feature_sample_layers=feature_sample_layers, ) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -558,7 +557,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 94f44ff21291e..0f9e517aeb557 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -23,8 +23,7 @@ # limitations under the License. 
"""Inference-only Solar model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -103,7 +102,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, + rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -238,7 +237,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -450,8 +449,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -461,7 +460,7 @@ def load_weights(self, weights: Iterable[tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 3afcccc18af07..a15faec547b95 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -20,8 +20,7 @@ # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json """Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -182,7 +181,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -307,8 +306,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -318,7 +317,7 @@ def load_weights(self, weights: Iterable[tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 2665367286841..90098af9dde0e 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -19,8 +19,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" PyTorch Starcoder2 model.""" -from collections.abc import Iterable -from typing import Optional, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn @@ -320,8 +319,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -330,7 +329,7 @@ def load_weights(self, weights: Iterable[tuple[str, ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index 2175488116dc6..a38035e37ec73 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -19,7 +19,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from collections.abc import Iterable +from typing import Iterable, Set, Tuple import torch @@ -48,14 +48,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): layer.mlp.gate_up_proj.bias = None layer.mlp.gate_up_proj.skip_bias_add = True - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ ('gate_up_proj', 'gate_proj', 0), ('gate_up_proj', 'up_proj', 1), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() total_num_heads = self.config.n_head head_dim = self.config.hidden_size // total_num_heads for name, loaded_weight in weights: @@ -123,8 +123,8 @@ class TeleChat2ForCausalLM(LlamaForCausalLM): def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): return TeleChat2Model(vllm_config=vllm_config, prefix=prefix) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 27d4ad15a829b..1c3c443b29413 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -15,8 +15,7 @@ # limitations under the License. 
"""Wrapper around `transformers` models""" import re -from collections.abc import Iterable -from typing import Literal, Optional, Union +from typing import Iterable, Literal, Optional, Union import torch from torch import nn diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index b1905348e78ef..b8d4aef252e5f 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,9 +3,9 @@ # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" import math -from collections.abc import Iterable, Mapping from functools import cached_property -from typing import Any, Literal, Optional, TypedDict, Union +from typing import (Any, Iterable, Literal, Mapping, Optional, Set, Tuple, + TypedDict, Union) import torch import torch.utils.checkpoint @@ -551,8 +551,8 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["audio_tower."]) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 97433f1dde050..fff4be34ddbeb 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import itertools -from collections.abc import Iterable, Mapping from dataclasses import dataclass, field -from typing import Callable, Literal, Optional, Protocol, Union, overload +from typing import (Callable, Dict, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, Union, overload) import torch import torch.nn as nn @@ -56,8 +56,8 @@ def _map_name(self, key: str) -> Optional[str]: return key def apply( - self, weights: Iterable[tuple[str, torch.Tensor]] - ) -> Iterable[tuple[str, torch.Tensor]]: + self, weights: Iterable[Tuple[str, torch.Tensor]] + ) -> Iterable[Tuple[str, torch.Tensor]]: return ((out_name, data) for name, data in weights if (out_name := self._map_name(name)) is not None) @@ -82,8 +82,8 @@ def __init__( self, module: nn.Module, *, - skip_prefixes: Optional[list[str]] = None, - ignore_unexpected_prefixes: Optional[list[str]] = None, + skip_prefixes: Optional[List[str]] = None, + ignore_unexpected_prefixes: Optional[List[str]] = None, ) -> None: super().__init__() @@ -93,8 +93,8 @@ def __init__( def _groupby_prefix( self, - weights: Iterable[tuple[str, torch.Tensor]], - ) -> Iterable[tuple[str, Iterable[tuple[str, torch.Tensor]]]]: + weights: Iterable[Tuple[str, torch.Tensor]], + ) -> Iterable[Tuple[str, Iterable[Tuple[str, torch.Tensor]]]]: weights_by_parts = ((weight_name.split(".", 1), weight_data) for weight_name, weight_data in weights) @@ -127,7 +127,7 @@ def _load_param( self, base_prefix: str, param: nn.Parameter, - weights: Iterable[tuple[str, torch.Tensor]], + weights: Iterable[Tuple[str, torch.Tensor]], ) -> Iterable[str]: for weight_name, weight_data in weights: weight_qualname = self._get_qualname(base_prefix, weight_name) @@ -160,7 +160,7 @@ def _load_module( self, base_prefix: str, module: nn.Module, - weights: Iterable[tuple[str, torch.Tensor]], + weights: Iterable[Tuple[str, torch.Tensor]], ) -> Iterable[str]: if isinstance(module, PPMissingLayer): return @@ -225,10 +225,10 @@ def _load_module( def 
load_weights( self, - weights: Iterable[tuple[str, torch.Tensor]], + weights: Iterable[Tuple[str, torch.Tensor]], *, mapper: Optional[WeightsMapper] = None, - ) -> set[str]: + ) -> Set[str]: if mapper is not None: weights = mapper.apply(weights) @@ -266,13 +266,13 @@ def flatten_bn(x: torch.Tensor) -> torch.Tensor: @overload -def flatten_bn(x: list[torch.Tensor]) -> list[torch.Tensor]: +def flatten_bn(x: List[torch.Tensor]) -> List[torch.Tensor]: ... @overload def flatten_bn( - x: Union[list[torch.Tensor], torch.Tensor], + x: Union[List[torch.Tensor], torch.Tensor], *, concat: Literal[True], ) -> torch.Tensor: @@ -281,18 +281,18 @@ def flatten_bn( @overload def flatten_bn( - x: Union[list[torch.Tensor], torch.Tensor], + x: Union[List[torch.Tensor], torch.Tensor], *, concat: bool = False, -) -> Union[list[torch.Tensor], torch.Tensor]: +) -> Union[List[torch.Tensor], torch.Tensor]: ... def flatten_bn( - x: Union[list[torch.Tensor], torch.Tensor], + x: Union[List[torch.Tensor], torch.Tensor], *, concat: bool = False, -) -> Union[list[torch.Tensor], torch.Tensor]: +) -> Union[List[torch.Tensor], torch.Tensor]: """ Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs. @@ -416,7 +416,7 @@ def merge_multimodal_embeddings( input_ids: torch.Tensor, inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors, - placeholder_token_id: Union[int, list[int]], + placeholder_token_id: Union[int, List[int]], ) -> torch.Tensor: """ Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the @@ -544,7 +544,7 @@ def make_layers( num_hidden_layers: int, layer_fn: LayerFn, prefix: str, -) -> tuple[int, int, torch.nn.ModuleList]: +) -> Tuple[int, int, torch.nn.ModuleList]: """Make a list of layers with the given layer function, taking pipeline parallelism into account. 
""" @@ -562,10 +562,10 @@ def make_layers( # NOTE: don't use lru_cache here because it can prevent garbage collection -_model_to_pp_missing_layer_names: dict[int, list[str]] = {} +_model_to_pp_missing_layer_names: Dict[int, List[str]] = {} -def get_pp_missing_layer_names(model: torch.nn.Module) -> list[str]: +def get_pp_missing_layer_names(model: torch.nn.Module) -> List[str]: """Get the names of the missing layers in a pipeline parallel model.""" model_id = id(model) if model_id in _model_to_pp_missing_layer_names: @@ -593,7 +593,7 @@ def is_pp_missing_parameter(name: str, model: torch.nn.Module) -> bool: for missing_layer_name in get_pp_missing_layer_names(model)) -def make_empty_intermediate_tensors_factory(keys: list[str], hidden_size: int): +def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int): def make_empty_intermediate_tensors( batch_size: int, @@ -632,7 +632,7 @@ def extract_layer_index(layer_name: str) -> int: - "model.encoder.layers.0.sub.1" -> ValueError """ subnames = layer_name.split(".") - int_vals: list[int] = [] + int_vals: List[int] = [] for subname in subnames: try: int_vals.append(int(subname)) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index e855db13c63ae..e5f77e08c4035 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import math -from collections.abc import Iterable, Mapping -from typing import Optional, TypedDict, Union +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import torch from torch import nn @@ -389,7 +389,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_positions.weight.copy_( sinusoids(*self.embed_positions.weight.shape)) - def forward(self, input_features: Union[torch.Tensor, list[torch.Tensor]]): + def forward(self, input_features: Union[torch.Tensor, List[torch.Tensor]]): hidden_states = [] for features in input_features: embeds = nn.functional.gelu(self.conv1(features)) @@ -467,7 +467,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def forward( self, - input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]], + input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], input_ids: Optional[torch.Tensor], positions: torch.Tensor, ) -> torch.Tensor: @@ -481,14 +481,14 @@ def forward( def get_encoder_outputs( self, - input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]], + input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], ) -> Optional[torch.Tensor]: if input_features is None: return None return self.encoder(input_features) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), @@ -498,7 +498,7 @@ def load_weights(self, weights: Iterable[tuple[str, (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -737,8 +737,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> 
set[str]: + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) # add fake zeros bias for k_proj to state_dict @@ -747,8 +747,8 @@ def load_weights(self, weights: Iterable[tuple[str, def _create_fake_bias_for_k_proj( - weights: Iterable[tuple[str, torch.Tensor]] -) -> Iterable[tuple[str, torch.Tensor]]: + weights: Iterable[Tuple[str, torch.Tensor]] +) -> Iterable[Tuple[str, torch.Tensor]]: """ Create full zeros bias for k_proj weight in self-attention layers. So that the bias for k_proj in qkv_proj can be initialized with zeros. diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py index 4c5db7396c03c..dea8b0e9d471d 100644 --- a/vllm/model_executor/pooling_metadata.py +++ b/vllm/model_executor/pooling_metadata.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Any +from typing import Any, Dict, List, Tuple import torch @@ -23,9 +23,9 @@ class PoolingMetadata: def __init__( self, - seq_groups: list[tuple[list[int], PoolingParams]], - seq_data: dict[int, Any], # Specific data related to sequences - prompt_lens: list[int], + seq_groups: List[Tuple[List[int], PoolingParams]], + seq_data: Dict[int, Any], # Specific data related to sequences + prompt_lens: List[int], ) -> None: self.seq_groups = seq_groups self.seq_data = seq_data diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 3a18e4d43c550..0a580a4e907de 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -2,7 +2,7 @@ from array import array from dataclasses import dataclass -from typing import Optional +from typing import Dict, List, Optional, Tuple import torch @@ -25,10 +25,10 @@ class SequenceGroupToSample: # |-- query_len ---| # Sequence ids for the sequence group in a previous step. - seq_ids: list[int] + seq_ids: List[int] sampling_params: SamplingParams # seq_id -> sequence data. - seq_data: dict[int, SequenceData] + seq_data: Dict[int, SequenceData] # The length of the sequence (all tokens seen in the past + new token to # compute attention) of the sequence group. None if it is in a decode # stage. @@ -44,9 +44,9 @@ class SequenceGroupToSample: is_prompt: bool # Query token indices from logits. to compute prompt logprob. Empty if # prompt logprob is not required. - prompt_logprob_indices: list[int] + prompt_logprob_indices: List[int] # Sample token indices from logits. Empty if sampling is not required. 
- sample_indices: list[int] + sample_indices: List[int] @property def do_sample(self): @@ -78,7 +78,7 @@ class SamplingMetadataCache: """Used to cache SamplingMetadata objects between scheduler iterations""" def __init__(self): - self._seq_group_to_sample_cache: dict[int, PyObjectCache] = {} + self._seq_group_to_sample_cache: Dict[int, PyObjectCache] = {} def get_cached_seq_group_to_sample(self, num_seqs): if num_seqs not in self._seq_group_to_sample_cache: @@ -130,9 +130,9 @@ def sample(logits): def __init__( self, - seq_groups: list[SequenceGroupToSample], + seq_groups: List[SequenceGroupToSample], selected_token_indices: torch.Tensor, - categorized_sample_indices: dict[SamplingType, torch.Tensor], + categorized_sample_indices: Dict[SamplingType, torch.Tensor], num_prompts: int, skip_sampler_cpu_output: bool = False, reuse_sampling_tensors: bool = False, @@ -146,12 +146,12 @@ def __init__( @staticmethod def prepare( - seq_group_metadata_list: list[SequenceGroupMetadata], - seq_lens: list[int], - query_lens: list[int], + seq_group_metadata_list: List[SequenceGroupMetadata], + seq_lens: List[int], + query_lens: List[int], device: str, pin_memory: bool, - generators: Optional[dict[str, torch.Generator]] = None, + generators: Optional[Dict[str, torch.Generator]] = None, cache: Optional[SamplingMetadataCache] = None, ) -> "SamplingMetadata": ( @@ -195,16 +195,16 @@ def __repr__(self) -> str: def _prepare_seq_groups( - seq_group_metadata_list: list[SequenceGroupMetadata], - seq_lens: list[int], - query_lens: list[int], + seq_group_metadata_list: List[SequenceGroupMetadata], + seq_lens: List[int], + query_lens: List[int], device: str, - generators: Optional[dict[str, torch.Generator]] = None, + generators: Optional[Dict[str, torch.Generator]] = None, cache: Optional[SamplingMetadataCache] = None, -) -> tuple[ - list[SequenceGroupToSample], - list[int], - dict[SamplingType, list[int]], +) -> Tuple[ + List[SequenceGroupToSample], + List[int], + Dict[SamplingType, List[int]], int, ]: """Prepare sequence groups and indices for sampling. @@ -227,17 +227,17 @@ def _prepare_seq_groups( num_prompts: Total number of prompts from `seq_group_metadata_list`. """ # Batched sequence groups for the current model forward stsep. - seq_groups: list[SequenceGroupToSample] = [] + seq_groups: List[SequenceGroupToSample] = [] # A list of token indices to sample/compute logprob. It is used to # prune the outcome logits from the model for the performance. - selected_token_indices: list[int] = [] + selected_token_indices: List[int] = [] # Used for selected_token_indices. model_output_idx = 0 # Sampling type -> ( # indices to sample/prompt logprob within pruned output logits, # indices to sample within pruned logits) - categorized_sample_indices: dict[SamplingType, list[int]] = { + categorized_sample_indices: Dict[SamplingType, List[int]] = { t: [] for t in SamplingType } @@ -265,9 +265,9 @@ def _prepare_seq_groups( # If the current seq group is in decode stage, it is None. 
seq_len: Optional[int] = None query_len: Optional[int] = None - prompt_logprob_indices: list[int] = (sample_obj.prompt_logprob_indices + prompt_logprob_indices: List[int] = (sample_obj.prompt_logprob_indices if cache is not None else []) - sample_indices: list[int] = (sample_obj.sample_indices + sample_indices: List[int] = (sample_obj.sample_indices if cache is not None else []) do_sample = seq_group_metadata.do_sample @@ -389,16 +389,16 @@ def from_sampling_metadata( vocab_size: int, device: torch.device, dtype: torch.dtype, - ) -> tuple["SamplingTensors", bool, bool, bool]: - prompt_tokens: list[array] = [] - output_tokens: list[array] = [] - top_ks: list[int] = [] - temperatures: list[float] = [] - top_ps: list[float] = [] - min_ps: list[float] = [] - presence_penalties: list[float] = [] - frequency_penalties: list[float] = [] - repetition_penalties: list[float] = [] + ) -> Tuple["SamplingTensors", bool, bool, bool]: + prompt_tokens: List[array] = [] + output_tokens: List[array] = [] + top_ks: List[int] = [] + temperatures: List[float] = [] + top_ps: List[float] = [] + min_ps: List[float] = [] + presence_penalties: List[float] = [] + frequency_penalties: List[float] = [] + repetition_penalties: List[float] = [] do_penalties = False do_top_p_top_k = False do_min_p = False @@ -496,15 +496,15 @@ def from_sampling_metadata( @classmethod def from_lists( cls, - temperatures: list[float], - top_ps: list[float], - top_ks: list[int], - min_ps: list[float], - presence_penalties: list[float], - frequency_penalties: list[float], - repetition_penalties: list[float], - prompt_tokens: list[array], - output_tokens: list[array], + temperatures: List[float], + top_ps: List[float], + top_ks: List[int], + min_ps: List[float], + presence_penalties: List[float], + frequency_penalties: List[float], + repetition_penalties: List[float], + prompt_tokens: List[array], + output_tokens: List[array], vocab_size: int, device: torch.device, dtype: torch.dtype, diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index f9d89e64bd9db..04f922dfd77aa 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Utils for model executor.""" -from typing import Any, Optional +from typing import Any, Dict, Optional import torch @@ -12,7 +12,7 @@ def set_random_seed(seed: int) -> None: def set_weight_attrs( weight: torch.Tensor, - weight_attrs: Optional[dict[str, Any]], + weight_attrs: Optional[Dict[str, Any]], ): """Set attributes on a weight tensor. 
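
For context on the hunk above, a minimal sketch of what a helper shaped like `set_weight_attrs` boils down to, assuming it simply attaches each entry of the mapping onto the tensor object; this is an illustrative sketch, not the vLLM implementation itself:

    import torch

    def set_weight_attrs_sketch(weight: torch.Tensor, weight_attrs) -> None:
        # No-op when no attributes are requested.
        if weight_attrs is None:
            return
        for key, value in weight_attrs.items():
            # Refuse to silently overwrite an attribute that already exists.
            assert not hasattr(weight, key), f"attribute {key} already set"
            setattr(weight, key, value)

    # Example usage: tag a parameter with a custom loader callback.
    param = torch.nn.Parameter(torch.empty(4, 4))
    set_weight_attrs_sketch(param, {"weight_loader": lambda p, w: p.data.copy_(w)})
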
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index e0b160a65047a..c48d07ba365ba 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -2,10 +2,9 @@ from abc import ABC, abstractmethod from collections import defaultdict -from collections.abc import Sequence from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, Generic, NamedTuple, - Optional, TypeVar, Union) + Optional, Sequence, Tuple, Type, TypeVar, Union) from torch import nn @@ -40,7 +39,7 @@ """ _T = TypeVar("_T") -N = TypeVar("N", bound=type[nn.Module]) +N = TypeVar("N", bound=Type[nn.Module]) class MultiModalPlugin(ABC): @@ -275,7 +274,7 @@ def __init__(self): @classmethod def from_seq_group( cls, seq_group: "SequenceGroupMetadata", positions: range - ) -> tuple[Optional[MultiModalDataDict], dict[str, + ) -> Tuple[Optional[MultiModalDataDict], dict[str, "MultiModalPlaceholderMap"]]: """ Returns the multi-modal items that intersect with the portion of a diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 11665ef667538..7d277fd67deca 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import pickle -from collections.abc import Iterable, Mapping -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Iterable, Mapping, Optional import numpy as np import torch diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index f76982ef8d729..98ece8f806f1d 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -3,7 +3,7 @@ import base64 from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch from PIL import Image @@ -31,7 +31,7 @@ def get_data_key(self) -> str: def _get_hf_image_processor( self, model_config: "ModelConfig", - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, ): if mm_processor_kwargs is None: mm_processor_kwargs = {} diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index d79290ef98322..1882ffe9bf69f 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -2,9 +2,9 @@ import functools from collections import UserDict -from collections.abc import Mapping, Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Generic, Optional, Protocol, TypeVar +from typing import (TYPE_CHECKING, Any, Dict, Generic, Mapping, Optional, + Protocol, Sequence, Type, TypeVar) import torch.nn as nn @@ -29,7 +29,7 @@ logger = init_logger(__name__) -N = TypeVar("N", bound=type[nn.Module]) +N = TypeVar("N", bound=Type[nn.Module]) _I = TypeVar("_I", bound=BaseProcessingInfo) _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True) @@ -83,13 +83,13 @@ def build_processor( return self.processor(info, dummy_inputs_builder, cache=cache) -class _MultiModalLimits(UserDict["ModelConfig", dict[str, int]]): +class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): """ Wraps `_limits_by_model` for a more informative error message when attempting to access a model that does not exist. 
""" - def __getitem__(self, key: "ModelConfig") -> dict[str, int]: + def __getitem__(self, key: "ModelConfig") -> Dict[str, int]: try: return super().__getitem__(key) except KeyError as exc: @@ -170,7 +170,7 @@ def map_input( self, model_config: "ModelConfig", data: MultiModalDataDict, - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, ) -> MultiModalKwargs: """ Apply an input mapper to the data passed to the model. @@ -184,7 +184,7 @@ def map_input( Note: This should be called after :meth:`init_mm_limits_per_prompt`. """ - merged_dict: dict[str, NestedTensors] = {} + merged_dict: Dict[str, NestedTensors] = {} for data_key, data_value in data.items(): plugin = self._get_plugin(data_key) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 0b3d3f8c79d72..8004377191b38 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -4,7 +4,7 @@ from functools import partial from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import numpy as np import numpy.typing as npt @@ -39,7 +39,7 @@ def get_data_key(self) -> str: def _get_hf_video_processor( self, model_config: "ModelConfig", - mm_processor_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, ): if mm_processor_kwargs is None: mm_processor_kwargs = {} diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index cb880a23bd673..c6f3ccf0a3c49 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -5,7 +5,8 @@ import os from functools import lru_cache, wraps -from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union +from typing import (TYPE_CHECKING, Callable, List, Optional, Tuple, TypeVar, + Union) import torch from typing_extensions import ParamSpec @@ -99,7 +100,7 @@ def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: return True @classmethod - def is_full_nvlink(cls, device_ids: list[int]) -> bool: + def is_full_nvlink(cls, device_ids: List[int]) -> bool: raise NotImplementedError @classmethod @@ -286,7 +287,7 @@ def get_device_capability(cls, @with_nvml_context def has_device_capability( cls, - capability: Union[tuple[int, int], int], + capability: Union[Tuple[int, int], int], device_id: int = 0, ) -> bool: try: @@ -319,7 +320,7 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: @classmethod @with_nvml_context - def is_full_nvlink(cls, physical_device_ids: list[int]) -> bool: + def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: """ query if the set of gpus are fully connected by nvlink (1 hop) """ @@ -384,7 +385,7 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: return device_props.total_memory @classmethod - def is_full_nvlink(cls, physical_device_ids: list[int]) -> bool: + def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: logger.exception( "NVLink detection not possible, as context support was" " not found. 
Assuming no NVLink available.") diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index fb46e00c0bb0d..0e4988a4fa74d 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -4,7 +4,7 @@ import platform import random from platform import uname -from typing import TYPE_CHECKING, NamedTuple, Optional, Union +from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union import numpy as np import torch @@ -162,7 +162,7 @@ def get_device_capability( @classmethod def has_device_capability( cls, - capability: Union[tuple[int, int], int], + capability: Union[Tuple[int, int], int], device_id: int = 0, ) -> bool: """ diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 09aca00fd9e54..a4f18cbfc5871 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -2,7 +2,7 @@ import os from functools import lru_cache, wraps -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Dict, List, Optional import torch @@ -36,7 +36,7 @@ logger.warning("Failed to import from vllm._rocm_C with %r", e) # Models not supported by ROCm. -_ROCM_UNSUPPORTED_MODELS: list[str] = [] +_ROCM_UNSUPPORTED_MODELS: List[str] = [] # Models partially supported by ROCm. # Architecture -> Reason. @@ -44,7 +44,7 @@ "Triton flash attention. For half-precision SWA support, " "please use CK flash attention by setting " "`VLLM_USE_TRITON_FLASH_ATTN=0`") -_ROCM_PARTIALLY_SUPPORTED_MODELS: dict[str, str] = { +_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { "Qwen2ForCausalLM": _ROCM_SWA_REASON, "MistralForCausalLM": diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index d72ab2bd088c7..389cb87281031 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -2,7 +2,7 @@ import logging import os -from typing import Callable +from typing import Callable, Dict import torch @@ -14,7 +14,7 @@ plugins_loaded = False -def load_plugins_by_group(group: str) -> dict[str, Callable]: +def load_plugins_by_group(group: str) -> Dict[str, Callable]: import sys if sys.version_info < (3, 10): from importlib_metadata import entry_points diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 6934d328a87ef..6351ef63da2be 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -3,7 +3,7 @@ import copy from collections import defaultdict from dataclasses import asdict, dataclass, field -from typing import Any, Callable, Optional, TypeAlias, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, TypeAlias, Union import pandas as pd from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult @@ -20,7 +20,7 @@ class _ModuleTreeNode: event: _ProfilerEvent parent: Optional['_ModuleTreeNode'] = None - children: list['_ModuleTreeNode'] = field(default_factory=list) + children: List['_ModuleTreeNode'] = field(default_factory=list) trace: str = "" @property @@ -60,19 +60,19 @@ class ModelStatsEntry: @dataclass class _StatsTreeNode: entry: StatsEntry - children: list[StatsEntry] + children: List[StatsEntry] parent: Optional[StatsEntry] @dataclass class LayerwiseProfileResults(profile): _kineto_results: _ProfilerResult - _kineto_event_correlation_map: dict[int, - list[_KinetoEvent]] = field(init=False) - _event_correlation_map: dict[int, list[FunctionEvent]] = field(init=False) - _module_tree: list[_ModuleTreeNode] = field(init=False) - _model_stats_tree: list[_StatsTreeNode] = field(init=False) - _summary_stats_tree: list[_StatsTreeNode] = 
field(init=False) + _kineto_event_correlation_map: Dict[int, + List[_KinetoEvent]] = field(init=False) + _event_correlation_map: Dict[int, List[FunctionEvent]] = field(init=False) + _module_tree: List[_ModuleTreeNode] = field(init=False) + _model_stats_tree: List[_StatsTreeNode] = field(init=False) + _summary_stats_tree: List[_StatsTreeNode] = field(init=False) # profile metadata num_running_seqs: Optional[int] = None @@ -82,7 +82,7 @@ def __post_init__(self): self._build_module_tree() self._build_stats_trees() - def print_model_table(self, column_widths: dict[str, int] = None): + def print_model_table(self, column_widths: Dict[str, int] = None): _column_widths = dict(name=60, cpu_time_us=12, cuda_time_us=12, @@ -100,7 +100,7 @@ def print_model_table(self, column_widths: dict[str, int] = None): filtered_model_table, indent_style=lambda indent: "|" + "-" * indent + " ")) - def print_summary_table(self, column_widths: dict[str, int] = None): + def print_summary_table(self, column_widths: Dict[str, int] = None): _column_widths = dict(name=80, cuda_time_us=12, pct_cuda_time=12, @@ -142,7 +142,7 @@ def convert_stats_to_dict(self) -> dict[str, Any]: } @staticmethod - def _indent_row_names_based_on_depth(depths_rows: list[tuple[int, + def _indent_row_names_based_on_depth(depths_rows: List[Tuple[int, StatsEntry]], indent_style: Union[Callable[[int], str], @@ -229,7 +229,7 @@ def _total_cuda_time(self): [self._cumulative_cuda_time(root) for root in self._module_tree]) def _build_stats_trees(self): - summary_dict: dict[str, _StatsTreeNode] = {} + summary_dict: Dict[str, _StatsTreeNode] = {} total_cuda_time = self._total_cuda_time() def pct_cuda_time(cuda_time_us): @@ -238,7 +238,7 @@ def pct_cuda_time(cuda_time_us): def build_summary_stats_tree_df( node: _ModuleTreeNode, parent: Optional[_StatsTreeNode] = None, - summary_trace: tuple[str] = ()): + summary_trace: Tuple[str] = ()): if event_has_module(node.event): name = event_module_repr(node.event) @@ -313,8 +313,8 @@ def build_model_stats_tree_df(node: _ModuleTreeNode, self._model_stats_tree.append(build_model_stats_tree_df(root)) def _flatten_stats_tree( - self, tree: list[_StatsTreeNode]) -> list[tuple[int, StatsEntry]]: - entries: list[tuple[int, StatsEntry]] = [] + self, tree: List[_StatsTreeNode]) -> List[Tuple[int, StatsEntry]]: + entries: List[Tuple[int, StatsEntry]] = [] def df_traversal(node: _StatsTreeNode, depth=0): entries.append((depth, node.entry)) @@ -327,10 +327,10 @@ def df_traversal(node: _StatsTreeNode, depth=0): return entries def _convert_stats_tree_to_dict(self, - tree: list[_StatsTreeNode]) -> list[dict]: - root_dicts: list[dict] = [] + tree: List[_StatsTreeNode]) -> List[Dict]: + root_dicts: List[Dict] = [] - def df_traversal(node: _StatsTreeNode, curr_json_list: list[dict]): + def df_traversal(node: _StatsTreeNode, curr_json_list: List[Dict]): curr_json_list.append({ "entry": asdict(node.entry), "children": [] diff --git a/vllm/profiler/utils.py b/vllm/profiler/utils.py index b26fd4dd8c071..62b39f510703e 100644 --- a/vllm/profiler/utils.py +++ b/vllm/profiler/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import Callable, Union +from typing import Callable, Dict, List, Type, Union from torch._C._profiler import _EventType, _ProfilerEvent, _TensorMetadata @@ -30,14 +30,14 @@ def trim_string_back(string, width): class TablePrinter: - def __init__(self, row_cls: type[dataclasses.dataclass], - column_widths: dict[str, int]): + def __init__(self, row_cls: 
Type[dataclasses.dataclass], + column_widths: Dict[str, int]): self.row_cls = row_cls self.fieldnames = [x.name for x in dataclasses.fields(row_cls)] self.column_widths = column_widths assert set(self.column_widths.keys()) == set(self.fieldnames) - def print_table(self, rows: list[dataclasses.dataclass]): + def print_table(self, rows: List[dataclasses.dataclass]): self._print_header() self._print_line() for row in rows: diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py index bd1dc51c468b1..795591606f259 100644 --- a/vllm/prompt_adapter/models.py +++ b/vllm/prompt_adapter/models.py @@ -2,7 +2,7 @@ import logging import math -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, List, Optional, Type import torch from torch import nn @@ -45,7 +45,7 @@ def convert_to_embedding_indices(indices): def convert_mapping( mapping: PromptAdapterMapping, - prompt_adapter_index_to_id: list[Optional[int]], + prompt_adapter_index_to_id: List[Optional[int]], ) -> torch.Tensor: """Converts PromptAdapterMapping to index tensors. @@ -127,8 +127,8 @@ def __init__( prompt_adapter_config: the PromptAdapter config, """ self.model: nn.Module = model - # dict instead of a Set for compatibility with LRUCache. - self.prompt_adapter_index_to_id: list[ + # Dict instead of a Set for compatibility with LRUCache. + self.prompt_adapter_index_to_id: List[ Optional[int]] = [None] * self.prompt_adapter_slots self.max_num_seqs = max_num_seqs self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 @@ -139,7 +139,7 @@ def __init__( self.base_indices = torch.tensor([-1]) self.base_embedding_indices = torch.tensor([]) - self.modules: dict[str, nn.Module] = {} + self.modules: Dict[str, nn.Module] = {} self._create_prompt_adapter_modules() self._last_mapping: Optional[PromptAdapterMapping] = None @@ -252,7 +252,7 @@ def remove_adapter(self, adapter_id: int) -> bool: return remove_adapter(adapter_id, self._registered_adapters, self.deactivate_adapter) - def list_adapters(self) -> dict[int, Any]: + def list_adapters(self) -> Dict[int, Any]: return list_adapters(self._registered_adapters) def get_adapter(self, adapter_id: int) -> Optional[Any]: @@ -284,7 +284,7 @@ def __init__( self._active_adapters = PromptAdapterLRUCache( self.prompt_adapter_slots, self._deactivate_adapter) - def list_adapters(self) -> dict[int, PromptAdapterModel]: + def list_adapters(self) -> Dict[int, PromptAdapterModel]: """List all registered PromptAdapterModel.""" return dict(self._registered_adapters.cache) @@ -344,7 +344,7 @@ def create_prompt_adapter_manager( max_num_seqs: int, max_num_batched_tokens: int, prompt_adapter_config: PromptAdapterConfig, - prompt_adapter_manager_cls: type[ + prompt_adapter_manager_cls: Type[ PromptAdapterModelManager] = PromptAdapterModelManager, **kwargs) -> PromptAdapterModelManager: """Create a PromptAdapterModel for a given model.""" diff --git a/vllm/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py index dbf82ab698480..28dcc16871120 100644 --- a/vllm/prompt_adapter/worker_manager.py +++ b/vllm/prompt_adapter/worker_manager.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Any, Optional +from typing import Any, Optional, Set, Type import torch @@ -28,7 +28,7 @@ class WorkerPromptAdapterManager(AbstractWorkerManager): loaded (unless they are already loaded), and every other prompt_adapter will be unloaded.""" - _manager_cls: type[PromptAdapterModelManager] = PromptAdapterModelManager + 
_manager_cls: Type[PromptAdapterModelManager] = PromptAdapterModelManager def __init__( self, @@ -36,7 +36,7 @@ def __init__( max_num_batched_tokens: int, device: torch.device, prompt_adapter_config: PromptAdapterConfig, - prompt_adapter_model_cls: type[PromptAdapterModel] = PromptAdapterModel + prompt_adapter_model_cls: Type[PromptAdapterModel] = PromptAdapterModel ): self._adapter_manager: PromptAdapterModelManager self.max_num_seqs = max_num_seqs @@ -90,7 +90,7 @@ def add_dummy_prompt_adapter( def pin_adapter(self, adapter_id: int) -> bool: return self._adapter_manager.pin_adapter(adapter_id) - def set_active_adapters(self, requests: set[Any], + def set_active_adapters(self, requests: Set[Any], mapping: Optional[Any]) -> None: set_active_adapters_worker(requests, mapping, self._apply_adapters, self._adapter_manager.set_adapter_mapping) @@ -101,7 +101,7 @@ def add_adapter(self, adapter_request: Any) -> bool: self._adapter_manager.add_adapter, self._adapter_manager.activate_adapter) - def _apply_adapters(self, adapter_requests: set[Any]) -> None: + def _apply_adapters(self, adapter_requests: Set[Any]) -> None: apply_adapters_worker(adapter_requests, self.list_adapters, self._adapter_manager.adapter_slots, self.remove_adapter, self.add_adapter) @@ -112,7 +112,7 @@ def remove_adapter(self, adapter_id: int) -> bool: def remove_all_adapters(self): self._adapter_manager.remove_all_adapters() - def list_adapters(self) -> set[int]: + def list_adapters(self) -> Set[int]: return list_adapters_worker(self._adapter_manager.list_adapters) @@ -125,7 +125,7 @@ class LRUCacheWorkerPromptAdapterManager(WorkerPromptAdapterManager): and least recently used prompt_adapters will be unloaded if the cache is above capacity.""" - _prompt_adapter_manager_cls: type[ + _prompt_adapter_manager_cls: Type[ LRUCachePromptAdapterModelManager] = LRUCachePromptAdapterModelManager def create_prompt_adapter_manager( @@ -143,7 +143,7 @@ def create_prompt_adapter_manager( return prompt_adapter_manager.model def _apply_adapters( - self, prompt_adapter_requests: set[PromptAdapterRequest]) -> None: + self, prompt_adapter_requests: Set[PromptAdapterRequest]) -> None: prompt_adapters_map = { prompt_adapter_request.prompt_adapter_id: prompt_adapter_request for prompt_adapter_request in prompt_adapter_requests diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 9d2524656a52d..e08ed742a5225 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 from array import array -from collections.abc import Iterator from itertools import chain, count -from typing import Optional +from typing import Iterator, List, Optional, Tuple import torch @@ -104,10 +103,10 @@ def score_proposals( def _expand_batch( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - proposal_token_ids_list: list[list[TokenId]], - proposal_lens_list: list[int], - ) -> tuple[list[int], list[int], list[SequenceGroupMetadata], int]: + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids_list: List[List[TokenId]], + proposal_lens_list: List[int], + ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]: """Given the input sequences and potentially multiple corresponding proposal tokens, create a new batch where each sequence has a single query token. 
@@ -140,8 +139,8 @@ def _expand_batch( def _contract_non_speculative( self, scores: SpeculativeScores, - seq_group_metadata_list: list[SequenceGroupMetadata], - non_spec_indices: list[int], non_spec_outputs: SpeculativeScores, + seq_group_metadata_list: List[SequenceGroupMetadata], + non_spec_indices: List[int], non_spec_outputs: SpeculativeScores, has_prompt_log: bool) -> SpeculativeScores: """ Augment input `scores` with non-speculative requests outputs. @@ -184,10 +183,10 @@ def _contract_non_speculative( def _contract_batch( self, - contracted_seq_group_metadata_list: list[SequenceGroupMetadata], + contracted_seq_group_metadata_list: List[SequenceGroupMetadata], target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, num_scoring_tokens: int, - non_spec_indices: list[int], spec_indices: list[int], + non_spec_indices: List[int], spec_indices: List[int], k: int) -> SpeculativeScores: """Contract the expanded batch back into its original size. This maps the scores of speculative tokens back to their original @@ -315,10 +314,10 @@ def _contract_batch_all_spec( def _create_scoring_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - proposal_token_ids: list[list[TokenId]], # shape: [batch_size, k] + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] target_seq_ids_iter: Iterator[TargetSeqId], - ) -> list[SequenceGroupMetadata]: + ) -> List[SequenceGroupMetadata]: """Given the original input sequences and proposed tokens from the draft model, create a list of target sequences that can be used for scoring. @@ -345,10 +344,10 @@ def _create_scoring_model_input( def _create_target_seq_group_metadata( self, input_seq_group_metadata: SequenceGroupMetadata, - proposal_token_ids: list[list[TokenId]], # shape: [batch_size, k] + proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] batch_index: int, target_seq_ids_iter: Iterator[TargetSeqId], - ) -> list[SequenceGroupMetadata]: + ) -> List[SequenceGroupMetadata]: """Given an input sequence group metadata and a list of draft tokens, create a list of target SequenceGroupMetadata, one for each token id that needs to be scored. @@ -368,7 +367,7 @@ def _create_target_seq_group_metadata( proposal_token_ids[batch_index]) sampling_params = input_seq_group_metadata.sampling_params - target_seq_group_metadata_list: list[SequenceGroupMetadata] = [] + target_seq_group_metadata_list: List[SequenceGroupMetadata] = [] for i, token_ids in enumerate(token_ids_to_score): target_seq_group_metadata_list.append( self._create_single_target_seq_group_metadata( @@ -386,7 +385,7 @@ def _create_single_target_seq_group_metadata( seq_group_metadata: SequenceGroupMetadata, seq_id: SeqId, target_seq_id: TargetSeqId, - token_ids: list[TokenId], + token_ids: List[TokenId], sampling_params: SamplingParams, ) -> SequenceGroupMetadata: """Create a single target SequenceGroupMetadata. 
@@ -434,7 +433,7 @@ def _create_single_target_seq_group_metadata( @staticmethod def _split_scoring_output( sampler_output: SamplerOutput, num_scoring_tokens: int - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """Split the target model output into speculative and non-speculative @@ -469,7 +468,7 @@ def _split_scoring_output( @staticmethod def _create_target_seq_id_iterator( - seq_ids: list[SeqId]) -> Iterator[TargetSeqId]: + seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: """Create an iterator for creating target sequence ids. Target sequence ids are distinct from sequence ids because we create a distinct target sequence id for each proposal token to be scored. @@ -481,8 +480,8 @@ def _create_target_seq_id_iterator( @staticmethod def _get_token_ids_to_score( - full_spec_token_ids: list[TokenId] # shape: [k] - ) -> list[list[TokenId]]: + full_spec_token_ids: List[TokenId] # shape: [k] + ) -> List[List[TokenId]]: """Given an int tensor of proposal token ids, return a list of token ids that should be scored. @@ -498,7 +497,7 @@ def _get_token_ids_to_score( [0, 1, 2] [0, 1, 2, 3] """ - empty_token_ids: list[TokenId] = [] + empty_token_ids: List[TokenId] = [] token_ids_to_score = [empty_token_ids] token_ids_to_score.extend(full_spec_token_ids[:i + 1] diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index eb1bb04cdaa8f..c54e6abe18d73 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import List, Optional import torch @@ -171,12 +171,12 @@ def set_indices_of_seq_with_bonus_tokens(self, def execute_model( self, model_input: ModelRunnerInputBase, - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], previous_hidden_states: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, **kwargs, - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: """Executes num_steps forward passes with advacement of input tensors on the GPU. Look at supports_gpu_multi_step(..) for pre-conditions. @@ -268,7 +268,7 @@ def execute_model( model_executable = self.model hidden_states = previous_hidden_states - outputs: list[SamplerOutput] = [] + outputs: List[SamplerOutput] = [] for step in range(num_steps): multi_modal_kwargs = model_input.multi_modal_kwargs or {} diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index 3ae9c10f31e4b..dd085ad776384 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Optional, Union +from typing import List, Optional, Set, Union import torch @@ -58,7 +58,7 @@ class SpeculativeScores: # Scoring model may also return logprobs for prompt tokens # for each request, when chunked prefill is enabled. - prompt_logprobs: Optional[list[PromptLogprobs]] = None + prompt_logprobs: Optional[List[PromptLogprobs]] = None def __repr__(self): return (f"SpeculativeScores(" @@ -74,7 +74,7 @@ def get_spec_proposals( execute_model_req: ExecuteModelRequest, # If set, this contains all sequence IDs that were assigned # bonus tokens in their last forward pass. 
- seq_ids_with_bonus_token_in_last_step: set[int], + seq_ids_with_bonus_token_in_last_step: Set[int], ) -> SpeculativeProposals: raise NotImplementedError diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py index 9aa3bda34b669..0b62a988e8b26 100644 --- a/vllm/spec_decode/medusa_worker.py +++ b/vllm/spec_decode/medusa_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import weakref -from typing import Optional +from typing import List, Optional, Set, Tuple import torch @@ -45,8 +45,8 @@ def sampler_output( execute_model_req: ExecuteModelRequest, sample_len: int, # Unused parameter. - seq_ids_with_bonus_token_in_last_step: set[int], - ) -> tuple[list[SamplerOutput], bool]: + seq_ids_with_bonus_token_in_last_step: Set[int], + ) -> Tuple[List[SamplerOutput], bool]: """Run the model forward pass to generate sample_len future tokens. Returns the list of sampler output, one per layer, along with indicator of whether torch tensor in sampler output need to be transposed in @@ -76,13 +76,13 @@ def sampler_output( def _prepare_input_tensors( self, - seq_group_metadata_list: Optional[list[SequenceGroupMetadata]], - ) -> tuple[list[int], list[int]]: + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + ) -> Tuple[List[int], List[int]]: if not seq_group_metadata_list: return [], [] - seq_lens: list[int] = [] - query_lens: list[int] = [] + seq_lens: List[int] = [] + query_lens: List[int] = [] for seq_group_metadata in seq_group_metadata_list: is_prompt = seq_group_metadata.is_prompt @@ -105,7 +105,7 @@ def _prepare_input_tensors( def get_spec_proposals( self, execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: set[int], + seq_ids_with_bonus_token_in_last_step: Set[int], ) -> SpeculativeProposals: """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py index 5074a4f93fdd1..bdaf31895e25d 100644 --- a/vllm/spec_decode/mlp_speculator_worker.py +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import List, Optional, Set, Tuple import torch @@ -24,8 +24,8 @@ def sampler_output( sample_len: int, # Unused parameter. MLPSpeculatorWorker does not use the KV Cache and # therefore does not need this parameter. - seq_ids_with_bonus_token_in_last_step: set[int], - ) -> tuple[list[SamplerOutput], bool]: + seq_ids_with_bonus_token_in_last_step: Set[int], + ) -> Tuple[List[SamplerOutput], bool]: """Run the model forward pass to generate sample_len future tokens. 
Returns the list of sampler output, one per layer, along with indicator of whether torch tensor in sampler output need to be transposed in @@ -59,14 +59,14 @@ def sampler_output( def _prepare_input_tensors( self, - seq_group_metadata_list: Optional[list[SequenceGroupMetadata]], - ) -> tuple[torch.Tensor, list[int], list[int]]: + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + ) -> Tuple[torch.Tensor, List[int], List[int]]: if not seq_group_metadata_list: return torch.empty(0, device=self.device), [], [] - input_tokens: list[int] = [] - seq_lens: list[int] = [] - query_lens: list[int] = [] + input_tokens: List[int] = [] + seq_lens: List[int] = [] + query_lens: List[int] = [] for seq_group_metadata in seq_group_metadata_list: is_prompt = seq_group_metadata.is_prompt diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 0b414fa505741..c28d413efe747 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -2,6 +2,7 @@ import copy import weakref +from typing import Dict, List, Set, Tuple import torch @@ -60,8 +61,8 @@ def sampler_output( self, execute_model_req: ExecuteModelRequest, sample_len: int, - seq_ids_with_bonus_token_in_last_step: set[int], - ) -> tuple[list[SamplerOutput], bool]: + seq_ids_with_bonus_token_in_last_step: Set[int], + ) -> Tuple[List[SamplerOutput], bool]: """Run the model forward pass sample_len times. Returns the list of sampler output, one per model forward pass, along with indicator of whether torch tensor in sampler output need to be transposed in latter @@ -78,7 +79,7 @@ def sampler_output( execute_model_req, seq_ids_with_bonus_token_in_last_step) # Run model sample_len times. - model_outputs: list[SamplerOutput] = [] + model_outputs: List[SamplerOutput] = [] if current_platform.is_cuda_alike() and isinstance( self.model_runner, TP1DraftModelRunner ) and self.model_runner.supports_gpu_multi_step(expanded_request): @@ -96,7 +97,7 @@ def sampler_output( # and other restrictions that are part of DraftModelRunner's # supports_gpu_multi_step(..) for _ in range(sample_len): - model_output: list[SamplerOutput] = self.worker.execute_model( + model_output: List[SamplerOutput] = self.worker.execute_model( execute_model_req=expanded_request) assert (len(model_output) == 1 ), "composing multistep workers not supported" @@ -118,7 +119,7 @@ def sampler_output( def _expand_execute_model_request( execute_model_req: ExecuteModelRequest, seq_with_bonus_token_in_last_step: set, - ) -> tuple[ExecuteModelRequest, list[int]]: + ) -> Tuple[ExecuteModelRequest, List[int]]: """ Expands the execute model request based on sequences with bonus tokens. @@ -135,11 +136,11 @@ def _expand_execute_model_request( contain bonus tokens. Returns: - tuple[ExecuteModelRequest, list[int]]: The updated execute model + Tuple[ExecuteModelRequest, List[int]]: The updated execute model request with expanded sequences and a list of indices corresponding to the original sequence groups. 
""" - updated_seq_group_metadata_list: list[SequenceGroupMetadata] = [] + updated_seq_group_metadata_list: List[SequenceGroupMetadata] = [] updated_execute_model_req = execute_model_req.clone( updated_seq_group_metadata_list) indices_of_original_sequence_groups = [] @@ -178,8 +179,8 @@ def _expand_execute_model_request( @staticmethod def _filter_model_output( - expanded_batch_outputs: list[SamplerOutput], - output_indices_to_retain: torch.Tensor) -> list[SamplerOutput]: + expanded_batch_outputs: List[SamplerOutput], + output_indices_to_retain: torch.Tensor) -> List[SamplerOutput]: """ Filters the model output to include only the specified sequence outputs. This method contracts the expanded batch output from the @@ -187,13 +188,13 @@ def _filter_model_output( provided indices. Args: - expanded_batch_output (list[SamplerOutput]): The expanded output + expanded_batch_output (List[SamplerOutput]): The expanded output batch from the model. output_indices_to_retain (torch.Tensor): Indices of the model outputs to retain. Returns: - list[SamplerOutput]: A list containing the filtered model + List[SamplerOutput]: A list containing the filtered model outputs for the specified indices. """ return [ @@ -230,9 +231,9 @@ def get_spec_proposals( @staticmethod def _append_new_tokens( - model_output: list[SamplerOutput], - seq_group_metadata_list: list[SequenceGroupMetadata], - indices_of_seq_with_bonus_tokens: list[int]) -> None: + model_output: List[SamplerOutput], + seq_group_metadata_list: List[SequenceGroupMetadata], + indices_of_seq_with_bonus_tokens: List[int]) -> None: """Given model output from a single run, append the tokens to the sequences. This is normally done outside of the worker, but it is required if the worker is to perform multiple forward passes. @@ -279,7 +280,7 @@ def _shallow_copy_seq_group_metadata( new_seq_group_metadata = copy.copy(seq_group_metadata) # We must shallow-copy seq_data as we will append token ids - new_seq_data: dict[int, SequenceData] = {} + new_seq_data: Dict[int, SequenceData] = {} for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): new_seq_data[seq_id] = copy.copy(old_seq_data) new_seq_data[seq_id].output_token_ids =\ @@ -291,7 +292,7 @@ def _shallow_copy_seq_group_metadata( @staticmethod def _copy_seq_metadata_excluding_last_token( seq_group_metadata: SequenceGroupMetadata, - seq_ids_to_copy: set[int], + seq_ids_to_copy: Set[int], ) -> SequenceGroupMetadata: """ Creates a shallow copy of the given SequenceGroupMetadata, retaining @@ -302,7 +303,7 @@ def _copy_seq_metadata_excluding_last_token( Parameters: seq_group_metadata (SequenceGroupMetadata): The original sequence group metadata. - seq_ids_to_copy (set[int]): The set of sequence IDs to include in the + seq_ids_to_copy (Set[int]): The set of sequence IDs to include in the copy. Returns: @@ -312,7 +313,7 @@ def _copy_seq_metadata_excluding_last_token( # Shallow-copy the SequenceGroupMetadata. new_seq_group_metadata = copy.copy(seq_group_metadata) # Shallow-copy seq_data and modify the output_token_ids. 
- new_seq_data: dict[int, SequenceData] = {} + new_seq_data: Dict[int, SequenceData] = {} for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): if (seq_id in seq_ids_to_copy): new_seq_data[seq_id] = copy.copy(old_seq_data) @@ -331,7 +332,7 @@ def _copy_seq_metadata_excluding_last_token( return new_seq_group_metadata def _assert_enough_kv_space( - self, seq_group_metadata_list: list[SequenceGroupMetadata], + self, seq_group_metadata_list: List[SequenceGroupMetadata], num_steps: int) -> None: """Assert there are enough physical blocks per sequence to store the current KV plus additional KV from num_steps tokens. diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 7e89e824b67b9..57ae173af6744 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import weakref -from typing import Optional +from typing import List, Optional, Set, Tuple import torch import torch.nn as nn @@ -71,8 +71,8 @@ def sampler_output( sample_len: int, # Unused parameter. NGramWorker does not use the KV Cache and # therefore does not need this parameter. - seq_ids_with_bonus_token_in_last_step: set[int], - ) -> tuple[Optional[list[Optional[SamplerOutput]]], bool]: + seq_ids_with_bonus_token_in_last_step: Set[int], + ) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]: """NGram match algo to pick proposal candidate. Returns the list of sampler output, one per SequenceGroupMetadata. @@ -82,8 +82,8 @@ def sampler_output( self._raise_if_unsupported(execute_model_req) has_spec_out = False - token_id_list: list[Optional[torch.Tensor]] = [] - token_prob_list: list[Optional[torch.Tensor]] = [] + token_id_list: List[Optional[torch.Tensor]] = [] + token_prob_list: List[Optional[torch.Tensor]] = [] for idx, seq_group_metadata in enumerate( execute_model_req.seq_group_metadata_list): seq_data = next(iter(seq_group_metadata.seq_data.values())) @@ -142,7 +142,7 @@ def sampler_output( if not has_spec_out: return None, False - outputs: list[Optional[SamplerOutput]] = [] + outputs: List[Optional[SamplerOutput]] = [] for idx in range(len(execute_model_req.seq_group_metadata_list)): if token_id_list[idx] is None: outputs.append(None) @@ -164,7 +164,7 @@ def get_spec_proposals( execute_model_req: ExecuteModelRequest, # Unused parameter. NGramWorker does not use the KV Cache and # therefore does not need this parameter. - seq_ids_with_bonus_token_in_last_step: set[int], + seq_ids_with_bonus_token_in_last_step: Set[int], ) -> SpeculativeProposals: """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index bb1b11465e525..2829d631b49ee 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Optional +from typing import List, Optional, Set, Tuple from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest @@ -23,8 +23,8 @@ def sampler_output( # This parameter is only used by the MultiStepWorker, which relies on # the KV cache for token generation. It is not used by workers that # do not utilize the KV cache. 
- seq_ids_with_bonus_token_in_last_step: set[int] - ) -> tuple[Optional[list[SamplerOutput]], bool]: + seq_ids_with_bonus_token_in_last_step: Set[int] + ) -> Tuple[Optional[List[SamplerOutput]], bool]: raise NotImplementedError def set_include_gpu_probs_tensor(self) -> None: @@ -42,11 +42,11 @@ class NonLLMProposerWorkerBase(ProposerWorkerBase, ABC): def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> list[SamplerOutput]: + ) -> List[SamplerOutput]: """get_spec_proposals is used to get the proposals""" return [] - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """This is never called on the proposer, only the target model""" raise NotImplementedError diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 87f5803b33d2d..6919562465097 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import List, Optional, Set, Tuple import torch import torch.nn as nn @@ -48,12 +48,12 @@ def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, logger.info("Wrapping {%s} in {%s}", type(worker), cls) return cls(worker, draft_ranks) - def __init__(self, worker: MultiStepWorker, draft_ranks: list[int]): + def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]): """Create a SmallerTpProposerWorker. Args: worker (MultiStepWorker): an actual worker wrapped with this class - draft_ranks (list[int]): if this value is given, only the GPU ranks + draft_ranks (List[int]): if this value is given, only the GPU ranks written in this value participate in draft generation """ self._worker = worker @@ -105,7 +105,7 @@ def load_model(self) -> None: with self._patch_tensor_parallel_group(): self._worker.load_model() - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: if self._is_dummy: # this case is not used now return -1, -1 @@ -125,8 +125,8 @@ def sampler_output( self, execute_model_req: ExecuteModelRequest, sample_len: int, - seq_ids_with_bonus_token_in_last_step: set[int], - ) -> tuple[list[SamplerOutput], bool]: + seq_ids_with_bonus_token_in_last_step: Set[int], + ) -> Tuple[List[SamplerOutput], bool]: # Do not check _is_dummy, as it's always called by get_spec_proposals return self._worker.sampler_output( execute_model_req, sample_len, @@ -135,7 +135,7 @@ def sampler_output( def get_spec_proposals( self, execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: set[int], + seq_ids_with_bonus_token_in_last_step: Set[int], ) -> SpeculativeProposals: """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. 
@@ -157,7 +157,7 @@ def get_model(self) -> nn.Module: def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> list[SamplerOutput]: + ) -> List[SamplerOutput]: if self._is_dummy: return [] diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 9c48e231c2d4d..871a3aee63063 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -3,7 +3,7 @@ import copy from collections import defaultdict from functools import cached_property -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Set, Tuple, Type import torch import torch.nn as nn @@ -148,7 +148,7 @@ class SpecDecodeWorker(LoRANotSupportedWorkerBase): def create_worker( cls, scorer_worker: WorkerBase, - draft_worker_kwargs: dict[str, Any], + draft_worker_kwargs: Dict[str, Any], disable_mqa_scorer: bool, disable_by_batch_size: Optional[int], draft_token_acceptance_method: str, @@ -324,10 +324,10 @@ def __init__( # Tracks the sequence IDs that received a bonus token ID in # their last forward pass. Needed only if KV cache is being # used for token generation such as in the case of MultiStepWorker. - self._seq_with_bonus_token_in_last_step: set[int] = set() + self._seq_with_bonus_token_in_last_step: Set[int] = set() # Tracks the currently active request ids and the sequence IDs # corresponding to them - self._request_id_seq_id_mapping: dict[str, set[int]] = defaultdict(set) + self._request_id_seq_id_mapping: Dict[str, Set[int]] = defaultdict(set) # Tracks if the proposer worker uses the KV cache or not. self.probs_dtype = self.spec_decode_sampler.probs_dtype @@ -374,7 +374,7 @@ def init_device(self) -> None: self.spec_decode_sampler.init_tensors(self.rank, device_type=self.device) - scorer_cls: type[SpeculativeScorer] + scorer_cls: Type[SpeculativeScorer] if self.disable_mqa_scorer: scorer_cls = BatchExpansionTop1Scorer logger.info("[Speculative Decoding] Use batch " @@ -419,7 +419,7 @@ def _configure_model_sampler_for_spec_decode(self): self.proposer_worker.set_include_gpu_probs_tensor() self.proposer_worker.set_should_modify_greedy_probs_inplace() - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of cache blocks to use. This is done by profiling the scorer model (which is typically the @@ -456,7 +456,7 @@ def get_model(self) -> nn.Module: def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> list[SamplerOutput]: + ) -> List[SamplerOutput]: """Perform speculative decoding on the input batch. """ if self.rank != self._driver_rank: @@ -560,7 +560,7 @@ def _should_disable_all_speculation( def _maybe_disable_speculative_tokens( self, disable_all_speculation: bool, - seq_group_metadata_list: list[SequenceGroupMetadata]) -> None: + seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: if not disable_all_speculation: return @@ -574,7 +574,7 @@ def _maybe_disable_speculative_tokens( def _serialize_sampler_output_no_logprobs( self, execute_model_req: ExecuteModelRequest, - sampler_output: SamplerOutput) -> list[SamplerOutput]: + sampler_output: SamplerOutput) -> List[SamplerOutput]: """ Creates and returns a `SamplerOutput` with only the token IDs being serialized to CPU and populated in `CompletionSequenceGroupOutput`. 
@@ -609,7 +609,7 @@ def _serialize_sampler_output_no_logprobs( execute_model_req.seq_group_metadata_list \ for seq_id, seq_data in sg.seq_data.items() ] - completion_seq_group_output_list: list[ + completion_seq_group_output_list: List[ CompletionSequenceGroupOutput] = [] output_index = 0 # Make sure the non-terminal prefill chunks are still aligned with @@ -664,7 +664,7 @@ def _serialize_sampler_output_no_logprobs( @nvtx_range("spec_decode_worker._run_no_spec") def _run_no_spec(self, execute_model_req: ExecuteModelRequest, - skip_proposer: bool) -> list[SamplerOutput]: + skip_proposer: bool) -> List[SamplerOutput]: """Run a single generation step without any speculation. The input is sent to the proposer and scorer model so that the KV cache is consistent between the two. When skip_proposer is True, the proposer model is @@ -759,7 +759,7 @@ def _run_non_driver_rank(self) -> bool: @nvtx_range("spec_decode_worker._run_speculative_decoding_step") def _run_speculative_decoding_step( self, execute_model_req: ExecuteModelRequest, - num_lookahead_slots: int) -> list[SamplerOutput]: + num_lookahead_slots: int) -> List[SamplerOutput]: """Execute a single step of speculative decoding. This invokes the proposer worker to get k speculative tokens for each @@ -838,11 +838,11 @@ def _run_speculative_decoding_step( @nvtx_range("spec_decode_worker._verify_tokens") def _verify_tokens( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], proposal_scores: SpeculativeScores, proposals: SpeculativeProposals, max_proposal_len: int, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: """Determine which speculative tokens are accepted using the probabilities of each token according to the proposer and scorer models. @@ -875,7 +875,7 @@ def _verify_tokens( proposal_token_ids = proposals.proposal_token_ids[spec_indices] # Sampler arguments - sampler_extra_kwargs: dict[str, Any] = {} + sampler_extra_kwargs: Dict[str, Any] = {} if self.generators and isinstance(self.spec_decode_sampler, SpecDecodeStochasticBaseSampler): sampler_extra_kwargs["seeded_seqs"] = { @@ -934,14 +934,14 @@ def _verify_tokens( def _create_output_sampler_list( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] target_logprobs: torch.Tensor, # shape: [batch_size, k+1, vocab_size] prompt_logprobs: Optional[ torch.Tensor], # shape: [nprompt_tokens, vocab_size] k: int, - stage_times: tuple[float, float, float], - ) -> list[SamplerOutput]: + stage_times: Tuple[float, float, float], + ) -> List[SamplerOutput]: """Given the accepted token ids, create a list of SamplerOutput. The output is padded with -1 tokens such that each sequence has @@ -984,7 +984,7 @@ def _create_output_sampler_list( # Non-terminal prefill chunks will end up here as rows with just -1s # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] while # terminal chunks will only have one generated token at time 0. - sampler_output_list: list[SamplerOutput] = [] + sampler_output_list: List[SamplerOutput] = [] # Prefills are not multi-step (return at most 1 token), in order to # avoid padding or repetition to fit decodes, we separate them. 
@@ -1058,7 +1058,7 @@ def _create_output_sampler_list( if not sg.is_prompt): break - step_output_token_ids: list[CompletionSequenceGroupOutput] = [] + step_output_token_ids: List[CompletionSequenceGroupOutput] = [] for sequence_index in range(batch_size): seq_meta = seq_group_metadata_list[sequence_index] # Prompts already processed above. @@ -1124,9 +1124,9 @@ def _create_dummy_logprob_lists( batch_size: int, num_steps: int, num_top_k: int, - ) -> tuple[list[list[int]], list[list[float]], - list[list[list[Optional[float]]]], - list[list[list[Optional[int]]]]]: + ) -> Tuple[List[List[int]], List[List[float]], + List[List[List[Optional[float]]]], + List[List[List[Optional[int]]]]]: """ Creates and returns four dummy lists representing token probabilities and their ranks. @@ -1153,10 +1153,10 @@ def _create_dummy_logprob_lists( for _ in range(num_steps)] accepted_token_id_logprobs_by_step = [[0.0] * batch_size for _ in range(num_steps)] - topk_logprobs_by_step: list[list[list[Optional[float]]]] = [[ + topk_logprobs_by_step: List[List[List[Optional[float]]]] = [[ [None] * num_top_k for _ in range(batch_size) ] for _ in range(num_steps)] - topk_indices_by_step: list[list[list[Optional[int]]]] = [[ + topk_indices_by_step: List[List[List[Optional[int]]]] = [[ [None] * num_top_k for _ in range(batch_size) ] for _ in range(num_steps)] return (accepted_token_id_ranks_by_step, @@ -1168,9 +1168,9 @@ def _create_logprob_lists_from_tensors( target_logprobs_by_step: torch.Tensor, accepted_token_ids_by_step: torch.Tensor, num_top_k: int, - ) -> tuple[list[list[int]], list[list[float]], - list[list[list[Optional[float]]]], - list[list[list[Optional[int]]]]]: + ) -> Tuple[List[List[int]], List[List[float]], + List[List[List[Optional[float]]]], + List[List[List[Optional[int]]]]]: """ Creates and returns four lists representing token probabilities and their ranks. @@ -1232,9 +1232,9 @@ def _track_finished_requests(self, execute_model_req: ExecuteModelRequest): del self._request_id_seq_id_mapping[finished_request] def _track_sequences_with_bonus_tokens( - self, seq_ids: list[int], - request_ids_seq_ids_mapping: dict[str, set[int]], - accepted_token_ids_by_step: list[list[int]]): + self, seq_ids: List[int], + request_ids_seq_ids_mapping: Dict[str, Set[int]], + accepted_token_ids_by_step: List[List[int]]): """ Updates the internal data structures which keep track of sequences which have been assigned bonus tokens in their last forward pass. 
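
As a rough illustration of the padding scheme described in `_create_output_sampler_list` above (accepted token ids arrive as a `[batch_size, k + 1]` matrix padded with -1 for rejected positions and are then emitted step by step), the following self-contained snippet mirrors the idea; it is a simplified sketch with made-up token ids, not the worker code:

    # Hypothetical accepted ids for three sequences with k = 2 speculative tokens.
    accepted_token_ids = [
        [11, 12, -1],  # sequence 0: two tokens emitted
        [21, -1, -1],  # sequence 1: one token emitted
        [31, 32, 33],  # sequence 2: all k + 1 tokens emitted
    ]

    # Transpose to step-major order and drop the -1 padding for each step.
    for step, token_ids in enumerate(zip(*accepted_token_ids)):
        emitted = {seq: tok for seq, tok in enumerate(token_ids) if tok != -1}
        print(f"step {step}: {emitted}")
    # step 0: {0: 11, 1: 21, 2: 31}
    # step 1: {0: 12, 2: 32}
    # step 2: {2: 33}
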
diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py index b13a070c2e454..08e773c562bf8 100644 --- a/vllm/spec_decode/target_model_runner.py +++ b/vllm/spec_decode/target_model_runner.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import List, Optional from vllm.sequence import SequenceGroupMetadata from vllm.worker.model_runner_base import (ModelRunnerBase, @@ -28,9 +28,9 @@ def __init__(self, model_runner: ModelRunnerBase): def prepare_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None, + finished_requests_ids: Optional[List[str]] = None, ) -> ModelRunnerInputBase: model_input: ModelRunnerInputBase =\ self.model_runner.prepare_model_input( diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 14d5891c12a71..b538923c03e74 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import List, Optional, Set, Tuple import torch @@ -44,7 +44,7 @@ def __init__( def get_spec_proposals( self, execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: set[int], + seq_ids_with_bonus_token_in_last_step: Set[int], ) -> SpeculativeProposals: """Get speculative proposals given the input batch. @@ -115,18 +115,18 @@ def get_spec_proposals( def _split_by_proposal_len( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], proposal_len: int, - ) -> tuple[list[int], list[SequenceGroupMetadata], list[int]]: + ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]: """Split sequences by two groups: 1. Sequences with non-zero proposal length. 2. Sequences with zero proposal length (due to disabled speculation or exceed the maximum model length). """ - proposal_lens: list[int] = [] - nonzero_proposal_len_seqs: list[SequenceGroupMetadata] = [] - nonzero_proposal_len_indices: list[int] = [] + proposal_lens: List[int] = [] + nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = [] + nonzero_proposal_len_indices: List[int] = [] for i, seq_group_metadata in enumerate(seq_group_metadata_list): # The speculative decoding for this request has either been disabled # (e.g. due to high traffic) or this is a prompt request. 
@@ -174,9 +174,9 @@ def _remove_no_proposal_seqs(proposal_lens, maybe_sampler_output, return (proposal_lens, maybe_sampler_output, nonzero_proposal_len_indices) - new_proposal_lens: list[int] = [] - new_nonzero_proposal_len_indices: list[int] = [] - new_maybe_sampler_output: list[SamplerOutput] = [] + new_proposal_lens: List[int] = [] + new_nonzero_proposal_len_indices: List[int] = [] + new_maybe_sampler_output: List[SamplerOutput] = [] nonzero_proposal_len_idx_ptr = 0 seq_idx = 0 while seq_idx < len( @@ -217,11 +217,11 @@ def _merge_outputs( self, batch_size: int, proposal_len: int, - maybe_sampler_output: Optional[list[SamplerOutput]], - proposal_lens: list[int], - nonzero_proposal_len_indices: list[int], + maybe_sampler_output: Optional[List[SamplerOutput]], + proposal_lens: List[int], + nonzero_proposal_len_indices: List[int], sampler_transposed: bool, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """After speculations are produced, merge the speculation results with the skipped sequences. """ diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 1676be8ded73a..9c04680a6a7ab 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import time -from collections.abc import Sequence from contextlib import contextmanager -from typing import Optional +from typing import Dict, List, Optional, Sequence, Tuple import torch @@ -17,14 +16,14 @@ def get_all_num_logprobs( - seq_group_metadata_list: list[SequenceGroupMetadata]) -> list[int]: + seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: """Given a list of SequenceGroupMetadata, create a list of all num_logprobs. If the sampling params do not call for any logprobs, return 0 for that sequence. """ - all_num_logprobs: list[int] = [] + all_num_logprobs: List[int] = [] for seq_group_metadata in seq_group_metadata_list: num_logprobs = seq_group_metadata.sampling_params.logprobs if num_logprobs is None: @@ -38,7 +37,7 @@ def get_sampled_token_logprobs( # shape [num_steps, batch_size, vocab_size] logprob_tensor: torch.Tensor, sampled_token_ids: torch.Tensor, # shape [num_steps, batch_size] -) -> tuple[torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor]: """Get the logprobs for the sampled tokens. Returns the ranks and logprobs. """ num_steps, batch_size, vocab_size = logprob_tensor.shape @@ -60,21 +59,21 @@ def create_logprobs_output( token_id: int, token_id_logprob_rank: int, token_id_logprob: float, - topk_token_ids: list[Optional[int]], - topk_logprobs: list[Optional[float]], -) -> dict[int, Logprob]: - """Create a Logprob dict for a token given the sampling results. + topk_token_ids: List[Optional[int]], + topk_logprobs: List[Optional[float]], +) -> Dict[int, Logprob]: + """Create a Logprob Dict for a token given the sampling results. Args: token_id (int): The sampled token for the sequence. token_id_logprob_rank (int): The logprob rank of the sampled token. token_id_logprob (float): The logprob value of the sampled token. - topk_token_ids (list[Optional[int]]): The list of top-k token ids. - topk_logprobs (list[Optional[float]]): The list of top-k logprobs. + topk_token_ids (List[Optional[int]]): The list of top-k token ids. + topk_logprobs (List[Optional[float]]): The list of top-k logprobs. """ # vLLM logprobs always include the sampled token. In addition, the user may # request topk-logprobs (where top-k varies per user up to max_logprobs). 
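A minimal, self-contained version of the mapping being built here looks like the following (an illustrative sketch, not the vLLM function; the Logprob dataclass below is a stand-in for vllm.sequence.Logprob):

from dataclasses import dataclass
from typing import Dict, List, Optional

@dataclass
class Logprob:  # stand-in for vllm.sequence.Logprob
    logprob: float
    rank: Optional[int] = None

def build_logprobs(token_id: int,
                   token_id_logprob_rank: int,
                   token_id_logprob: float,
                   topk_token_ids: List[Optional[int]],
                   topk_logprobs: List[Optional[float]]) -> Dict[int, Logprob]:
    # The sampled token is always present in the result.
    logprobs: Dict[int, Logprob] = {
        token_id: Logprob(logprob=token_id_logprob,
                          rank=token_id_logprob_rank),
    }
    # Top-k entries are ranked 1..k; None entries mean no logprob was requested.
    for rank, (tid, lp) in enumerate(zip(topk_token_ids, topk_logprobs),
                                     start=1):
        if tid is not None and lp is not None:
            logprobs.setdefault(tid, Logprob(logprob=lp, rank=rank))
    return logprobs

print(build_logprobs(42, 1, -0.3, [42, 7], [-0.3, -1.8]))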
- logprobs: dict[int, Logprob] = { + logprobs: Dict[int, Logprob] = { token_id: Logprob( logprob=token_id_logprob, rank=token_id_logprob_rank, @@ -98,8 +97,8 @@ def create_sequence_group_output( token_id_logprob_rank: int, token_id_logprob: float, seq_id: SeqId, - topk_token_ids: list[Optional[int]], - topk_logprobs: list[Optional[float]], + topk_token_ids: List[Optional[int]], + topk_logprobs: List[Optional[float]], prompt_logprobs: Optional[PromptLogprobs] = None, ) -> CompletionSequenceGroupOutput: """Create a SequenceGroupOutput given the sampling results. @@ -109,8 +108,8 @@ def create_sequence_group_output( token_id_logprob_rank (int): The logprob rank of the sampled token. token_id_logprob (float): The logprob value of the sampled token. seq_id (int): The sequence id. - topk_token_ids (list[Optional[int]]): The list of top-k token ids. - topk_logprobs (list[Optional[float]]): The list of top-k logprobs. + topk_token_ids (List[Optional[int]]): The list of top-k token ids. + topk_logprobs (List[Optional[float]]): The list of top-k logprobs. """ logprobs = create_logprobs_output( @@ -132,17 +131,17 @@ def create_sequence_group_output( def split_batch_by_proposal_len( - seq_group_metadata_list: list[SequenceGroupMetadata], - proposal_lens: list[int], -) -> tuple[tuple[list[SequenceGroupMetadata], list[int]], tuple[ - list[SequenceGroupMetadata], list[int]]]: + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_lens: List[int], +) -> Tuple[Tuple[List[SequenceGroupMetadata], List[int]], Tuple[ + List[SequenceGroupMetadata], List[int]]]: """Utility function that splits a batch based on whether the proposal len is zero or not. We should remove this once vLLM supports per-sequence proposal lens in a batch. """ - nonzero_lists: tuple[list[SequenceGroupMetadata], list[int]] = ([], []) - zero_lists: tuple[list[SequenceGroupMetadata], list[int]] = ([], []) + nonzero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], []) + zero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], []) for i, (seq_group, proposal_len) in enumerate( zip(seq_group_metadata_list, proposal_lens)): seq_groups, indices = nonzero_lists if proposal_len else zero_lists @@ -153,7 +152,7 @@ def split_batch_by_proposal_len( def sampler_output_to_torch( sampler_output_list: Sequence[SamplerOutput], sampler_transposed: bool -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """Utility function which converts a list of SamplerOutput to tensors. 
sampler_transposed here is used as the indicator for whether diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index a9f3625a90a9e..1937b13884711 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,7 +6,7 @@ import time from functools import cache from pathlib import Path -from typing import Any, Callable, Literal, Optional, Union +from typing import Any, Callable, Dict, Literal, Optional, Type, Union import huggingface_hub from huggingface_hub import hf_hub_download @@ -53,11 +53,11 @@ logger = init_logger(__name__) -_CONFIG_REGISTRY_OVERRIDE_HF: dict[str, type[PretrainedConfig]] = { +_CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, Type[PretrainedConfig]] = { "mllama": MllamaConfig } -_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { +_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { "chatglm": ChatGLMConfig, "cohere2": Cohere2Config, "dbrx": DbrxConfig, @@ -193,7 +193,7 @@ def patch_rope_scaling(config: PretrainedConfig) -> None: patch_rope_scaling_dict(rope_scaling) -def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: +def patch_rope_scaling_dict(rope_scaling: Dict[str, Any]) -> None: if "rope_type" in rope_scaling and "type" in rope_scaling: rope_type = rope_scaling["rope_type"] rope_type_legacy = rope_scaling["type"] @@ -701,7 +701,7 @@ def get_hf_image_processor_config( model: Union[str, Path], revision: Optional[str] = None, **kwargs, -) -> dict[str, Any]: +) -> Dict[str, Any]: # ModelScope does not provide an interface for image_processor if VLLM_USE_MODELSCOPE: return dict() diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 2261f0a9e9aac..5ab70c0e41362 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -8,7 +8,7 @@ """ Arctic model configuration""" from dataclasses import asdict, dataclass -from typing import Any +from typing import Any, Dict from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -192,14 +192,14 @@ def __init__( ) @classmethod - def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "ArcticConfig": + def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "ArcticConfig": result = super().from_dict(config_dict, **kwargs) config = result[0] if isinstance(result, tuple) else result if isinstance(config.quantization, dict): config.quantization = ArcticQuantizationConfig(**config.quantization) return result - def to_dict(self) -> dict[str, Any]: + def to_dict(self) -> Dict[str, Any]: ret = super().to_dict() if isinstance(ret["quantization"], ArcticQuantizationConfig): ret["quantization"] = asdict(ret["quantization"]) diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py index 21328d7675b82..e30409b3af5f0 100644 --- a/vllm/transformers_utils/configs/cohere2.py +++ b/vllm/transformers_utils/configs/cohere2.py @@ -61,7 +61,7 @@ class Cohere2Config(PretrainedConfig): Whether to tie weight embeddings rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): + rope_scaling (`Dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. 
@@ -86,11 +86,11 @@ class Cohere2Config(PretrainedConfig): `beta_slow` (`float`, *optional*): Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): + `short_factor` (`List[float]`, *optional*): Only used with 'longrope'. The scaling factor to be applied to short contexts (< `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): + `long_factor` (`List[float]`, *optional*): Only used with 'longrope'. The scaling factor to be applied to long contexts (< `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py index a54486fa41cd1..24d4052d87211 100644 --- a/vllm/transformers_utils/configs/deepseek_vl2.py +++ b/vllm/transformers_utils/configs/deepseek_vl2.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268 +from typing import Tuple from transformers.configuration_utils import PretrainedConfig @@ -190,12 +191,12 @@ class DeepseekVLV2Config(PretrainedConfig): tile_tag: str = "2D" global_view_pos: str = "head" - candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), ) + candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384), ) def __init__(self, tile_tag: str = "tile_tag", global_view_pos: str = "head", - candidate_resolutions: tuple[tuple[int, + candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384), ), **kwargs): super().__init__(**kwargs) diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index 76f6fffd7ee40..39364367e3031 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -17,12 +17,14 @@ # limitations under the License. """Exaone model configuration""" +from typing import Dict + from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging logger = logging.get_logger(__name__) -EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: dict[str, str] = {} +EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {} class ExaoneConfig(PretrainedConfig): diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index b947c6a9e2b4b..be0f3b7e5e529 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -98,7 +98,7 @@ class JAISConfig(PretrainedConfig): Scale attention weights by dividing by hidden_size instead of sqrt(hidden_size). Need to set scale_attn_weights to `True` as well. - alibi_scaling (`dict`, *optional*): + alibi_scaling (`Dict`, *optional*): Dictionary containing the scaling configuration for ALiBi embeddings. Currently only supports linear scaling strategy. Can specify either the scaling `factor` (must be @@ -108,7 +108,7 @@ class JAISConfig(PretrainedConfig): formats are `{"type": strategy name, "factor": scaling factor}` or `{"type": strategy name, "train_seq_len": training sequence length}`. 
- architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']): + architectures (`List`, *optional*, defaults to ['JAISLMHeadModel']): architecture names for Jais. Example: diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py index 70f60752905cb..c761f659e5b2c 100644 --- a/vllm/transformers_utils/configs/mlp_speculator.py +++ b/vllm/transformers_utils/configs/mlp_speculator.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import List, Optional from transformers import PretrainedConfig @@ -17,7 +17,7 @@ def __init__(self, emb_dim: int = 4096, inner_dim: int = 0, n_predict: int = 3, - top_k_tokens_per_head: Optional[list[int]] = None, + top_k_tokens_per_head: Optional[List[int]] = None, n_candidates: int = 5, tie_weights: bool = False, scale_input: bool = False, @@ -34,7 +34,7 @@ def __init__(self, the inner dimension of the model. If 0, will be the emb_dim. n_predict: int the number of lookaheads for the speculator - top_k_tokens_per_head: list[int] + top_k_tokens_per_head: List[int] Number of tokens to consider from each head when forming the candidate tree. For each candidate branch in the tree, head n produces topk[n] diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 2d52658d3973c..96356135f6b28 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -4,11 +4,11 @@ # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py """A HuggingFace-style model configuration.""" import warnings -from typing import Any, Optional, Union +from typing import Any, Dict, Optional, Union from transformers import PretrainedConfig -attn_config_defaults: dict = { +attn_config_defaults: Dict = { 'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', @@ -20,8 +20,8 @@ 'alibi': False, 'alibi_bias_max': 8 } -ffn_config_defaults: dict = {'ffn_type': 'mptmlp'} -init_config_defaults: dict = { +ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'} +init_config_defaults: Dict = { 'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', @@ -52,15 +52,15 @@ def __init__(self, resid_pdrop: float = 0.0, emb_pdrop: float = 0.0, learned_pos_emb: bool = True, - attn_config: dict = attn_config_defaults, - ffn_config: dict = ffn_config_defaults, + attn_config: Dict = attn_config_defaults, + ffn_config: Dict = ffn_config_defaults, init_device: str = 'cpu', logit_scale: Optional[Union[float, str]] = None, no_bias: bool = False, embedding_fraction: float = 1.0, norm_type: str = 'low_precision_layernorm', use_cache: bool = False, - init_config: dict = init_config_defaults, + init_config: Dict = init_config_defaults, fc_type: str = 'torch', verbose: Optional[int] = None, **kwargs: Any): @@ -102,8 +102,8 @@ def __init__(self, self._validate_config() def _set_config_defaults( - self, config: dict[str, Any], - config_defaults: dict[str, Any]) -> dict[str, Any]: + self, config: Dict[str, Any], + config_defaults: Dict[str, Any]) -> Dict[str, Any]: for (k, v) in config_defaults.items(): if k not in config: config[k] = v diff --git a/vllm/transformers_utils/configs/olmo2.py b/vllm/transformers_utils/configs/olmo2.py index 9935f5d9573e7..c6e446333b43d 100644 --- a/vllm/transformers_utils/configs/olmo2.py +++ b/vllm/transformers_utils/configs/olmo2.py @@ -62,7 +62,7 @@ class Olmo2Config(PretrainedConfig): Whether to tie weight embeddings rope_theta (`float`, *optional*, 
defaults to 10000.0): The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): + rope_scaling (`Dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py index 6eaf699d17bee..0d5db896b93d3 100644 --- a/vllm/transformers_utils/configs/solar.py +++ b/vllm/transformers_utils/configs/solar.py @@ -108,7 +108,7 @@ class SolarConfig(PretrainedConfig): Whether to tie weight embeddings rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): + rope_scaling (`Dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index 4c50724272634..6b2765db94e78 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py -from typing import Any, Optional +from typing import Any, Dict, Optional import transformers @@ -48,8 +48,8 @@ class UltravoxConfig(transformers.PretrainedConfig): def __init__( self, - audio_config: Optional[dict[str, Any]] = None, - text_config: Optional[dict[str, Any]] = None, + audio_config: Optional[Dict[str, Any]] = None, + text_config: Optional[Dict[str, Any]] = None, audio_model_id: Optional[str] = None, text_model_id: Optional[str] = None, ignore_index: int = -100, @@ -58,8 +58,8 @@ def __init__( stack_factor: int = 8, norm_init: float = 0.4, projector_act: str = "swiglu", - text_model_lora_config: Optional[dict[str, Any]] = None, - audio_model_lora_config: Optional[dict[str, Any]] = None, + text_model_lora_config: Optional[Dict[str, Any]] = None, + audio_model_lora_config: Optional[Dict[str, Any]] = None, projector_ln_mid: bool = False, **kwargs, ): diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index a35bf76dc7277..9d1d4bb92e4ab 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import Dict, List, Optional from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams, Sequence, SequenceGroup) @@ -22,7 +22,7 @@ def get_tokenizer_for_seq(self, sequence: Sequence) -> AnyTokenizer: return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request) def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup, - prompt_logprobs: list[Optional[dict[ + prompt_logprobs: List[Optional[Dict[ int, Logprob]]], position_offset: int) -> None: """Decodes the logprobs for the prompt of a sequence group. 
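The nested structure behind the List[Optional[Dict[int, Logprob]]] annotation in this hunk can be pictured with a small stand-alone example (illustrative only; Logprob below is a stand-in dataclass, not the vLLM class):

from dataclasses import dataclass
from typing import Dict, List, Optional

@dataclass
class Logprob:
    logprob: float

prompt_logprobs: List[Optional[Dict[int, Logprob]]] = [
    None,                                     # no logprobs for the first token
    {17: Logprob(-0.2), 42: Logprob(-1.9)},   # candidates for position 1
    {7: Logprob(-0.6)},                       # candidates for position 2
]

for position, per_token in enumerate(prompt_logprobs):
    if per_token is None:
        continue
    best_id = max(per_token, key=lambda tid: per_token[tid].logprob)
    print(position, best_id, per_token[best_id].logprob)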
@@ -49,7 +49,7 @@ def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup, read_offset = 0 next_iter_prefix_offset = 0 next_iter_read_offset = 0 - next_iter_tokens: list[str] = [] + next_iter_tokens: List[str] = [] prev_tokens = None for token_position_in_logprob, prompt_logprobs_for_token in enumerate( diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 7373fa0ede237..a1fa27773fe5c 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import List, Optional, Tuple from .tokenizer import AnyTokenizer -def _replace_none_with_empty(tokens: list[Optional[str]]): +def _replace_none_with_empty(tokens: List[Optional[str]]): for i, token in enumerate(tokens): if token is None: tokens[i] = "" @@ -13,7 +13,7 @@ def _replace_none_with_empty(tokens: list[Optional[str]]): def _convert_tokens_to_string_with_added_encoders( tokenizer: AnyTokenizer, - output_tokens: list[str], + output_tokens: List[str], skip_special_tokens: bool, spaces_between_special_tokens: bool, ) -> str: @@ -22,8 +22,8 @@ def _convert_tokens_to_string_with_added_encoders( # NOTE(woosuk): The following code is slow because it runs a for loop over # the output_tokens. In Python, running a for loop over a list can be slow # even when the loop body is very simple. - sub_texts: list[str] = [] - current_sub_text: list[str] = [] + sub_texts: List[str] = [] + current_sub_text: List[str] = [] all_special_tokens = set(tokenizer.all_special_tokens) for token in output_tokens: if skip_special_tokens and token in all_special_tokens: @@ -52,9 +52,9 @@ def _convert_tokens_to_string_with_added_encoders( def convert_prompt_ids_to_tokens( tokenizer: AnyTokenizer, - prompt_ids: list[int], + prompt_ids: List[int], skip_special_tokens: bool = False, -) -> tuple[list[str], int, int]: +) -> Tuple[List[str], int, int]: """Converts the prompt ids to tokens and returns the tokens and offsets for incremental detokenization. @@ -76,8 +76,8 @@ def convert_prompt_ids_to_tokens( def convert_ids_list_to_tokens( tokenizer: AnyTokenizer, - token_ids: list[int], -) -> list[str]: + token_ids: List[int], +) -> List[str]: """Detokenize the input ids individually. Args: @@ -98,13 +98,13 @@ def convert_ids_list_to_tokens( # under Apache 2.0 license def detokenize_incrementally( tokenizer: AnyTokenizer, - all_input_ids: list[int], - prev_tokens: Optional[list[str]], + all_input_ids: List[int], + prev_tokens: Optional[List[str]], prefix_offset: int, read_offset: int, skip_special_tokens: bool = False, spaces_between_special_tokens: bool = True, -) -> tuple[list[str], str, int, int]: +) -> Tuple[List[str], str, int, int]: """Detokenizes the input ids incrementally and returns the new tokens and the new text. diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py index a0f216e55e136..d37381ea9925f 100644 --- a/vllm/transformers_utils/processors/deepseek_vl2.py +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -24,6 +24,7 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
import math +from typing import List, Tuple import torch import torchvision.transforms as T @@ -35,8 +36,8 @@ class ImageTransform: def __init__(self, - mean: tuple[float, float, float] = (0.5, 0.5, 0.5), - std: tuple[float, float, float] = (0.5, 0.5, 0.5), + mean: Tuple[float, float, float] = (0.5, 0.5, 0.5), + std: Tuple[float, float, float] = (0.5, 0.5, 0.5), normalize: bool = True): self.mean = mean self.std = std @@ -61,11 +62,11 @@ class DeepseekVLV2Processor(ProcessorMixin): def __init__( self, tokenizer: LlamaTokenizerFast, - candidate_resolutions: tuple[tuple[int, int]], + candidate_resolutions: Tuple[Tuple[int, int]], patch_size: int, downsample_ratio: int, - image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5), - image_std: tuple[float, float, float] = (0.5, 0.5, 0.5), + image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5), + image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5), normalize: bool = True, image_token: str = "", pad_token: str = "<|▁pad▁|>", @@ -169,13 +170,13 @@ def encode(self, text: str, bos: bool = True, eos: bool = False): return t - def decode(self, t: list[int], **kwargs) -> str: + def decode(self, t: List[int], **kwargs) -> str: return self.tokenizer.decode(t, **kwargs) def process_one( self, prompt: str, - images: list[Image.Image], + images: List[Image.Image], inference_mode: bool = True, **kwargs, ): @@ -183,8 +184,8 @@ def process_one( Args: prompt (str): the formatted prompt; - conversations (list[dict]): conversations with a list of messages; - images (list[ImageType]): the list of images; + conversations (List[Dict]): conversations with a list of messages; + images (List[ImageType]): the list of images; inference_mode (bool): if True, then remove the last eos token; system_prompt (str): the system prompt; **kwargs: @@ -195,7 +196,7 @@ def process_one( - target_ids (torch.LongTensor): [N + image tokens] - pixel_values (torch.FloatTensor): [n_patches, 3, H, W] - image_id (int): the id of the image token - - num_image_tokens (list[int]): the number of image tokens + - num_image_tokens (List[int]): the number of image tokens """ assert (prompt is not None and images is not None @@ -256,7 +257,7 @@ def __call__( self, *, prompt: str, - images: list[Image.Image], + images: List[Image.Image], inference_mode: bool = True, **kwargs, ): @@ -264,7 +265,7 @@ def __call__( Args: prompt (str): the formatted prompt; - images (list[ImageType]): the list of images; + images (List[ImageType]): the list of images; inference_mode (bool): if True, then remove the last eos token; **kwargs: @@ -273,7 +274,7 @@ def __call__( - input_ids (torch.LongTensor): [N + image tokens] - images (torch.FloatTensor): [n_images, 3, H, W] - image_id (int): the id of the image token - - num_image_tokens (list[int]): the number of image tokens + - num_image_tokens (List[int]): the number of image tokens """ prepare = self.process_one( @@ -287,7 +288,7 @@ def __call__( def tokenize_with_images( self, conversation: str, - images: list[Image.Image], + images: List[Image.Image], bos: bool = True, eos: bool = True, cropping: bool = True, diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py index b4eb081c9b99d..bb5ddaf88b219 100644 --- a/vllm/transformers_utils/tokenizer_base.py +++ b/vllm/transformers_utils/tokenizer_base.py @@ -2,7 +2,7 @@ import importlib from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union if TYPE_CHECKING: 
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam @@ -12,17 +12,17 @@ class TokenizerBase(ABC): @property @abstractmethod - def all_special_tokens_extended(self) -> list[str]: + def all_special_tokens_extended(self) -> List[str]: raise NotImplementedError() @property @abstractmethod - def all_special_tokens(self) -> list[str]: + def all_special_tokens(self) -> List[str]: raise NotImplementedError() @property @abstractmethod - def all_special_ids(self) -> list[int]: + def all_special_ids(self) -> List[int]: raise NotImplementedError() @property @@ -66,7 +66,7 @@ def __len__(self) -> int: @abstractmethod def __call__( self, - text: Union[str, list[str], list[int]], + text: Union[str, List[str], List[int]], text_pair: Optional[str] = None, add_special_tokens: bool = False, truncation: bool = False, @@ -75,11 +75,11 @@ def __call__( raise NotImplementedError() @abstractmethod - def get_vocab(self) -> dict[str, int]: + def get_vocab(self) -> Dict[str, int]: raise NotImplementedError() @abstractmethod - def get_added_vocab(self) -> dict[str, int]: + def get_added_vocab(self) -> Dict[str, int]: raise NotImplementedError() @abstractmethod @@ -88,44 +88,44 @@ def encode_one( text: str, truncation: bool = False, max_length: Optional[int] = None, - ) -> list[int]: + ) -> List[int]: raise NotImplementedError() @abstractmethod def encode(self, text: str, - add_special_tokens: Optional[bool] = None) -> list[int]: + add_special_tokens: Optional[bool] = None) -> List[int]: raise NotImplementedError() @abstractmethod def apply_chat_template(self, - messages: list["ChatCompletionMessageParam"], - tools: Optional[list[dict[str, Any]]] = None, - **kwargs) -> list[int]: + messages: List["ChatCompletionMessageParam"], + tools: Optional[List[Dict[str, Any]]] = None, + **kwargs) -> List[int]: raise NotImplementedError() @abstractmethod - def convert_tokens_to_string(self, tokens: list[str]) -> str: + def convert_tokens_to_string(self, tokens: List[str]) -> str: raise NotImplementedError() @abstractmethod def decode(self, - ids: Union[list[int], int], + ids: Union[List[int], int], skip_special_tokens: bool = True) -> str: raise NotImplementedError() @abstractmethod def convert_ids_to_tokens( self, - ids: list[int], + ids: List[int], skip_special_tokens: bool = True, - ) -> list[str]: + ) -> List[str]: raise NotImplementedError() class TokenizerRegistry: # Tokenizer name -> (tokenizer module, tokenizer class) - REGISTRY: dict[str, tuple[str, str]] = {} + REGISTRY: Dict[str, Tuple[str, str]] = {} @staticmethod def register(name: str, module: str, class_name: str) -> None: diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py index 2c976b3b267d5..c223768b16d6b 100644 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import Optional, Type from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, TokenizerPoolConfig) @@ -35,7 +35,7 @@ def init_tokenizer_from_configs(model_config: ModelConfig, def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig], **init_kwargs) -> BaseTokenizerGroup: - tokenizer_cls: type[BaseTokenizerGroup] + tokenizer_cls: Type[BaseTokenizerGroup] if tokenizer_pool_config is None: tokenizer_cls = TokenizerGroup elif isinstance(tokenizer_pool_config.pool_type, type) and issubclass( diff --git 
a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py index 6cd63984dbfdc..fbdfa3e57e172 100644 --- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Optional +from typing import List, Optional from vllm.config import TokenizerPoolConfig from vllm.lora.request import LoRARequest @@ -35,7 +35,7 @@ def encode(self, prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> list[int]: + add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group.""" pass @@ -45,7 +45,7 @@ async def encode_async( prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> list[int]: + add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group.""" pass diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py index 86044de936821..30cab752ccf3c 100644 --- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py @@ -2,7 +2,7 @@ import asyncio import os -from typing import Optional +from typing import List, Optional try: from ray.exceptions import ActorDiedError # type: ignore @@ -115,7 +115,7 @@ def encode(self, prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> list[int]: + add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group. We pick an idle actor and use it to encode the prompt. @@ -166,7 +166,7 @@ async def encode_async( prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> list[int]: + add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group. We pick an idle actor and use it to encode the prompt. 
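The "pick an idle actor, encode, return it to the pool" flow described in this docstring can be sketched with plain asyncio primitives (an illustrative sketch only, not the Ray actor pool used here; all names below are placeholders):

import asyncio
from typing import List

class _Worker:
    def encode(self, prompt: str) -> List[int]:
        return [ord(c) for c in prompt]  # toy "tokenizer"

async def encode_with_pool(pool: "asyncio.Queue[_Worker]",
                           prompt: str) -> List[int]:
    worker = await pool.get()          # wait for an idle worker
    try:
        return worker.encode(prompt)
    finally:
        pool.put_nowait(worker)        # hand it back to the idle pool

async def main() -> None:
    pool: "asyncio.Queue[_Worker]" = asyncio.Queue()
    for _ in range(2):
        pool.put_nowait(_Worker())
    ids = await encode_with_pool(pool, "hello")
    print(ids)

asyncio.run(main())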
diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py index 2eaf821de3338..025971cb7e477 100644 --- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import List, Optional from vllm.config import TokenizerPoolConfig from vllm.lora.request import LoRARequest @@ -43,7 +43,7 @@ def get_max_input_len(self, return self.max_input_length def _raise_if_input_too_long(self, - encoded_tokens: list[int], + encoded_tokens: List[int], lora_request: Optional[LoRARequest] = None): input_length = len(encoded_tokens) if lora_request: @@ -58,7 +58,7 @@ def encode(self, prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> list[int]: + add_special_tokens: Optional[bool] = None) -> List[int]: tokenizer = self.get_lora_tokenizer(lora_request) ret = encode_tokens(tokenizer, prompt, @@ -71,7 +71,7 @@ async def encode_async( prompt: str, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> list[int]: + add_special_tokens: Optional[bool] = None) -> List[int]: tokenizer = await self.get_lora_tokenizer_async(lora_request) ret = encode_tokens(tokenizer, prompt, diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index dc38388c3dbee..801597bd36508 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -4,7 +4,7 @@ import re from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional, Union, cast +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast import huggingface_hub from huggingface_hub import HfApi, hf_hub_download @@ -28,7 +28,7 @@ @dataclass class Encoding: - input_ids: Union[list[int], list[list[int]]] + input_ids: Union[List[int], List[List[int]]] def maybe_serialize_tool_calls(request: "ChatCompletionRequest"): @@ -98,7 +98,7 @@ def truncate_tool_call_ids(request: "ChatCompletionRequest"): request.messages[i]["tool_call_id"] = tool_call_id -def list_local_repo_files(repo_id: str, revision: Optional[str]) -> list[str]: +def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]: repo_cache = os.path.join( huggingface_hub.constants.HF_HUB_CACHE, huggingface_hub.constants.REPO_ID_SEPARATOR.join( @@ -118,7 +118,7 @@ def list_local_repo_files(repo_id: str, revision: Optional[str]) -> list[str]: return [] -def find_tokenizer_file(files: list[str]): +def find_tokenizer_file(files: List[str]): file_pattern = re.compile( r"^tokenizer\.model\.v.*$|^tekken\.json$|^tokenizer\.mm\.model\.v.*$") @@ -136,14 +136,14 @@ def find_tokenizer_file(files: list[str]): def make_mistral_chat_completion_request( - messages: list["ChatCompletionMessageParam"], - tools: Optional[list[dict[str, + messages: List["ChatCompletionMessageParam"], + tools: Optional[List[Dict[str, Any]]] = None) -> "ChatCompletionRequest": - last_message = cast(dict[str, Any], messages[-1]) + last_message = cast(Dict[str, Any], messages[-1]) if last_message["role"] == "assistant": last_message["prefix"] = True - last_message = cast(dict[str, Any], messages[-1]) + last_message = cast(Dict[str, Any], messages[-1]) if last_message["role"] == "assistant": 
last_message["prefix"] = True @@ -194,7 +194,7 @@ def __init__(self, tokenizer: "PublicMistralTokenizer") -> None: raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}") self._vocab = tokenizer_.vocab() - # Convert to a dict[str, int] to match protocol, but this is a lossy + # Convert to a Dict[str, int] to match protocol, but this is a lossy # conversion. There may be multiple token ids that decode to the same # string due to partial UTF-8 byte sequences being converted to � self._vocab_dict = { @@ -252,7 +252,7 @@ def _download_mistral_tokenizer_from_hf(tokenizer_name: str, # the following attributes are set to fit VLLM's design and are used # by the guided structured output backends. @property - def all_special_tokens_extended(self) -> list[str]: + def all_special_tokens_extended(self) -> List[str]: from mistral_common.tokens.tokenizers.base import SpecialTokens # tekken defines its own extended special tokens list @@ -266,11 +266,11 @@ def all_special_tokens_extended(self) -> list[str]: ] @property - def all_special_tokens(self) -> list[str]: + def all_special_tokens(self) -> List[str]: return self.all_special_tokens_extended @property - def all_special_ids(self) -> list[int]: + def all_special_ids(self) -> List[int]: return [ self.all_special_tokens.index(t) for t in self.all_special_tokens ] @@ -308,21 +308,21 @@ def __len__(self) -> int: def __call__( self, - text: Union[str, list[str], list[int]], + text: Union[str, List[str], List[int]], text_pair: Optional[str] = None, add_special_tokens: bool = False, truncation: bool = False, max_length: Optional[int] = None, ): - input_ids: Union[list[int], list[list[int]]] - # For list[str], original prompt text + input_ids: Union[List[int], List[List[int]]] + # For List[str], original prompt text if is_list_of(text, str): - input_ids_: list[list[int]] = [] + input_ids_: List[List[int]] = [] for p in text: each_input_ids = self.encode_one(p, truncation, max_length) input_ids_.append(each_input_ids) input_ids = input_ids_ - # For list[int], apply chat template output, already tokens. + # For List[int], apply chat template output, already tokens. elif is_list_of(text, int): input_ids = text # For str, single prompt text @@ -330,12 +330,12 @@ def __call__( input_ids = self.encode_one(text, truncation, max_length) return Encoding(input_ids=input_ids) - def get_vocab(self) -> dict[str, int]: + def get_vocab(self) -> Dict[str, int]: # NB: the dictionary form of the vocabulary collapses token ids that map # to the same string but have different bytes return self._vocab_dict - def get_added_vocab(self) -> dict[str, int]: + def get_added_vocab(self) -> Dict[str, int]: # Mistral tokenizers have no added vocabulary return {} @@ -344,7 +344,7 @@ def encode_one( text: str, truncation: bool = False, max_length: Optional[int] = None, - ) -> list[int]: + ) -> List[int]: # Mistral Tokenizers should not add special tokens input_ids = self.encode(text) @@ -354,7 +354,7 @@ def encode_one( def encode(self, text: str, - add_special_tokens: Optional[bool] = None) -> list[int]: + add_special_tokens: Optional[bool] = None) -> List[int]: # `encode` should only be used for prompt completion # it should never be used for chat_completion. 
# For chat completion use `apply_chat_template` @@ -366,9 +366,9 @@ def encode(self, return self.tokenizer.encode(text, bos=True, eos=False) def apply_chat_template(self, - messages: list["ChatCompletionMessageParam"], - tools: Optional[list[dict[str, Any]]] = None, - **kwargs) -> list[int]: + messages: List["ChatCompletionMessageParam"], + tools: Optional[List[Dict[str, Any]]] = None, + **kwargs) -> List[int]: request = make_mistral_chat_completion_request(messages, tools) encoded = self.mistral.encode_chat_completion(request) @@ -376,7 +376,7 @@ def apply_chat_template(self, # encode-decode to get clean prompt return encoded.tokens - def convert_tokens_to_string(self, tokens: list[str]) -> str: + def convert_tokens_to_string(self, tokens: List[str]) -> str: from mistral_common.tokens.tokenizers.base import SpecialTokens if self.is_tekken: tokens = [ @@ -409,7 +409,7 @@ def _token_to_id(t: str): # make sure certain special tokens like Tool calls are # not decoded special_tokens = {SpecialTokens.tool_calls} - regular_tokens: list[str] = [] + regular_tokens: List[str] = [] decoded_list = [] for token in tokens: @@ -434,7 +434,7 @@ def _token_to_id(t: str): # See: guided_decoding/outlines_logits_processors.py::_adapt_tokenizer # for more. def decode(self, - ids: Union[list[int], int], + ids: Union[List[int], int], skip_special_tokens: bool = True) -> str: assert ( skip_special_tokens @@ -446,9 +446,9 @@ def decode(self, def convert_ids_to_tokens( self, - ids: list[int], + ids: List[int], skip_special_tokens: bool = True, - ) -> list[str]: + ) -> List[str]: from mistral_common.tokens.tokenizers.base import SpecialTokens # TODO(Patrick) - potentially allow special tokens to not be skipped diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index a8a14e5ad073e..87e446f894384 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -2,7 +2,7 @@ from os import PathLike from pathlib import Path -from typing import Optional, Union +from typing import List, Optional, Union def is_s3(model_or_path: str) -> bool: @@ -26,7 +26,7 @@ def modelscope_list_repo_files( repo_id: str, revision: Optional[str] = None, token: Union[str, bool, None] = None, -) -> list[str]: +) -> List[str]: """List files in a modelscope repo.""" from modelscope.hub.api import HubApi api = HubApi() diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index d8795e1e0557b..fbbb21c89370a 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -9,7 +9,7 @@ from enum import Enum from pathlib import Path from threading import Thread -from typing import Any, Optional, Union +from typing import Any, Dict, Optional, Union from uuid import uuid4 import cpuinfo @@ -27,7 +27,7 @@ _USAGE_STATS_ENABLED = None _USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER -_GLOBAL_RUNTIME_DATA: dict[str, Union[str, int, bool]] = {} +_GLOBAL_RUNTIME_DATA: Dict[str, Union[str, int, bool]] = {} _USAGE_ENV_VARS_TO_COLLECT = [ "VLLM_USE_MODELSCOPE", @@ -150,7 +150,7 @@ def __init__(self) -> None: def report_usage(self, model_architecture: str, usage_context: UsageContext, - extra_kvs: Optional[dict[str, Any]] = None) -> None: + extra_kvs: Optional[Dict[str, Any]] = None) -> None: t = Thread(target=self._report_usage_worker, args=(model_architecture, usage_context, extra_kvs or {}), daemon=True) @@ -158,13 +158,13 @@ def report_usage(self, def _report_usage_worker(self, model_architecture: str, usage_context: UsageContext, - extra_kvs: dict[str, Any]) -> None: + extra_kvs: 
Dict[str, Any]) -> None: self._report_usage_once(model_architecture, usage_context, extra_kvs) self._report_continous_usage() def _report_usage_once(self, model_architecture: str, usage_context: UsageContext, - extra_kvs: dict[str, Any]) -> None: + extra_kvs: Dict[str, Any]) -> None: # Platform information from vllm.platforms import current_platform if current_platform.is_cuda_alike(): @@ -227,7 +227,7 @@ def _report_continous_usage(self): self._write_to_file(data) self._send_to_server(data) - def _send_to_server(self, data: dict[str, Any]) -> None: + def _send_to_server(self, data: Dict[str, Any]) -> None: try: global_http_client = global_http_connection.get_sync_client() global_http_client.post(_USAGE_STATS_SERVER, json=data) @@ -235,7 +235,7 @@ def _send_to_server(self, data: dict[str, Any]) -> None: # silently ignore unless we are using debug log logging.debug("Failed to send usage data to server") - def _write_to_file(self, data: dict[str, Any]) -> None: + def _write_to_file(self, data: Dict[str, Any]) -> None: os.makedirs(os.path.dirname(_USAGE_STATS_JSON_PATH), exist_ok=True) Path(_USAGE_STATS_JSON_PATH).touch(exist_ok=True) with open(_USAGE_STATS_JSON_PATH, "a") as f: diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 9c6e6dc74280f..3960392cf74ef 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """CacheEngine class for managing the KV cache.""" +from typing import List import numpy as np import torch @@ -73,12 +74,12 @@ def _allocate_kv_cache( self, num_blocks: int, device: str, - ) -> list[torch.Tensor]: + ) -> List[torch.Tensor]: """Allocates KV cache on the specified device.""" kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size) pin_memory = is_pin_memory_available() if device == "cpu" else False - kv_cache: list[torch.Tensor] = [] + kv_cache: List[torch.Tensor] = [] # Align entries so they are 256 byte aligned for better performance # Primarily targets MLA as this typically only ends up having entries diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 85afa979f1993..ac7c93e48395d 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import TYPE_CHECKING, Any, Optional, cast +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast import torch @@ -31,7 +31,7 @@ class EncoderDecoderModelInputForCPU(ModelInputForCPUWithSamplingMetadata): encoder_input_tokens: Optional[torch.Tensor] = None encoder_input_positions: Optional[torch.Tensor] = None - def as_broadcastable_tensor_dict(self) -> dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -47,7 +47,7 @@ def as_broadcastable_tensor_dict(self) -> dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "EncoderDecoderModelInputForCPU": return cast( @@ -57,19 +57,19 @@ def from_broadcasted_tensor_dict( class CPUEncoderDecoderModelRunner( CPUModelRunnerBase[EncoderDecoderModelInputForCPU]): - _model_input_cls: type[EncoderDecoderModelInputForCPU] = ( + _model_input_cls: Type[EncoderDecoderModelInputForCPU] = ( 
EncoderDecoderModelInputForCPU) - _builder_cls: type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder def _list_to_int32_tensor( self, - _list: list[int], + _list: List[int], ) -> torch.Tensor: return torch.tensor(_list, dtype=torch.int32, device=self.device) def _list_to_long_tensor( self, - _list: list[int], + _list: List[int], ) -> torch.Tensor: return torch.tensor(_list, dtype=torch.long, device=self.device) @@ -80,7 +80,7 @@ def _empty_long_tensor(self) -> torch.Tensor: return self._list_to_long_tensor([]) def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: dict[str, + self, tensor_dict: Dict[str, Any]) -> EncoderDecoderModelInputForCPU: return EncoderDecoderModelInputForCPU.from_broadcasted_tensor_dict( tensor_dict, @@ -89,9 +89,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None + finished_requests_ids: Optional[List[str]] = None ) -> EncoderDecoderModelInputForCPU: model_input = self._prepare_model_input_tensors( seq_group_metadata_list, finished_requests_ids) @@ -120,9 +120,9 @@ def prepare_model_input( def _prepare_encoder_model_input_tensors( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], model_input: EncoderDecoderModelInputForCPU, - ) -> tuple[AttentionMetadata, Optional[torch.Tensor], + ) -> Tuple[AttentionMetadata, Optional[torch.Tensor], Optional[torch.Tensor]]: """Helper method to prepare the encoder- and cross-attn-related model inputs based on a given sequence group. These additional inputs @@ -167,7 +167,7 @@ def _prepare_encoder_model_input_tensors( is_prompt = seq_group_metadata_list[0].is_prompt # Build encoder inputs - encoder_seq_lens: list[int] = [] + encoder_seq_lens: List[int] = [] if is_prompt: # Prefill phase. 
cross_block_tables = self._empty_int32_tensor().view( @@ -279,10 +279,10 @@ def _prepare_encoder_model_input_tensors( def execute_model( self, model_input: EncoderDecoderModelInputForCPU, - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: if num_steps > 1: raise ValueError( "CPU worker does not support multi-step execution.") diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index bb0de9fff0ee1..8407f073040ee 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -4,7 +4,8 @@ import weakref from collections import defaultdict from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union +from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Type, + TypeVar, Union) import torch from torch import nn @@ -52,13 +53,13 @@ class ModelInputForCPU(ModelRunnerInputBase): attn_metadata: Optional["AttentionMetadata"] = None multi_modal_kwargs: Optional[BatchedTensorInputs] = None virtual_engine: Optional[int] = None - seq_lens: Optional[list[int]] = None - query_lens: Optional[list[int]] = None + seq_lens: Optional[List[int]] = None + query_lens: Optional[List[int]] = None lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[set[LoRARequest]] = None + lora_requests: Optional[Set[LoRARequest]] = None def as_broadcastable_tensor_dict( - self) -> dict[str, Union[int, torch.Tensor]]: + self) -> Dict[str, Union[int, torch.Tensor]]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -73,8 +74,8 @@ def as_broadcastable_tensor_dict( @classmethod def from_broadcasted_tensor_dict( - cls: type[TModelInputForCPU], - tensor_dict: dict[str, Any], + cls: Type[TModelInputForCPU], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None ) -> TModelInputForCPU: if attn_backend is not None: @@ -91,7 +92,7 @@ class ModelInputForCPUWithSamplingMetadata(ModelInputForCPU): sampling_metadata: Optional["SamplingMetadata"] = None is_prompt: Optional[bool] = None - def as_broadcastable_tensor_dict(self) -> dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -106,7 +107,7 @@ def as_broadcastable_tensor_dict(self) -> dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "ModelInputForCPUWithSamplingMetadata": tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) @@ -122,28 +123,28 @@ class ModelInputData: def __init__(self, use_mrope: bool): self.use_mrope = use_mrope - self.input_tokens: list[int] = [] - self.input_positions: list[int] = [] - self.token_type_ids: Optional[list[int]] = [] - self.seq_lens: list[int] = [] - self.query_lens: list[int] = [] - self.prefill_block_tables: list[list[int]] = [] - self.decode_block_tables: list[list[int]] = [] + self.input_tokens: List[int] = [] + self.input_positions: List[int] = [] + self.token_type_ids: Optional[List[int]] = [] + self.seq_lens: List[int] = [] + self.query_lens: List[int] = [] + self.prefill_block_tables: List[List[int]] = [] + self.decode_block_tables: List[List[int]] = [] self.max_decode_seq_len: int = 0 self.num_prefills: int = 0 
self.num_prefill_tokens: int = 0 self.num_decode_tokens: int = 0 - self.slot_mapping: list[int] = [] - self.multi_modal_inputs_list: list[MultiModalKwargs] = [] - self.multi_modal_placeholder_maps: dict[ + self.slot_mapping: List[int] = [] + self.multi_modal_inputs_list: List[MultiModalKwargs] = [] + self.multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict( MultiModalPlaceholderMap) - self.input_mrope_positions: list[list[int]] = [[] + self.input_mrope_positions: List[List[int]] = [[] for _ in range(3)] def __init__(self, runner: "CPUModelRunner", - finished_requests_ids: Optional[list[str]] = None) -> None: + finished_requests_ids: Optional[List[str]] = None) -> None: super().__init__() self.runner = runner self.chunked_prefill = (runner.scheduler_config.chunked_prefill_enabled @@ -161,8 +162,8 @@ def __init__(self, self.att_metadata_builder = attn_backend.get_builder_cls()(self) def prepare(self, - finished_requests_ids: Optional[list[str]] = None) -> None: - self.seq_group_metadata_list: list[SequenceGroupMetadata] = [] + finished_requests_ids: Optional[List[str]] = None) -> None: + self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] self.input_data = ModelInputForCPUBuilder.ModelInputData( self.runner.model_config.uses_mrope) self.att_metadata_builder.prepare() @@ -171,7 +172,7 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) def set_seq_group_list( - self, seq_group_metadata_list: list[SequenceGroupMetadata]): + self, seq_group_metadata_list: List[SequenceGroupMetadata]): self.seq_group_metadata_list = seq_group_metadata_list def build(self) -> ModelInputForCPU: @@ -410,7 +411,7 @@ def _compute_multi_modal_input(self, placeholder_map) def _prepare_lora_input( - self, seq_group_metadata_list: list[SequenceGroupMetadata], + self, seq_group_metadata_list: List[SequenceGroupMetadata], is_prefill: bool) -> LoRAMapping: index_mapping = [] prompt_mapping = [] @@ -432,8 +433,8 @@ class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): """ Helper class for shared methods between CPU model runners. """ - _model_input_cls: type[TModelInputForCPU] - _builder_cls: type[ModelInputForCPUBuilder] + _model_input_cls: Type[TModelInputForCPU] + _builder_cls: Type[ModelInputForCPUBuilder] builder: ModelInputForCPUBuilder def __init__( @@ -522,8 +523,8 @@ def get_model(self) -> nn.Module: def _prepare_model_input_tensors( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - finished_requests_ids: Optional[list[str]] = None + seq_group_metadata_list: List[SequenceGroupMetadata], + finished_requests_ids: Optional[List[str]] = None ) -> TModelInputForCPU: """Helper method to prepare the model input based on a given sequence group. 
Prepares metadata needed for the base model forward pass but not @@ -549,7 +550,7 @@ def remove_all_loras(self): raise RuntimeError("LoRA is not enabled.") self.lora_manager.remove_all_adapters() - def set_active_loras(self, lora_requests: set[LoRARequest], + def set_active_loras(self, lora_requests: Set[LoRARequest], lora_mapping: LoRAMapping) -> None: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -570,20 +571,20 @@ def pin_lora(self, lora_id: int) -> bool: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.pin_adapter(lora_id) - def list_loras(self) -> set[int]: + def list_loras(self) -> Set[int]: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.list_adapters() class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): - _model_input_cls: type[ModelInputForCPUWithSamplingMetadata] = ( + _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( ModelInputForCPUWithSamplingMetadata) - _builder_cls: type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], ) -> ModelInputForCPUWithSamplingMetadata: return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict( # noqa: E501 tensor_dict, @@ -592,9 +593,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None + finished_requests_ids: Optional[List[str]] = None ) -> ModelInputForCPUWithSamplingMetadata: """Prepare the model input based on a given sequence group, including metadata for the sampling step. 
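The mechanical convention applied throughout this patch, shown on a small self-contained function (illustrative, not taken from the diff): the '-' lines use builtin generics and the '+' lines the typing-module spellings. Both denote the same types; the typing-module form also works at runtime on Python versions older than 3.9, where subscripted builtins such as list[int] raise a TypeError unless annotation evaluation is deferred.

from typing import Dict, List, Optional

def prepare(seq_lens: List[int],
            finished_requests_ids: Optional[List[str]] = None
            ) -> Dict[str, int]:
    # Equivalent pre-patch spelling:
    #   def prepare(seq_lens: list[int],
    #               finished_requests_ids: Optional[list[str]] = None
    #               ) -> dict[str, int]: ...
    return {"num_seqs": len(seq_lens),
            "num_finished": len(finished_requests_ids or [])}

print(prepare([3, 5], ["req-0"]))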
@@ -622,11 +623,11 @@ def prepare_model_input( def execute_model( self, model_input: ModelInputForCPUWithSamplingMetadata, - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, previous_hidden_states: Optional[torch.Tensor] = None, - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: if num_steps > 1: raise ValueError( "CPU worker does not support multi-step execution.") diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index 810d7373e302b..1ceb2557c6b3d 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import Any, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union import torch @@ -25,18 +25,18 @@ class ModelInputForCPUWithPoolingMetadata(ModelInputForCPU): class CPUPoolingModelRunner( CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]): - _model_input_cls: type[ModelInputForCPUWithPoolingMetadata] = ( + _model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = ( ModelInputForCPUWithPoolingMetadata) - _builder_cls: type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder @torch.inference_mode() def execute_model( self, model_input: ModelInputForCPUWithPoolingMetadata, - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[Union[list[PoolerOutput], IntermediateTensors]]: + ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( "CPU worker does not support multi-step execution.") @@ -72,7 +72,7 @@ def execute_model( def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: dict[str, + tensor_dict: Dict[str, Any]) -> ModelInputForCPUWithPoolingMetadata: return ModelInputForCPUWithPoolingMetadata.from_broadcasted_tensor_dict( tensor_dict, @@ -81,9 +81,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: Optional[list[SequenceGroupMetadata]], + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None + finished_requests_ids: Optional[List[str]] = None ) -> ModelInputForCPUWithPoolingMetadata: assert seq_group_metadata_list is not None model_input = self._prepare_model_input_tensors( @@ -99,17 +99,17 @@ def prepare_model_input( def _prepare_pooling( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - prompt_lens: list[int], + seq_group_metadata_list: List[SequenceGroupMetadata], + prompt_lens: List[int], ) -> PoolingMetadata: """Prepare PoolingMetadata for the sequence group metadata list.""" - seq_groups: list[tuple[list[int], PoolingParams]] = [] + seq_groups: List[Tuple[List[int], PoolingParams]] = [] for i, seq_group_metadata in enumerate(seq_group_metadata_list): seq_ids = list(seq_group_metadata.seq_data.keys()) pooling_params = seq_group_metadata.pooling_params seq_groups.append((seq_ids, pooling_params)) - seq_data: dict[int, SequenceData] = {} + seq_data: Dict[int, SequenceData] = {} for seq_group_metadata in seq_group_metadata_list: seq_data.update(seq_group_metadata.seq_data) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 
cb729af319d8d..27b1a2dd1be8c 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """A CPU worker class.""" -from typing import Optional +from typing import Dict, List, Optional, Set, Tuple, Type import torch import torch.distributed @@ -71,23 +71,23 @@ def __init__(self, cache_config: CacheConfig, model_config: ModelConfig, def _allocate_kv_cache( self, num_blocks: int, - ) -> list[torch.Tensor]: + ) -> List[torch.Tensor]: """Allocates KV cache on CPU.""" kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_heads, self.head_size) - kv_cache: list[torch.Tensor] = [] + kv_cache: List[torch.Tensor] = [] for _ in range(self.num_layers): kv_cache.append( torch.empty(kv_cache_shape, dtype=self.dtype, device="cpu")) return kv_cache - def swap_in(self, src_to_dst: dict[int, int]) -> None: + def swap_in(self, src_to_dst: Dict[int, int]) -> None: raise NotImplementedError("Swap is not supported in CPUCacheEngine.") - def swap_out(self, src_to_dst: dict[int, int]) -> None: + def swap_out(self, src_to_dst: Dict[int, int]) -> None: raise NotImplementedError("Swap is not supported in CPUCacheEngine.") - def copy(self, src_to_dsts: dict[int, list[int]]) -> None: + def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: self.attn_backend.copy_blocks(self.cpu_cache, src_to_dsts) @staticmethod @@ -129,7 +129,7 @@ def __init__( distributed_init_method: str, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - model_runner_cls: Optional[type[CPUModelRunner]] = None, + model_runner_cls: Optional[Type[CPUModelRunner]] = None, ) -> None: WorkerBase.__init__(self, vllm_config=vllm_config) @@ -163,7 +163,7 @@ def __init__( or (speculative_config.draft_model_config.hf_config.model_type not in ["medusa", "mlp_speculator", "eagle"]) \ else {"return_hidden_states": True} - ModelRunnerClass: type[CPUModelRunnerBase] = CPUModelRunner + ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner if self.model_config.runner_type == "pooling": ModelRunnerClass = CPUPoolingModelRunner elif self.model_config.is_encoder_decoder: @@ -178,9 +178,9 @@ def __init__( self.model_runner = model_runner_cls(self.model_runner) # Uninitialized cache engine. Will be initialized by # initialize_cache. - self.cache_engine: list[CPUCacheEngine] + self.cache_engine: List[CPUCacheEngine] # Initialize cpu_cache as pooling models don't initialize kv_caches - self.cpu_cache: Optional[list[list[torch.Tensor]]] = None + self.cpu_cache: Optional[List[List[torch.Tensor]]] = None # Torch profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace @@ -221,7 +221,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of blocks available for the KV cache. 
This determines how many KV blocks can fit into the configured CPU @@ -276,7 +276,7 @@ def remove_lora(self, lora_id: int) -> bool: def pin_lora(self, lora_id: int) -> bool: return self.model_runner.pin_lora(lora_id) - def list_loras(self) -> set[int]: + def list_loras(self) -> Set[int]: return self.model_runner.list_loras() def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None: @@ -324,7 +324,7 @@ def do_metadata_broadcast(self) -> bool: return self.parallel_config.tensor_parallel_size > 1 @property - def kv_cache(self) -> Optional[list[list[torch.Tensor]]]: + def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: return self.cpu_cache @property diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 48e9cf5f56562..5f39f2fa4947c 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -2,7 +2,7 @@ import dataclasses import itertools -from typing import Any, Optional, cast +from typing import Any, Dict, List, Optional, Tuple, Type, cast import torch import torch.distributed @@ -44,7 +44,7 @@ class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata): encoder_input_tokens: Optional[torch.Tensor] = None encoder_input_positions: Optional[torch.Tensor] = None - def as_broadcastable_tensor_dict(self) -> dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -63,7 +63,7 @@ def as_broadcastable_tensor_dict(self) -> dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "EncoderDecoderModelInput": return cast( @@ -72,9 +72,9 @@ def from_broadcasted_tensor_dict( class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): - _model_input_cls: type[EncoderDecoderModelInput] = ( + _model_input_cls: Type[EncoderDecoderModelInput] = ( EncoderDecoderModelInput) - _builder_cls: type[ModelInputForGPUBuilder] = (ModelInputForGPUBuilder) + _builder_cls: Type[ModelInputForGPUBuilder] = (ModelInputForGPUBuilder) def __init__( self, @@ -133,13 +133,13 @@ def raise_backend_err(): def _list_to_int32_tensor( self, - _list: list[int], + _list: List[int], ) -> torch.Tensor: return torch.tensor(_list, dtype=torch.int32, device=self.device) def _list_to_long_tensor( self, - _list: list[int], + _list: List[int], ) -> torch.Tensor: return torch.tensor(_list, dtype=torch.long, device=self.device) @@ -153,10 +153,10 @@ def _empty_long_tensor(self) -> torch.Tensor: def execute_model( self, model_input: EncoderDecoderModelInput, - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[list[PoolerOutput]]: + ) -> Optional[List[PoolerOutput]]: if num_steps > 1: raise ValueError("num_steps > 1 is not supported in " "EncoderDecoderModelRunner") @@ -207,7 +207,7 @@ def execute_model( return [output] def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: dict[str, Any]) -> EncoderDecoderModelInput: + self, tensor_dict: Dict[str, Any]) -> EncoderDecoderModelInput: return EncoderDecoderModelInput.from_broadcasted_tensor_dict( tensor_dict, attn_backend=self.attn_backend, @@ -215,9 +215,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: 
List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None + finished_requests_ids: Optional[List[str]] = None ) -> EncoderDecoderModelInput: """Prepare the model input based on a given sequence group, including metadata for the sampling step. @@ -270,7 +270,7 @@ def profile_run(self) -> None: # Profile memory usage with max_num_sequences sequences and the total # number of tokens equal to max_num_batched_tokens. - seqs: list[SequenceGroupMetadata] = [] + seqs: List[SequenceGroupMetadata] = [] max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( self.model_config) @@ -332,9 +332,9 @@ def profile_run(self) -> None: def _prepare_encoder_model_input_tensors( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], model_input: EncoderDecoderModelInput, - ) -> tuple[AttentionMetadata, Optional[torch.Tensor], + ) -> Tuple[AttentionMetadata, Optional[torch.Tensor], Optional[torch.Tensor]]: """Helper method to prepare the encoder- and cross-attn-related model inputs based on a given sequence group. These additional inputs @@ -379,7 +379,7 @@ def _prepare_encoder_model_input_tensors( is_prompt = seq_group_metadata_list[0].is_prompt # Build encoder inputs - encoder_seq_lens: list[int] = [] + encoder_seq_lens: List[int] = [] if is_prompt: # Prefill phase. cross_block_tables = self._empty_int32_tensor().view( diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 57b916c2e2cdb..d6eaf84e40f6b 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -17,8 +17,8 @@ from array import array from dataclasses import dataclass, field from enum import IntEnum -from typing import (TYPE_CHECKING, Any, Callable, NamedTuple, Optional, - TypeVar, Union) +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, + Optional, Set, Tuple, Type, TypeVar, Union) import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc @@ -70,7 +70,7 @@ class Singleton(type): - _instances: dict[type, object] = {} + _instances: Dict[type, object] = {} def __call__(cls, *args, **kwargs): if cls not in cls._instances: @@ -80,18 +80,18 @@ def __call__(cls, *args, **kwargs): @dataclass class HPUBucketingGlobalState(metaclass=Singleton): - prompt_bs_bucket_cfg: tuple[int, int, int] = field(init=False) - decode_bs_bucket_cfg: tuple[int, int, int] = field(init=False) - prompt_seq_bucket_cfg: tuple[int, int, int] = field(init=False) - decode_block_bucket_cfg: tuple[int, int, int] = field(init=False) - prompt_buckets: list[tuple[int, int]] = field(init=False) - decode_buckets: list[tuple[int, int]] = field(init=False) + prompt_bs_bucket_cfg: Tuple[int, int, int] = field(init=False) + decode_bs_bucket_cfg: Tuple[int, int, int] = field(init=False) + prompt_seq_bucket_cfg: Tuple[int, int, int] = field(init=False) + decode_block_bucket_cfg: Tuple[int, int, int] = field(init=False) + prompt_buckets: List[Tuple[int, int]] = field(init=False) + decode_buckets: List[Tuple[int, int]] = field(init=False) def subtuple(obj: object, typename: str, - to_copy: list[str], - to_override: Optional[dict[str, object]] = None): + to_copy: List[str], + to_override: Optional[Dict[str, object]] = None): if obj is None: return None if to_override is None: @@ -123,7 +123,7 @@ def read_bucket_settings(phase: str, dim: str, **defaults): return values -def warmup_range(config: tuple[int, int, int]): +def warmup_range(config: Tuple[int, int, 
int]): """Generate a warmup range. Start from bmin and multiply by 2 until you reach bstep. @@ -225,7 +225,7 @@ def round_up(value: int, k: int): return (value + k - 1) // k * k -def find_bucket(value: int, config: tuple[int, int, int]): +def find_bucket(value: int, config: Tuple[int, int, int]): bmin, bstep, _ = config next_step = round_up(value, bstep) next_pow = next_pow2(value, bmin) @@ -406,16 +406,16 @@ def sample(self, *args, **kwargs): class PreparePromptMetadata(NamedTuple): input_tokens: torch.Tensor - input_positions: list[list[int]] + input_positions: List[List[int]] attn_metadata: Optional[AttentionMetadata] - seq_lens: list[int] - query_lens: list[int] - lora_index_mapping: list[list[int]] - lora_prompt_mapping: list[list[int]] - lora_requests: set[LoRARequest] - multi_modal_kwargs: Optional[dict[str, BatchedTensorInputs]] - slot_mapping: list[list[int]] - lora_ids: list[int] + seq_lens: List[int] + query_lens: List[int] + lora_index_mapping: List[List[int]] + lora_prompt_mapping: List[List[int]] + lora_requests: Set[LoRARequest] + multi_modal_kwargs: Optional[Dict[str, BatchedTensorInputs]] + slot_mapping: List[List[int]] + lora_ids: List[int] @classmethod def empty(cls): @@ -434,13 +434,13 @@ def empty(cls): class PrepareDecodeMetadata(NamedTuple): input_tokens: torch.Tensor - input_positions: list[list[int]] + input_positions: List[List[int]] attn_metadata: Optional[AttentionMetadata] - lora_index_mapping: list[list[int]] - lora_prompt_mapping: list[list[int]] - lora_requests: set[LoRARequest] - slot_mapping: list[list[int]] - lora_ids: list[int] + lora_index_mapping: List[List[int]] + lora_prompt_mapping: List[List[int]] + lora_requests: Set[LoRARequest] + slot_mapping: List[List[int]] + lora_ids: List[int] @classmethod def empty(cls): @@ -477,19 +477,19 @@ class ModelInputForHPU(ModelRunnerInputBase): """ input_tokens: Optional[torch.Tensor] = None input_positions: Optional[torch.Tensor] = None - seq_lens: Optional[list[int]] = None - query_lens: Optional[list[int]] = None + seq_lens: Optional[List[int]] = None + query_lens: Optional[List[int]] = None lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[set[LoRARequest]] = None + lora_requests: Optional[Set[LoRARequest]] = None attn_metadata: Optional["AttentionMetadata"] = None - multi_modal_kwargs: Optional[dict[str, torch.Tensor]] = None + multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None real_batch_size: Optional[int] = None batch_size_padded: Optional[int] = None virtual_engine: int = 0 - lora_ids: Optional[list[int]] = None + lora_ids: Optional[List[int]] = None async_callback: Optional[Callable] = None - def as_broadcastable_tensor_dict(self) -> dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -506,8 +506,8 @@ def as_broadcastable_tensor_dict(self) -> dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( - cls: type[TModelInputForHPU], - tensor_dict: dict[str, Any], + cls: Type[TModelInputForHPU], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> TModelInputForHPU: if attn_backend is not None: @@ -526,7 +526,7 @@ class ModelInputForHPUWithSamplingMetadata(ModelInputForHPU): # used by the driver worker. 
is_prompt: Optional[bool] = None - def as_broadcastable_tensor_dict(self) -> dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -543,7 +543,7 @@ def as_broadcastable_tensor_dict(self) -> dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "ModelInputForHPUWithSamplingMetadata": tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) @@ -558,7 +558,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): """ Helper class for shared methods between GPU model runners. """ - _model_input_cls: type[TModelInputForHPU] + _model_input_cls: Type[TModelInputForHPU] def __init__( self, @@ -754,7 +754,7 @@ def _setup_buckets(self) -> None: step=self.block_size, max=max(self.block_size, self.max_num_seqs * max_decode_seq // self.block_size)) - self.graphed_buckets: set[Any] = set() + self.graphed_buckets: Set[Any] = set() msg = ("Prompt bucket config (min, step, max_warmup) " f"bs:{self.bucketing_global_state.prompt_bs_bucket_cfg}, " @@ -768,20 +768,20 @@ def _setup_buckets(self) -> None: def _prepare_prompt( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], ) -> PreparePromptMetadata: - input_tokens: list[list[int]] = [] - input_positions: list[list[int]] = [] - slot_mapping: list[list[int]] = [] - lora_index_mapping: list[list[int]] = [] - lora_prompt_mapping: list[list[int]] = [] - lora_requests: set[LoRARequest] = set() - - seq_lens: list[int] = [] - context_lens: list[int] = [] - query_lens: list[int] = [] - prefix_block_tables: list[list[int]] = [] - multi_modal_kwargs_list: list[MultiModalKwargs] = [] + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + lora_index_mapping: List[List[int]] = [] + lora_prompt_mapping: List[List[int]] = [] + lora_requests: Set[LoRARequest] = set() + + seq_lens: List[int] = [] + context_lens: List[int] = [] + query_lens: List[int] = [] + prefix_block_tables: List[List[int]] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() @@ -885,7 +885,7 @@ def _prepare_prompt( self.bucketing_global_state.prompt_seq_bucket_cfg), self.block_size) - lora_ids: list[int] = [] + lora_ids: List[int] = [] for seq_group_metadata, context_len in zip(seq_group_metadata_list, context_lens): lora_id = seq_group_metadata.lora_int_id @@ -959,20 +959,20 @@ def _prepare_prompt( def _prepare_decode( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], ) -> PrepareDecodeMetadata: - input_tokens: list[list[int]] = [] - input_positions: list[list[int]] = [] - slot_mapping: list[list[int]] = [] - seq_lens: list[int] = [] - block_tables: list[list[int]] = [] - lora_index_mapping: list[list[int]] = [] - lora_prompt_mapping: list[list[int]] = [] - lora_requests: set[LoRARequest] = set() + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + seq_lens: List[int] = [] + block_tables: List[List[int]] = [] + lora_index_mapping: List[List[int]] = [] + lora_prompt_mapping: List[List[int]] = [] + lora_requests: Set[LoRARequest] = set() if len(seq_group_metadata_list) == 0: return 
PrepareDecodeMetadata.empty() - lora_ids: list[int] = [] + lora_ids: List[int] = [] dummy_slots = itertools.cycle( range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size)) @@ -1051,7 +1051,7 @@ def _prepare_decode( block_bucket_size = find_bucket( block_bucket_size, self.bucketing_global_state.decode_block_bucket_cfg) - indices: list[Any] + indices: List[Any] indices = [None] * block_bucket_size for i, bid in enumerate(block_list): indices[bid] = i @@ -1113,8 +1113,8 @@ def _prepare_decode( def prepare_input_tensors( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - ) -> tuple[TModelInputForHPU, SamplingMetadata]: + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[TModelInputForHPU, SamplingMetadata]: if len(seq_group_metadata_list) == 0: return self._model_input_cls(), None @@ -1366,8 +1366,8 @@ def warmup_scenario(self, # that will have unique loras, an therefore the max amount of memory # consumption create dummy lora request copies from the lora request # passed in, which contains a lora from the lora warmup path. - dummy_lora_requests: list[LoRARequest] = [] - dummy_lora_requests_per_seq: list[LoRARequest] = [] + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] if self.lora_config and is_lora_profile_run: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): @@ -1431,7 +1431,7 @@ def remove_all_loras(self): raise RuntimeError("LoRA is not enabled.") self.lora_manager.remove_all_adapters() - def set_active_loras(self, lora_requests: set[LoRARequest], + def set_active_loras(self, lora_requests: Set[LoRARequest], lora_mapping: LoRAMapping) -> None: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -1452,7 +1452,7 @@ def pin_lora(self, lora_id: int) -> bool: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.pin_adapter(lora_id) - def list_loras(self) -> set[int]: + def list_loras(self) -> Set[int]: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.list_adapters() @@ -1486,8 +1486,8 @@ def warmup_graphs(self, idx = 0 phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' num_candidates = len(buckets) - ordering : Union[Callable[[Any], tuple[Any, Any]], \ - Callable[[Any], tuple[Any, Any, Any]]] + ordering : Union[Callable[[Any], Tuple[Any, Any]], \ + Callable[[Any], Tuple[Any, Any, Any]]] if strategy == 'min_tokens': ordering = lambda b: (b[0] * b[1], b[1], b[0]) elif strategy == 'max_bs': @@ -1533,7 +1533,7 @@ def log_graph_warmup_summary(self, buckets, is_prompt, total_mem): logger.info(msg) @torch.inference_mode() - def warmup_model(self, kv_caches: list[torch.Tensor]) -> None: + def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: if profile := os.environ.get('VLLM_PT_PROFILE', None): phase, bs, seq_len, graph = profile.split('_') is_prompt = phase == 'prompt' @@ -1805,12 +1805,12 @@ class HPUModelRunner(HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): """ GPU model runner with sampling step. 
""" - _model_input_cls: type[ModelInputForHPUWithSamplingMetadata] = ( + _model_input_cls: Type[ModelInputForHPUWithSamplingMetadata] = ( ModelInputForHPUWithSamplingMetadata) def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], ) -> ModelInputForHPUWithSamplingMetadata: return ( ModelInputForHPUWithSamplingMetadata.from_broadcasted_tensor_dict( @@ -1821,9 +1821,9 @@ def make_model_input_from_broadcasted_tensor_dict( @torch.inference_mode() def prepare_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None + finished_requests_ids: Optional[List[str]] = None ) -> ModelInputForHPUWithSamplingMetadata: """Prepare the model input based on a given sequence group, including metadata for the sampling step. @@ -1862,7 +1862,7 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", phase, batch_size, seq_len) - def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: list[int], + def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int], is_prompt: bool): ''' This is a helper function to create the mask for lora computations. @@ -1936,11 +1936,11 @@ def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: list[int], def execute_model( self, model_input: ModelInputForHPUWithSamplingMetadata, - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, warmup_mode=False, - ) -> Optional[Union[list[SamplerOutput], IntermediateTensors]]: + ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( "num_steps > 1 is not supported in HPUModelRunner") diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 51f297bc9745e..ccb175d88fd3c 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -7,7 +7,7 @@ import contextlib import gc import os -from typing import Optional +from typing import List, Optional, Set, Tuple, Type import habana_frameworks.torch as htorch # noqa:F401 import torch @@ -49,7 +49,7 @@ def __init__( rank: int, distributed_init_method: str, is_driver_worker: bool = False, - model_runner_cls: Optional[type[ModelRunnerBase]] = None, + model_runner_cls: Optional[Type[ModelRunnerBase]] = None, ) -> None: WorkerBase.__init__(self, vllm_config=vllm_config) self.parallel_config.rank = rank @@ -69,9 +69,9 @@ def __init__( vllm_config=vllm_config, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. - self.cache_engine: list[HPUCacheEngine] + self.cache_engine: List[HPUCacheEngine] # Initialize gpu_cache as pooling models don't initialize kv_caches - self.hpu_cache: Optional[list[list[torch.Tensor]]] = None + self.hpu_cache: Optional[List[List[torch.Tensor]]] = None # Torch profiler. 
Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace if envs.VLLM_TORCH_PROFILER_DIR: @@ -131,7 +131,7 @@ def load_model(self): def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501 # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501 # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501 @@ -193,7 +193,7 @@ def execute_model( return output @torch.inference_mode() - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """Profiles the peak memory usage of the model to determine how many KV blocks may be allocated without OOMs. @@ -305,7 +305,7 @@ def do_metadata_broadcast(self) -> bool: return self.parallel_config.tensor_parallel_size > 1 @property - def kv_cache(self) -> Optional[list[list[torch.Tensor]]]: + def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: return self.hpu_cache @torch.inference_mode() @@ -361,7 +361,7 @@ def remove_lora(self, lora_id: int) -> bool: def pin_lora(self, lora_id: int) -> bool: return self.model_runner.pin_lora(lora_id) - def list_loras(self) -> set[int]: + def list_loras(self) -> Set[int]: return self.model_runner.list_loras() def add_prompt_adapter( @@ -377,7 +377,7 @@ def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: raise NotImplementedError( "Prompt Adapter is not implemented for HPU backend.") - def list_prompt_adapters(self) -> set[int]: + def list_prompt_adapters(self) -> Set[int]: raise NotImplementedError( "Prompt Adapter is not implemented for HPU backend.") @@ -465,11 +465,11 @@ def _allocate_kv_cache( self, num_blocks: int, device: str, - ) -> list[tuple[torch.Tensor, torch.Tensor]]: + ) -> List[Tuple[torch.Tensor, torch.Tensor]]: """Allocates KV cache on the specified device.""" kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size) - kv_cache: list[tuple[torch.Tensor, torch.Tensor]] = [] + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] for _ in range(self.num_attention_layers): key_cache = torch.zeros(kv_cache_shape, dtype=self.dtype, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 591287608b5b0..a37a3168bbbc7 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -8,7 +8,8 @@ import weakref from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, + Tuple, Type, TypeVar, Union) import numpy as np import torch @@ -85,22 +86,22 @@ class ModelInputForGPU(ModelRunnerInputBase): input_tokens: Optional[torch.Tensor] = None input_positions: Optional[torch.Tensor] = None token_types: Optional[torch.Tensor] = None - seq_lens: Optional[list[int]] = None - query_lens: Optional[list[int]] = None + seq_lens: Optional[List[int]] = None + query_lens: Optional[List[int]] = None lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[set[LoRARequest]] = None + lora_requests: Optional[Set[LoRARequest]] = None 
attn_metadata: Optional["AttentionMetadata"] = None prompt_adapter_mapping: Optional[PromptAdapterMapping] = None - prompt_adapter_requests: Optional[set[PromptAdapterRequest]] = None + prompt_adapter_requests: Optional[Set[PromptAdapterRequest]] = None multi_modal_kwargs: Optional[BatchedTensorInputs] = None - request_ids_to_seq_ids: Optional[dict[str, list[int]]] = None - finished_requests_ids: Optional[list[str]] = None + request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None + finished_requests_ids: Optional[List[str]] = None virtual_engine: int = 0 async_callback: Optional[Callable] = None scheduler_outputs: Optional[SchedulerOutputs] = None previous_hidden_states: Optional[torch.Tensor] = None - def as_broadcastable_tensor_dict(self) -> dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -118,8 +119,8 @@ def as_broadcastable_tensor_dict(self) -> dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( - cls: type[TModelInputForGPU], - tensor_dict: dict[str, Any], + cls: Type[TModelInputForGPU], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> TModelInputForGPU: if attn_backend is not None: @@ -150,7 +151,7 @@ class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): # used by the driver worker. is_prompt: Optional[bool] = None - def as_broadcastable_tensor_dict(self) -> dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -171,7 +172,7 @@ def as_broadcastable_tensor_dict(self) -> dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "ModelInputForGPUWithSamplingMetadata": tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) @@ -211,43 +212,43 @@ def __init__( *, # From sequence group metadata. request_id: str, - seq_ids: list[int], + seq_ids: List[int], is_prompt: bool, - block_tables: Optional[dict[int, list[int]]], - computed_block_nums: list[int], + block_tables: Optional[Dict[int, List[int]]], + computed_block_nums: List[int], n_seqs: int = 0, # Input tokens and positions. - input_tokens: Optional[list[list[int]]] = None, - input_positions: Optional[list[list[int]]] = None, - token_types: Optional[list[list[int]]] = None, - mrope_input_positions: Optional[list[list[list[int]]]] = None, + input_tokens: Optional[List[List[int]]] = None, + input_positions: Optional[List[List[int]]] = None, + token_types: Optional[List[List[int]]] = None, + mrope_input_positions: Optional[List[List[List[int]]]] = None, # The sequence length (may be capped to the sliding window). - seq_lens: Optional[list[int]] = None, + seq_lens: Optional[List[int]] = None, # The original sequence length (before applying sliding window). # This is used to compute slot mapping. - orig_seq_lens: Optional[list[int]] = None, + orig_seq_lens: Optional[List[int]] = None, # The query length. - query_lens: Optional[list[int]] = None, + query_lens: Optional[List[int]] = None, # The number of tokens that are already computed. - context_lens: Optional[list[int]] = None, + context_lens: Optional[List[int]] = None, # The current sliding window block. - curr_sliding_window_blocks: Optional[list[int]] = None, + curr_sliding_window_blocks: Optional[List[int]] = None, # LoRA inputs. 
- lora_index_mapping: Optional[list[list[int]]] = None, - lora_prompt_mapping: Optional[list[list[int]]] = None, - lora_requests: Optional[set[LoRARequest]] = None, + lora_index_mapping: Optional[List[List[int]]] = None, + lora_prompt_mapping: Optional[List[List[int]]] = None, + lora_requests: Optional[Set[LoRARequest]] = None, # Prompt adapter inputs. - prompt_adapter_index_mapping: Optional[list[int]] = None, - prompt_adapter_prompt_mapping: Optional[list[int]] = None, + prompt_adapter_index_mapping: Optional[List[int]] = None, + prompt_adapter_prompt_mapping: Optional[List[int]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, # Multi-modal inputs. multi_modal_kwargs: Optional[MultiModalKwargs] = None, - multi_modal_placeholder_maps: Optional[dict[ + multi_modal_placeholder_maps: Optional[Dict[ str, MultiModalPlaceholderMap]] = None, # Whether the prefix cache is hit (prefill only). @@ -429,7 +430,7 @@ def reset_cached_inter_data(self): def __init__(self, runner: "GPUModelRunnerBase", - finished_requests_ids: Optional[list[str]] = None): + finished_requests_ids: Optional[List[str]] = None): super().__init__() # Compute functions for each sequence in a sequence group. # WARNING: The order of the functions matters! @@ -474,7 +475,7 @@ def __init__(self, self.sliding_window_blocks * self.block_size def prepare(self, - finished_requests_ids: Optional[list[str]] = None) -> None: + finished_requests_ids: Optional[List[str]] = None) -> None: self.finished_requests_ids = finished_requests_ids # if the current batch is decode-only. @@ -483,7 +484,7 @@ def prepare(self, # Intermediate data (data in CPU before going to GPU) for # the current sequence group. - self.inter_data_list: list[ + self.inter_data_list: List[ ModelInputForGPUBuilder.InterDataForSeqGroup] = [] self.attn_metadata_builder.prepare() @@ -834,7 +835,7 @@ def build(self) -> ModelInputForGPU: # prefix caching and there is no decode request. return self.model_input_cls() - mrope_input_positions: Optional[list[list[int]]] = None + mrope_input_positions: Optional[List[List[int]]] = None if any(inter_data.mrope_input_positions is not None for inter_data in self.inter_data_list): mrope_input_positions = [[] for _ in range(3)] @@ -948,7 +949,7 @@ def build(self) -> ModelInputForGPU: is_prefill=not self.decode_only)) # Prompt adapter data. - prompt_adapter_requests: set[PromptAdapterRequest] = set() + prompt_adapter_requests: Set[PromptAdapterRequest] = set() prompt_adapter_mapping = None if self.enable_prompt_adapter: prompt_adapter_requests = set( @@ -997,8 +998,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): """ Helper class for shared methods between GPU model runners. """ - _model_input_cls: type[TModelInputForGPU] - _builder_cls: type[ModelInputForGPUBuilder] + _model_input_cls: Type[TModelInputForGPU] + _builder_cls: Type[ModelInputForGPUBuilder] builder: ModelInputForGPUBuilder def __init__( @@ -1028,10 +1029,10 @@ def __init__( self.max_batchsize_to_capture = \ self.vllm_config.compilation_config.max_capture_size - self.graph_runners: list[dict[int, CUDAGraphRunner]] = [ + self.graph_runners: List[Dict[int, CUDAGraphRunner]] = [ {} for _ in range(self.parallel_config.pipeline_parallel_size) ] - self.graph_memory_pool: Optional[tuple[ + self.graph_memory_pool: Optional[Tuple[ int, int]] = None # Set during graph capture. 
self.has_inner_state = model_config.has_inner_state @@ -1089,7 +1090,7 @@ def __init__( int(self.cache_config.cpu_offload_gb * 1024**3)) # Used to cache python objects - self.inter_data_cache: dict[int, PyObjectCache] = {} + self.inter_data_cache: Dict[int, PyObjectCache] = {} # Using the PythonizationCache in Pipeline-Parallel clobbers the # SequenceGroupToSample object. In Pipeline-Parallel, we have @@ -1196,8 +1197,8 @@ def get_max_block_per_batch(self) -> int: def _prepare_model_input_tensors( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - finished_requests_ids: Optional[list[str]] = None + seq_group_metadata_list: List[SequenceGroupMetadata], + finished_requests_ids: Optional[List[str]] = None ) -> TModelInputForGPU: """Helper method to prepare the model input based on a given sequence group. Prepares metadata needed for the base model forward pass but not @@ -1253,8 +1254,8 @@ def _dummy_run(self, # that will have unique loras, an therefore the max amount of memory # consumption create dummy lora request copies from the lora request # passed in, which contains a lora from the lora warmup path. - dummy_lora_requests: list[LoRARequest] = [] - dummy_lora_requests_per_seq: list[LoRARequest] = [] + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] if self.lora_config: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): @@ -1275,7 +1276,7 @@ def _dummy_run(self, # Profile memory usage with max_num_sequences sequences and the # total number of tokens equal to max_num_batched_tokens. - seqs: list[SequenceGroupMetadata] = [] + seqs: List[SequenceGroupMetadata] = [] # Additional GPU memory may be needed for multi-modal encoding, # which needs to be accounted for when calculating the GPU blocks # for vLLM blocker manager. 
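A hedged sketch of the _model_input_cls pattern visible in the runner base classes above (GPUModelRunnerBase and friends): a base class generic over its model-input type keeps that type in a Type[...] class attribute so shared helpers can construct whichever concrete input class a subclass declares. All names below (ToyModelInput, ToyRunnerBase, ToyRunner) are illustrative, not vLLM classes:

from dataclasses import dataclass
from typing import Any, Dict, Generic, Type, TypeVar


@dataclass
class ToyModelInput:
    input_tokens: list


TToyInput = TypeVar("TToyInput", bound=ToyModelInput)


class ToyRunnerBase(Generic[TToyInput]):
    # Subclasses declare which concrete input class the shared helpers should build.
    _model_input_cls: Type[TToyInput]

    def make_model_input(self, tensor_dict: Dict[str, Any]) -> TToyInput:
        return self._model_input_cls(**tensor_dict)


class ToyRunner(ToyRunnerBase[ToyModelInput]):
    _model_input_cls: Type[ToyModelInput] = ToyModelInput


assert ToyRunner().make_model_input({"input_tokens": [1, 2]}).input_tokens == [1, 2]

The same equivalence as before applies here: Type[ToyModelInput] and type[ToyModelInput] are interchangeable annotations on Python 3.9+.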
@@ -1363,7 +1364,7 @@ def remove_all_loras(self): raise RuntimeError("LoRA is not enabled.") self.lora_manager.remove_all_adapters() - def set_active_loras(self, lora_requests: set[LoRARequest], + def set_active_loras(self, lora_requests: Set[LoRARequest], lora_mapping: LoRAMapping) -> None: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -1384,7 +1385,7 @@ def pin_lora(self, lora_id: int) -> bool: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.pin_adapter(lora_id) - def list_loras(self) -> set[int]: + def list_loras(self) -> Set[int]: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.list_adapters() @@ -1395,7 +1396,7 @@ def remove_all_prompt_adapters(self): self.prompt_adapter_manager.remove_all_adapters() def set_active_prompt_adapters( - self, prompt_adapter_requests: set[PromptAdapterRequest], + self, prompt_adapter_requests: Set[PromptAdapterRequest], prompt_adapter_mapping: PromptAdapterMapping) -> None: if not self.prompt_adapter_manager: raise RuntimeError("PromptAdapter is not enabled.") @@ -1418,13 +1419,13 @@ def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: raise RuntimeError("PromptAdapter is not enabled.") return self.prompt_adapter_manager.pin_adapter(prompt_adapter_id) - def list_prompt_adapters(self) -> set[int]: + def list_prompt_adapters(self) -> Set[int]: if not self.prompt_adapter_manager: raise RuntimeError("PromptAdapter is not enabled.") return self.prompt_adapter_manager.list_adapters() @torch.inference_mode() - def capture_model(self, kv_caches: list[list[torch.Tensor]]) -> None: + def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: """Cuda graph capture a model. Note that CUDA graph's performance gain is negligible if number @@ -1570,7 +1571,7 @@ def capture_model(self, kv_caches: list[list[torch.Tensor]]) -> None: elapsed_time, cuda_graph_size / GiB_bytes) def _update_inputs_to_capture_for_enc_dec_model(self, - capture_inputs: dict[str, + capture_inputs: Dict[str, Any]): """ Updates the set of input tensors needed for CUDA graph capture in an @@ -1598,13 +1599,13 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): """ GPU model runner with sampling step. """ - _model_input_cls: type[ModelInputForGPUWithSamplingMetadata] = ( + _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = ( ModelInputForGPUWithSamplingMetadata) - _builder_cls: type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder + _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], ) -> ModelInputForGPUWithSamplingMetadata: model_input = \ ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( @@ -1615,9 +1616,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None, + finished_requests_ids: Optional[List[str]] = None, ) -> ModelInputForGPUWithSamplingMetadata: """Prepare the model input based on a given sequence group, including metadata for the sampling step. 
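For the List[List[torch.Tensor]] caches that capture_model and the kv_cache properties above traffic in, a toy illustration of the nesting. Sizes and shapes are invented, and the reading that the outer index is the cache engine (one per pipeline virtual engine) while the inner list holds one tensor per attention layer is an inference from the worker code above, not a statement from the patch:

from typing import List

import torch

num_virtual_engines = 1      # illustrative: one cache engine per pipeline stage
num_layers = 2               # illustrative layer count
kv_cache_shape = (8, 16)     # illustrative per-layer cache shape

kv_caches: List[List[torch.Tensor]] = [
    [torch.empty(kv_cache_shape) for _ in range(num_layers)]
    for _ in range(num_virtual_engines)
]
assert len(kv_caches) == num_virtual_engines
assert len(kv_caches[0]) == num_layers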
@@ -1654,11 +1655,11 @@ def prepare_model_input( def execute_model( self, model_input: ModelInputForGPUWithSamplingMetadata, - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, **kwargs, - ) -> Optional[Union[list[SamplerOutput], IntermediateTensors]]: + ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError("num_steps > 1 is not supported in ModelRunner") @@ -1884,8 +1885,8 @@ def __init__(self, model: nn.Module, backend_name: str, self.backend_name = backend_name self.attn_state = attn_state - self.input_buffers: dict[str, torch.Tensor] = {} - self.output_buffers: dict[str, torch.Tensor] = {} + self.input_buffers: Dict[str, torch.Tensor] = {} + self.output_buffers: Dict[str, torch.Tensor] = {} self._graph: Optional[torch.cuda.CUDAGraph] = None self._is_encoder_decoder_model = is_encoder_decoder_model @@ -1900,9 +1901,9 @@ def capture( input_ids: torch.Tensor, positions: torch.Tensor, intermediate_inputs: Optional[IntermediateTensors], - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, - memory_pool: Optional[tuple[int, int]], + memory_pool: Optional[Tuple[int, int]], stream: torch.cuda.Stream, **kwargs, ): diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 95ecbc1de264d..935325cb2e1c0 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -2,7 +2,8 @@ import dataclasses from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar +from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, + TypeVar) import torch import torch.nn as nn @@ -23,7 +24,7 @@ def _add_attn_metadata_broadcastable_dict( - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], attn_metadata: Optional["AttentionMetadata"]) -> None: """ Helper method to update tensor_dict with broadcastable @@ -35,8 +36,8 @@ def _add_attn_metadata_broadcastable_dict( def _init_attn_metadata_from_tensor_dict( attn_backend: "AttentionBackend", - tensor_dict: dict[str, Any], -) -> dict[str, Any]: + tensor_dict: Dict[str, Any], +) -> Dict[str, Any]: """ Helper method to initialize AttentionMetadata based on an AttentionBackend and broadcastable AttentionMetadata fields. @@ -56,7 +57,7 @@ def _init_attn_metadata_from_tensor_dict( def _init_sampling_metadata_from_tensor_dict( # type: ignore - tensor_dict: dict[str, Any]) -> dict[str, Any]: + tensor_dict: Dict[str, Any]) -> Dict[str, Any]: """ Helper method to initialize SamplingMetadata based on broadcastable SamplingMetadata fields. 
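A minimal sketch of the broadcast round-trip that the model_runner_base helpers above serve: the driver flattens its model input into a plain dict of broadcastable fields, and a classmethod rebuilds the same input type on the receiving worker. The class name ToyBroadcastableInput is a toy stand-in; the two fields merely mirror ones visible in the hunks above:

from dataclasses import dataclass
from typing import Any, Dict, Optional, Type, TypeVar

import torch

TToyModelInput = TypeVar("TToyModelInput", bound="ToyBroadcastableInput")


@dataclass
class ToyBroadcastableInput:
    input_tokens: Optional[torch.Tensor] = None
    virtual_engine: int = 0

    def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
        # Only fields that survive a tensor broadcast go into the dict.
        return {
            "input_tokens": self.input_tokens,
            "virtual_engine": self.virtual_engine,
        }

    @classmethod
    def from_broadcasted_tensor_dict(
            cls: Type[TToyModelInput],
            tensor_dict: Dict[str, Any]) -> TToyModelInput:
        return cls(**tensor_dict)


sent = ToyBroadcastableInput(input_tokens=torch.tensor([1, 2, 3]))
received = ToyBroadcastableInput.from_broadcasted_tensor_dict(
    sent.as_broadcastable_tensor_dict())
assert torch.equal(received.input_tokens, sent.input_tokens)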
@@ -77,7 +78,7 @@ def _init_sampling_metadata_from_tensor_dict( # type: ignore def _add_sampling_metadata_broadcastable_dict( - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], sampling_metadata: Optional["SamplingMetadata"]) -> None: """ Helper method to update tensor_dict with broadcastable @@ -89,8 +90,8 @@ def _add_sampling_metadata_broadcastable_dict( def _init_frozen_model_input_from_tensor_dict( - frozen_model_input_cls: type["ModelRunnerInputBase"], - tensor_dict: dict[str, Any]) -> dict[str, Any]: + frozen_model_input_cls: Type["ModelRunnerInputBase"], + tensor_dict: Dict[str, Any]) -> Dict[str, Any]: """ Helper method to initialize a frozen ModelInput based on broadcastable """ @@ -108,7 +109,7 @@ def _init_frozen_model_input_from_tensor_dict( class BroadcastableModelInput(ABC): @abstractmethod - def as_broadcastable_tensor_dict(self) -> dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: """ Extract broadcastable fields. Override for fields that require some custom deserialization. @@ -118,8 +119,8 @@ def as_broadcastable_tensor_dict(self) -> dict[str, Any]: @classmethod @abstractmethod def from_broadcasted_tensor_dict( - cls: type[T], - tensor_dict: dict[str, Any], + cls: Type[T], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> T: """ @@ -149,7 +150,7 @@ class ModelRunnerInputBuilderBase(ABC, Generic[T]): @abstractmethod def prepare(self, - finished_requests_ids: Optional[list[str]] = None) -> None: + finished_requests_ids: Optional[List[str]] = None) -> None: raise NotImplementedError @abstractmethod @@ -190,12 +191,12 @@ def __init__( self.observability_config = vllm_config.observability_config # Map of request_id -> generator used for seeded random sampling - generators: dict[str, torch.Generator] = {} + generators: Dict[str, torch.Generator] = {} @abstractmethod def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], ) -> T: """ Make an instance of a ModelRunnerInputBase from the broadcasted tensor @@ -206,9 +207,9 @@ def make_model_input_from_broadcasted_tensor_dict( @abstractmethod def prepare_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None, + finished_requests_ids: Optional[List[str]] = None, ) -> T: """ Prepare the inputs to ModelRunnerBase.execute_model from an execution @@ -224,17 +225,17 @@ def get_model(self) -> nn.Module: def execute_model( self, model_input: T, - kv_caches: Optional[list[torch.Tensor]], + kv_caches: Optional[List[torch.Tensor]], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, **kwargs, - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: """ Execute the model on the given input. """ raise NotImplementedError - def get_generators(self, finished_request_ids: Optional[list[str]] = None): + def get_generators(self, finished_request_ids: Optional[List[str]] = None): """ Return dict of per-request generators used for random sampling. 
""" diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index f77411755a0ee..7ddf382079c62 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -3,7 +3,8 @@ import dataclasses import functools from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Callable, Optional, Union +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, + Union) import torch @@ -36,7 +37,7 @@ MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN", "FLASHINFER"] def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ - -> list[str]: + -> List[str]: if chunked_prefill_enabled: return MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS else: @@ -145,8 +146,8 @@ class StatefulModelInput(BroadcastableModelInput): # actual frozen model input dataclass passed to _base_model_runner frozen_model_input: Optional[ModelInputForGPUWithSamplingMetadata] = None - # List of model outputs for each step, may not be all pythonized - cached_outputs: list[ModelOutput] = field(default_factory=list) + # list of model outputs for each step, may not be all pythonized + cached_outputs: List[ModelOutput] = field(default_factory=list) # used to pass sampled token ids from the last step to the current step for # TP workers. Used to append to end of outputs and used by advance_step @@ -157,13 +158,13 @@ class StatefulModelInput(BroadcastableModelInput): is_first_multi_step: bool = False base_output_proc_callback: Optional[Callable] = None # ping-pong data structures for multi-step to wait on the previous step - step_cuda_events: list[torch.cuda.Event] = field( + step_cuda_events: List[torch.cuda.Event] = field( default_factory=lambda: [torch.cuda.Event(blocking=True)] * 2) num_seqs: int = -1 num_queries: int = -1 num_single_step_prefills: int = 0 - def as_broadcastable_tensor_dict(self) -> dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: assert self.frozen_model_input is not None tensor_dict = self.frozen_model_input.as_broadcastable_tensor_dict() new_tensor_dict = { @@ -182,7 +183,7 @@ def as_broadcastable_tensor_dict(self) -> dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "StatefulModelInput": tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) @@ -311,7 +312,7 @@ def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs): super().__init__(*args, **kwargs) # Check attention backend support. 
- supported_attention_backends: list[str] = \ + supported_attention_backends: List[str] = \ _get_supported_attention_backends( self.scheduler_config.chunked_prefill_enabled) if self.attn_backend.get_name() not in supported_attention_backends: @@ -345,7 +346,7 @@ def _copy_stream(self): return torch.cuda.Stream() def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: dict[str, Any]) -> StatefulModelInput: + self, tensor_dict: Dict[str, Any]) -> StatefulModelInput: model_input = (StatefulModelInput.from_broadcasted_tensor_dict( tensor_dict, attn_backend=self.attn_backend, @@ -354,9 +355,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None + finished_requests_ids: Optional[List[str]] = None ) -> StatefulModelInput: frozen_model_input: ModelInputForGPUWithSamplingMetadata = \ self._base_model_runner.prepare_model_input( @@ -409,7 +410,7 @@ def _async_process_outputs(self, model_input: StatefulModelInput, def _final_process_outputs( self, model_input: StatefulModelInput, - output_proc_callback: Optional[Callable]) -> list[SamplerOutput]: + output_proc_callback: Optional[Callable]) -> List[SamplerOutput]: assert model_input.frozen_model_input is not None has_async_callback = output_proc_callback is not None @@ -460,10 +461,10 @@ def _final_process_outputs( def execute_model( self, model_input: StatefulModelInput, - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[Union[list[SamplerOutput], IntermediateTensors]]: + ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: """ Execute the model for a single step and update multi-step metadata @@ -670,7 +671,7 @@ def profile_run(self) -> None: def remove_all_loras(self): return self._base_model_runner.remove_all_loras() - def capture_model(self, kv_caches: list[list]) -> None: + def capture_model(self, kv_caches: List[List]) -> None: return self._base_model_runner.capture_model(kv_caches) @property @@ -678,8 +679,8 @@ def vocab_size(self) -> int: return self._base_model_runner.vocab_size -DeferredLogprobsReturnType = tuple[Optional[list[Optional[PromptLogprobs]]], - Optional[list[SampleLogprobs]]] +DeferredLogprobsReturnType = Tuple[Optional[List[Optional[PromptLogprobs]]], + Optional[List[SampleLogprobs]]] def deferred_pythonize_logprobs( @@ -853,7 +854,7 @@ def _pythonize_sampler_output( seq_ids = seq_group.seq_ids next_token_ids = sample_result parent_ids = [0] - seq_outputs: list[SequenceOutput] + seq_outputs: List[SequenceOutput] if cache is not None: completion_seq_group_output: CompletionSequenceGroupOutput = \ diff --git a/vllm/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py index 887af34660217..3871199987cee 100644 --- a/vllm/worker/multi_step_tpu_worker.py +++ b/vllm/worker/multi_step_tpu_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import Optional +from typing import Dict, Optional, Tuple import torch @@ -20,7 +20,7 @@ def __init__(self, *args, **kwargs): def _get_driver_input_and_broadcast( self, execute_model_req: ExecuteModelRequest - ) -> tuple[ModelInputForTPU, WorkerInput, dict[str, torch.Tensor]]: + ) -> Tuple[ModelInputForTPU, WorkerInput, Dict[str, torch.Tensor]]: assert self.is_driver_worker assert 
execute_model_req.virtual_engine == 0 @@ -71,7 +71,7 @@ def _get_driver_input_and_broadcast( def prepare_input( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[tuple[ModelInputForTPU, WorkerInput, dict[str, + ) -> Optional[Tuple[ModelInputForTPU, WorkerInput, Dict[str, torch.Tensor]]]: if self.is_driver_worker: if execute_model_req is None: diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py index d3f73fea203e3..3518ab2f64fed 100644 --- a/vllm/worker/multi_step_worker.py +++ b/vllm/worker/multi_step_worker.py @@ -2,7 +2,7 @@ import dataclasses from dataclasses import dataclass -from typing import Optional +from typing import Dict, List, Optional, Tuple import torch @@ -35,13 +35,13 @@ def __init__(self, *args, **kwargs): ) pipeline_parallel_size = self.parallel_config.pipeline_parallel_size - self.multi_step_states: list[ + self.multi_step_states: List[ Optional[MultiStepState]] = [None] * pipeline_parallel_size self.temp_output = None def _get_driver_input_and_broadcast( self, execute_model_req: ExecuteModelRequest - ) -> tuple[BroadcastableModelInput, WorkerInput, dict[str, torch.Tensor]]: + ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: """ Get the driver input and broadcast it to other workers. """ @@ -136,7 +136,7 @@ def _prepare_last_sampled_token_ids_for_tp_workers( def prepare_input( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[tuple[StatefulModelInput, WorkerInput, dict[str, + ) -> Optional[Tuple[StatefulModelInput, WorkerInput, Dict[str, torch.Tensor]]]: """ Depending on the current state of the request and multi step worker, diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index b4b4b3535d954..f2093fc42ad16 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -3,7 +3,7 @@ import os from dataclasses import dataclass from importlib.util import find_spec -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch from torch import nn @@ -39,13 +39,13 @@ class ModelInputForNeuron(ModelRunnerInputBase): multi_modal_kwargs: Optional[BatchedTensorInputs] = None def as_broadcastable_tensor_dict( - self) -> dict[str, Union[int, torch.Tensor]]: + self) -> Dict[str, Union[int, torch.Tensor]]: raise NotImplementedError("ModelInputForNeuron cannot be broadcast.") @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "ModelInputForNeuron": assert attn_backend is None @@ -85,7 +85,7 @@ def __init__( # NEURON needs to update sampling parameters when request IDs change # across batches. This variable stores the previous batch's request IDs # to determine if an update is needed. 
- self._previous_batch_request_ids: list[str] = [] + self._previous_batch_request_ids: List[str] = [] if not self._on_device_sampling_disabled: logger.warning( @@ -120,16 +120,16 @@ def get_model(self) -> nn.Module: def _prepare_prompt( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, list[int], + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int], BatchedTensorInputs]: assert len(seq_group_metadata_list) > 0 - input_tokens: list[list[int]] = [] - input_positions: list[list[int]] = [] - input_block_ids: list[int] = [] + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + input_block_ids: List[int] = [] - seq_lens: list[int] = [] - multi_modal_kwargs_list: list[MultiModalKwargs] = [] + seq_lens: List[int] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -184,13 +184,13 @@ def _prepare_prompt( def _prepare_decode( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: assert len(seq_group_metadata_list) > 0 - input_tokens: list[list[int]] = [] - input_positions: list[list[int]] = [] - input_block_ids: list[int] = [] - context_lens: list[int] = [] + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + input_block_ids: List[int] = [] + context_lens: List[int] = [] for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt @@ -232,14 +232,14 @@ def _prepare_decode( return input_tokens, input_positions, input_block_ids def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: dict[str, Any]) -> ModelInputForNeuron: + self, tensor_dict: Dict[str, Any]) -> ModelInputForNeuron: return ModelInputForNeuron.from_broadcasted_tensor_dict(tensor_dict) def prepare_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None + finished_requests_ids: Optional[List[str]] = None ) -> ModelInputForNeuron: multi_modal_kwargs = None # NOTE: We assume that all sequences in the group are all prompts or @@ -312,10 +312,10 @@ def _convert_to_neuron_top_k(self, top_k: int) -> int: def execute_model( self, model_input: ModelInputForNeuron, - kv_caches: Optional[list[torch.Tensor]] = None, + kv_caches: Optional[List[torch.Tensor]] = None, intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: if num_steps > 1: raise ValueError( "NeuronModelRunner does not support multi-step execution.") diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index c229c283dbf50..df651e05a7bbc 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """A Neuron worker class.""" -from typing import Optional +from typing import List, Optional, Tuple import torch import torch.distributed @@ -45,7 +45,7 @@ def __init__( def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[list[SamplerOutput]]: + ) -> 
Optional[List[SamplerOutput]]: assert execute_model_req is not None assert (not execute_model_req.blocks_to_swap_in and not execute_model_req.blocks_to_swap_out @@ -66,7 +66,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available KV blocks. Swapping is not yet supported, so always return num_cpu_blocks=0. @@ -100,7 +100,7 @@ def do_metadata_broadcast(self) -> bool: return False @property - def kv_cache(self) -> Optional[list[list[torch.Tensor]]]: + def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: return None @torch.inference_mode() diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index cb2857b5fc687..5035ea20294c4 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from collections import defaultdict -from typing import NamedTuple, Optional +from typing import Dict, List, NamedTuple, Optional, Tuple import openvino as ov import torch @@ -27,8 +27,8 @@ class ModelInput(NamedTuple): input_tokens: torch.Tensor input_positions: torch.Tensor attn_metadata: Optional[OpenVINOAttentionMetadata] - seq_lens: list[int] - query_lens: list[int] + seq_lens: List[int] + query_lens: List[int] multi_modal_kwargs: BatchedTensorInputs @classmethod @@ -88,7 +88,7 @@ def get_model(self) -> nn.Module: def _prepare_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], ) -> ModelInput: """Prepare the model input based on a given sequence group. @@ -100,20 +100,20 @@ def _prepare_model_input( - input_tokens[:num_prefill_tokens] contains prefill tokens. - input_tokens[num_prefill_tokens:] contains decode tokens. """ - input_tokens: list[int] = [] - input_positions: list[int] = [] - - seq_lens: list[int] = [] - past_lens: list[int] = [] - query_lens: list[int] = [] - multi_modal_kwargs_list: list[MultiModalKwargs] = [] - multi_modal_placeholder_maps: dict[ + input_tokens: List[int] = [] + input_positions: List[int] = [] + + seq_lens: List[int] = [] + past_lens: List[int] = [] + query_lens: List[int] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - subsequence_begins: list[int] = [] - block_indices: list[int] = [] - block_indices_begins: list[int] = [] + subsequence_begins: List[int] = [] + block_indices: List[int] = [] + block_indices_begins: List[int] = [] # initialize beginning of prefix sums subsequence_begins.append(0) @@ -297,8 +297,8 @@ def _prepare_model_input( def prepare_input_tensors( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - ) -> tuple[torch.Tensor, torch.Tensor, OpenVINOAttentionMetadata, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, OpenVINOAttentionMetadata, SamplingMetadata, BatchedTensorInputs]: # Prepare input tensors. 
( @@ -329,8 +329,8 @@ def prepare_input_tensors( @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - kv_caches: list[tuple["ov.Tensor", "ov.Tensor"]], + seq_group_metadata_list: List[SequenceGroupMetadata], + kv_caches: List[Tuple["ov.Tensor", "ov.Tensor"]], ) -> Optional[SamplerOutput]: ( input_tokens, diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index fc70c9a2d8c74..fad91270ea2a4 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """An OpenVINO worker class.""" -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Tuple import openvino as ov import torch @@ -80,13 +80,13 @@ def __init__( ) # Initialize the cache. - self.kv_cache: list[tuple[ov.Tensor, + self.kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] = self._allocate_kv_cache( self.num_device_blocks, ov_core, ov_device) # Initialize the swap. - self.swap_cache: list[tuple[ov.Tensor, + self.swap_cache: List[Tuple[ov.Tensor, ov.Tensor]] = self._allocate_swap_cache( self.num_swap_blocks, ov_device) @@ -95,11 +95,11 @@ def _allocate_kv_cache( num_blocks: int, ov_core: ov.Core, ov_device: str, - ) -> list[tuple[ov.Tensor, ov.Tensor]]: + ) -> List[Tuple[ov.Tensor, ov.Tensor]]: """Allocates KV cache.""" k_block_shape = v_block_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size)[1:] - kv_cache: list[tuple[ov.Tensor, ov.Tensor]] = [] + kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] = [] if current_platform.is_openvino_cpu(): for _ in range(self.num_layers): @@ -134,11 +134,11 @@ def _allocate_swap_cache( self, num_blocks: int, ov_device: str, - ) -> list[tuple[ov.Tensor, ov.Tensor]]: + ) -> List[Tuple[ov.Tensor, ov.Tensor]]: """Allocates swap cache.""" k_block_shape = v_block_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size)[1:] - swap_cache: list[tuple[ov.Tensor, ov.Tensor]] = [] + swap_cache: List[Tuple[ov.Tensor, ov.Tensor]] = [] if num_blocks == 0: return swap_cache @@ -159,21 +159,21 @@ def _allocate_swap_cache( return swap_cache - def swap_in(self, src_to_dst: list[tuple[int, int]]) -> None: + def swap_in(self, src_to_dst: List[Tuple[int, int]]) -> None: for i in range(self.num_layers): for swap_tensor, kv_tensor in zip(self.swap_cache[i], self.kv_cache[i]): self.attn_backend.swap_blocks(swap_tensor, kv_tensor, src_to_dst) - def swap_out(self, src_to_dst: list[tuple[int, int]]) -> None: + def swap_out(self, src_to_dst: List[Tuple[int, int]]) -> None: for i in range(self.num_layers): for swap_tensor, kv_tensor in zip(self.swap_cache[i], self.kv_cache[i]): self.attn_backend.swap_blocks(kv_tensor, swap_tensor, src_to_dst) - def copy(self, src_to_dsts: list[tuple[int, int]]) -> None: + def copy(self, src_to_dsts: List[Tuple[int, int]]) -> None: if (len(src_to_dsts) > 0): self.attn_backend.copy_blocks(self.kv_cache, src_to_dsts) @@ -243,7 +243,7 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. 
self.cache_engine: OpenVINOCacheEngine - self.kv_cache: list[tuple[ov.Tensor, ov.Tensor]] + self.kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] def init_device(self) -> None: self.init_distributed_environment() @@ -253,7 +253,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of blocks available for the KV cache. This determines how many KV blocks can fit into the configured @@ -352,15 +352,15 @@ def _init_cache_engine(self) -> None: key_cache.data[:] = 0 value_cache.data[:] = 0 - def cache_swap_in(self, src_to_dst: list[tuple[int, int]]) -> None: + def cache_swap_in(self, src_to_dst: List[Tuple[int, int]]) -> None: self.cache_engine.swap_in(src_to_dst) - def cache_swap_out(self, src_to_dst: list[tuple[int, int]]) -> None: + def cache_swap_out(self, src_to_dst: List[Tuple[int, int]]) -> None: self.cache_engine.swap_out(src_to_dst) def cache_copy( self, - blocks_to_copy: list[tuple[int, int]], + blocks_to_copy: List[Tuple[int, int]], ) -> None: self.cache_engine.copy(blocks_to_copy) # type: ignore @@ -371,7 +371,7 @@ def get_model(self) -> nn.Module: def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> list[SamplerOutput]: + ) -> List[SamplerOutput]: if execute_model_req is None: seq_group_metadata_list = None else: @@ -384,7 +384,7 @@ def execute_model( blocks_to_copy = execute_model_req.blocks_to_copy blocks_to_swap_in = execute_model_req.blocks_to_swap_in blocks_to_swap_out = execute_model_req.blocks_to_swap_out - data: dict[str, Any] = { + data: Dict[str, Any] = { "num_seq_groups": num_seq_groups, "blocks_to_copy": execute_model_req.blocks_to_copy, "blocks_to_swap_in": execute_model_req.blocks_to_swap_in, @@ -488,7 +488,7 @@ def model_profile_run(): # Profile memory usage with max_num_sequences sequences and the # total # number of tokens equal to max_num_batched_tokens. 
- seqs: list[SequenceGroupMetadata] = [] + seqs: List[SequenceGroupMetadata] = [] for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index 20eb0bf31f9ca..cbd5e2060cad5 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import Any, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union import torch @@ -30,9 +30,9 @@ class ModelInputForGPUWithPoolingMetadata(ModelInputForGPU): class PoolingModelRunner( GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]): - _model_input_cls: type[ModelInputForGPUWithPoolingMetadata] = ( + _model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = ( ModelInputForGPUWithPoolingMetadata) - _builder_cls: type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder + _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder def __init__( self, @@ -48,10 +48,10 @@ def __init__( def execute_model( self, model_input: ModelInputForGPUWithPoolingMetadata, - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[Union[list[PoolerOutput], IntermediateTensors]]: + ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( "PoolingModelRunner does not support multi-step execution.") @@ -151,7 +151,7 @@ def execute_model( def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: dict[str, + tensor_dict: Dict[str, Any]) -> ModelInputForGPUWithPoolingMetadata: return ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict( tensor_dict, @@ -160,9 +160,9 @@ def make_model_input_from_broadcasted_tensor_dict( def prepare_model_input( self, - seq_group_metadata_list: Optional[list[SequenceGroupMetadata]], + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None + finished_requests_ids: Optional[List[str]] = None ) -> ModelInputForGPUWithPoolingMetadata: assert seq_group_metadata_list is not None model_input = self._prepare_model_input_tensors( @@ -177,17 +177,17 @@ def prepare_model_input( def _prepare_pooling( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - prompt_lens: list[int], + seq_group_metadata_list: List[SequenceGroupMetadata], + prompt_lens: List[int], ) -> PoolingMetadata: """Prepare PoolingMetadata for the sequence group metadata list.""" - seq_groups: list[tuple[list[int], PoolingParams]] = [] + seq_groups: List[Tuple[List[int], PoolingParams]] = [] for i, seq_group_metadata in enumerate(seq_group_metadata_list): seq_ids = list(seq_group_metadata.seq_data.keys()) pooling_params = seq_group_metadata.pooling_params seq_groups.append((seq_ids, pooling_params)) - seq_data: dict[int, SequenceData] = {} + seq_data: Dict[int, SequenceData] = {} for seq_group_metadata in seq_group_metadata_list: seq_data.update(seq_group_metadata.seq_data) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 59b410b705277..53541a2579ed5 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -3,7 +3,8 @@ import enum import time from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Callable, Optional, Union 
+from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, + Type, Union) from unittest.mock import patch import numpy as np @@ -59,15 +60,15 @@ class ModelInputForTPU(ModelRunnerInputBase): t: torch.Tensor p: torch.Tensor num_samples: int - n: list[int] - seq_groups: list[list[int]] + n: List[int] + seq_groups: List[List[int]] is_first_multi_step: bool = True is_last_step: bool = True virtual_engine: int = 0 async_callback: Optional[Callable] = None def as_broadcastable_tensor_dict( - self) -> dict[str, Union[int, torch.Tensor]]: + self) -> Dict[str, Union[int, torch.Tensor]]: tensor_dict = { "token_ids": self.token_ids, "position_ids": self.position_ids, @@ -86,8 +87,8 @@ def as_broadcastable_tensor_dict( @classmethod def from_broadcasted_tensor_dict( - cls: type["ModelInputForTPU"], - tensor_dict: dict[str, Any], + cls: Type["ModelInputForTPU"], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "ModelInputForTPU": if attn_backend is not None: @@ -120,7 +121,7 @@ def __init__( self.model_config.is_attention_free, False, ) - self.cached_step_outputs: list[torch.Tensor] = [] + self.cached_step_outputs: List[torch.Tensor] = [] smem_size = 512 * 1024 block_table_size = 4 * self.block_tables.size @@ -166,7 +167,7 @@ def _dummy_run( self, batch_size: int, seq_len: int, - kv_caches: list[tuple[torch.Tensor, torch.Tensor]], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], exec_mode: ExecutionMode, ) -> None: exec_mode = ExecutionMode(exec_mode) @@ -279,7 +280,7 @@ def _dummy_run( def warmup_model( self, - kv_caches: list[tuple[torch.Tensor, torch.Tensor]], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], ) -> None: # Prefill logger.info("Compiling the model with different input shapes...") @@ -346,14 +347,14 @@ def warmup_model( def _prepare_prompt( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - ) -> tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor]: + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor]: assert len(seq_group_metadata_list) > 0 - input_tokens: list[int] = [] - input_positions: list[int] = [] - prompt_lens: list[int] = [] - context_lens: list[int] = [] - slot_mapping: list[int] = [] + input_tokens: List[int] = [] + input_positions: List[int] = [] + prompt_lens: List[int] = [] + context_lens: List[int] = [] + slot_mapping: List[int] = [] for batch_idx, seq_group_metadata in enumerate( seq_group_metadata_list): @@ -438,13 +439,13 @@ def _prepare_prompt( def _prepare_decode( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - ) -> tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor]: + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor]: assert len(seq_group_metadata_list) > 0 - input_tokens: list[list[int]] = [] - input_positions: list[list[int]] = [] - slot_mapping: list[list[int]] = [] - context_lens: list[int] = [] + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + context_lens: List[int] = [] batch_idx = 0 for seq_group_metadata in seq_group_metadata_list: @@ -509,9 +510,9 @@ def _prepare_decode( def _prepare_sample( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], padded_batch_size: int, - ) -> tuple[torch.Tensor, torch.Tensor, list[int]]: + ) -> Tuple[torch.Tensor, 
torch.Tensor, List[int]]: assert len(seq_group_metadata_list) > 0 t = [] p = [] @@ -557,9 +558,9 @@ def _prepare_sample( def prepare_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None, + finished_requests_ids: Optional[List[str]] = None, ) -> ModelInputForTPU: del finished_requests_ids # Unused. assert virtual_engine == 0 @@ -585,7 +586,7 @@ def prepare_model_input( input_lens, t, p, num_samples, n, seq_groups) def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: dict[str, Any]) -> ModelInputForTPU: + self, tensor_dict: Dict[str, Any]) -> ModelInputForTPU: model_input = ModelInputForTPU.from_broadcasted_tensor_dict( tensor_dict, attn_backend=self.attn_backend) return model_input @@ -594,10 +595,10 @@ def make_model_input_from_broadcasted_tensor_dict( def execute_model( self, model_input: ModelInputForTPU, - kv_caches: Optional[list[Any]], + kv_caches: Optional[List[Any]], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> list[SamplerOutput]: + ) -> List[SamplerOutput]: assert intermediate_tensors is None if not model_input.is_first_multi_step: if not model_input.is_last_step: @@ -780,7 +781,7 @@ def forward( t: torch.Tensor, p: torch.Tensor, num_samples: int, - kv_caches: list[tuple[torch.Tensor, torch.Tensor]], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], ) -> torch.Tensor: """Executes the forward pass of the model and samples the next token. @@ -887,8 +888,8 @@ def _apply_top_p(logits: torch.Tensor, p: torch.Tensor) -> torch.Tensor: def _make_decode_output( - next_token_ids: list[int], - seq_groups: list[list[int]], + next_token_ids: List[int], + seq_groups: List[List[int]], ) -> SamplerOutput: zero_logprob = Logprob(0.0) sampler_outputs = [] diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 6392faf6ef92e..7903e81943c24 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Optional, Union +from typing import List, Optional, Tuple, Union import torch import torch_xla.core.xla_model as xm @@ -96,7 +96,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: num_layers = self.model_config.get_num_layers(self.parallel_config) head_size = self.model_config.get_head_size() num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) @@ -157,8 +157,8 @@ def initialize_cache( num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() - self.cpu_cache: list[tuple[torch.Tensor, torch.Tensor]] = [] - self.tpu_cache: list[tuple[torch.Tensor, torch.Tensor]] = [] + self.cpu_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] + self.tpu_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] tpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape( num_gpu_blocks, self.block_size, num_kv_heads, head_size) cpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape( @@ -207,7 +207,7 @@ def do_metadata_broadcast(self) -> bool: return self.parallel_config.tensor_parallel_size > 1 @property - def kv_cache(self) -> Optional[list[list[torch.Tensor]]]: + def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: # NOTE(woosuk): This 
assumes virtual_engine == 0, i.e., no pipeline # parallelism. return [self.tpu_cache] @@ -268,10 +268,10 @@ def execute_worker(self, worker_input: WorkerInput) -> None: def _make_src_to_dst( - mapping: list[tuple[int, int]], + mapping: List[Tuple[int, int]], src_device: Union[torch.device, str], dst_device: Union[torch.device, str], -) -> Optional[tuple[torch.Tensor, torch.Tensor]]: +) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: if not mapping: return None diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d0ecb52eb3c60..ad94a6a4db7a3 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -2,7 +2,7 @@ """A GPU worker class.""" import gc import os -from typing import Optional, Union +from typing import Dict, List, Optional, Set, Tuple, Type, Union import torch import torch.distributed @@ -50,7 +50,7 @@ def __init__( rank: int, distributed_init_method: str, is_driver_worker: bool = False, - model_runner_cls: Optional[type[GPUModelRunnerBase]] = None, + model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None, ) -> None: WorkerBase.__init__(self, vllm_config) self.parallel_config.rank = rank @@ -74,7 +74,7 @@ def __init__( not in ("medusa", "mlp_speculator", "eagle", "deepseek_mtp")) \ else {"return_hidden_states": True} - ModelRunnerClass: type[GPUModelRunnerBase] = ModelRunner + ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner if model_config.runner_type == "pooling": ModelRunnerClass = PoolingModelRunner elif self.model_config.is_encoder_decoder: @@ -90,10 +90,10 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. - self.cache_engine: list[CacheEngine] + self.cache_engine: List[CacheEngine] # Initialize gpu_cache as pooling models don't initialize kv_caches - self.gpu_cache: Optional[list[list[torch.Tensor]]] = None - self._seq_group_metadata_cache: dict[str, SequenceGroupMetadata] = {} + self.gpu_cache: Optional[List[List[torch.Tensor]]] = None + self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} # Torch profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace @@ -202,7 +202,7 @@ def save_tensorized_model( tensorizer_config=tensorizer_config, ) @torch.inference_mode() - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """Profiles the peak memory usage of the model to determine how many KV blocks may be allocated without OOMs. @@ -345,7 +345,7 @@ def do_metadata_broadcast(self) -> bool: return self.parallel_config.tensor_parallel_size > 1 @property - def kv_cache(self) -> Optional[list[list[torch.Tensor]]]: + def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: return self.gpu_cache @torch.inference_mode() @@ -396,9 +396,9 @@ def execute_worker(self, worker_input: WorkerInput) -> None: def _get_cached_seq_group_metadata( self, - seq_group_metadata_list: list[Union[SequenceGroupMetadata, + seq_group_metadata_list: List[Union[SequenceGroupMetadata, SequenceGroupMetadataDelta]], - finished_request_ids: list[str]) -> list[SequenceGroupMetadata]: + finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: """Return a list of cached Sequence Group Metadata after updating its state. 
@@ -439,7 +439,7 @@ def _execute_model_spmd( self, execute_model_req: ExecuteModelRequest, intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: if execute_model_req is not None: new_seq_group_metadata_list = self._get_cached_seq_group_metadata( execute_model_req.seq_group_metadata_list, @@ -460,7 +460,7 @@ def remove_lora(self, lora_id: int) -> bool: def pin_lora(self, lora_id: int) -> bool: return self.model_runner.pin_lora(lora_id) - def list_loras(self) -> set[int]: + def list_loras(self) -> Set[int]: return self.model_runner.list_loras() def add_prompt_adapter( @@ -473,7 +473,7 @@ def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: return self.model_runner.pin_prompt_adapter(prompt_adapter_id) - def list_prompt_adapters(self) -> set[int]: + def list_prompt_adapters(self) -> Set[int]: return self.model_runner.list_prompt_adapters() @property diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 225d2036f5911..7cc1562a5bce5 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -4,7 +4,7 @@ import os import time from abc import abstractmethod -from typing import Any, Optional, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union import cloudpickle import torch @@ -77,7 +77,7 @@ def load_model(self) -> None: def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: raise NotImplementedError def start_worker_execution_loop(self) -> None: @@ -92,14 +92,14 @@ def start_worker_execution_loop(self) -> None: if output is None: return None - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available blocks for the GPU KV cache and swappable CPU KV cache. The implementation may run profiling or other heuristics to determine the size of caches. - Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks + Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks are blocks that are "active" on the device and can be appended to. num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be appended to. 
@@ -121,7 +121,7 @@ def remove_lora(self, lora_id: int) -> bool: def pin_lora(self, lora_id: int) -> bool: raise NotImplementedError - def list_loras(self) -> set[int]: + def list_loras(self) -> Set[int]: raise NotImplementedError @property @@ -150,7 +150,7 @@ def __init__( def init_device(self) -> None: self.worker.init_device() - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: return self.worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, @@ -167,7 +167,7 @@ def get_model(self) -> nn.Module: def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: return self.worker.execute_model(execute_model_req) def get_cache_block_size_bytes(self) -> int: @@ -182,7 +182,7 @@ def remove_lora(self, lora_id: int) -> bool: def pin_lora(self, lora_id: int) -> bool: return self.worker.pin_lora(lora_id) - def list_loras(self) -> set[int]: + def list_loras(self) -> Set[int]: return self.worker.list_loras() def __getattr__(self, attr): @@ -204,7 +204,7 @@ def pin_lora(self, lora_id: int) -> bool: return ValueError( f"{type(self)} does not support LoRA") # type: ignore - def list_loras(self) -> set[int]: + def list_loras(self) -> Set[int]: raise ValueError(f"{type(self)} does not support LoRA") @@ -223,8 +223,8 @@ class WorkerInput: @classmethod def from_broadcasted_tensor_dict( - cls: type["WorkerInput"], - tensor_dict: dict[str, Any], + cls: Type["WorkerInput"], + tensor_dict: Dict[str, Any], ) -> "WorkerInput": """ Pop fields from the given tensor_dict and populate a new instance of @@ -240,7 +240,7 @@ def from_broadcasted_tensor_dict( ) def as_broadcastable_tensor_dict( - self) -> dict[str, Union[int, torch.Tensor]]: + self) -> Dict[str, Union[int, torch.Tensor]]: """ Extract broadcastable fields. """ @@ -282,7 +282,7 @@ def do_metadata_broadcast(self) -> bool: @property @abstractmethod - def kv_cache(self) -> Optional[list[list[torch.Tensor]]]: + def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: """ Gets the list of kv caches to pass to the worker's model runner. Each element in the list is a kv cache corresponding to a particular virtual @@ -311,7 +311,7 @@ def execute_worker(self, worker_input: WorkerInput) -> None: def _get_worker_input_from_broadcast( self - ) -> Optional[tuple[BroadcastableModelInput, WorkerInput, dict[ + ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ str, torch.Tensor]]]: """ Get the worker input from the broadcasted tensor dict. """ assert self.do_metadata_broadcast @@ -331,7 +331,7 @@ def _get_worker_input_from_broadcast( def _get_driver_input_and_broadcast( self, execute_model_req: ExecuteModelRequest - ) -> tuple[BroadcastableModelInput, WorkerInput, dict[str, torch.Tensor]]: + ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: """ Get the driver input and broadcast it to other workers. """ assert self.is_driver_worker @@ -361,7 +361,7 @@ def _get_driver_input_and_broadcast( def prepare_input( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[tuple[BroadcastableModelInput, WorkerInput, dict[ + ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ str, torch.Tensor]]]: """ Prepare the inputs to ModelRunner and workers. 
@@ -386,7 +386,7 @@ def get_model(self) -> nn.Module: def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: """Executes at least one model step on the given sequences, unless no sequences are provided.""" start_time = time.perf_counter() @@ -444,14 +444,14 @@ def execute_model( o.model_execute_time = (orig_model_execute_time + model_execute_time) - # output is list[SamplerOutput] + # output is List[SamplerOutput] return output def _execute_model_spmd( self, execute_model_req: ExecuteModelRequest, intermediate_tensors: Optional[IntermediateTensors] = None - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: """ Execute model in Single Program Multiple Data (SPMD) fashion. All workers take the same request, prepare the input and @@ -521,7 +521,7 @@ def __init__( from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - def adjust_rank(self, rank_mapping: dict[int, int]) -> None: + def adjust_rank(self, rank_mapping: Dict[int, int]) -> None: """ Adjust the rpc_rank based on the given mapping. It is only used during the initialization of the executor, @@ -530,7 +530,7 @@ def adjust_rank(self, rank_mapping: dict[int, int]) -> None: if self.rpc_rank in rank_mapping: self.rpc_rank = rank_mapping[self.rpc_rank] - def update_environment_variables(self, envs_list: list[dict[str, + def update_environment_variables(self, envs_list: List[Dict[str, str]]) -> None: envs = envs_list[self.rpc_rank] key = 'CUDA_VISIBLE_DEVICES' @@ -540,7 +540,7 @@ def update_environment_variables(self, envs_list: list[dict[str, del os.environ[key] update_environment_variables(envs) - def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None: + def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: """ Here we inject some common logic before initializing the worker. Arguments are passed to the worker class constructor. @@ -567,7 +567,7 @@ def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None: self.worker = worker_class(**kwargs) assert self.worker is not None - def initialize_from_config(self, kv_cache_configs: list[Any]) -> None: + def initialize_from_config(self, kv_cache_configs: List[Any]) -> None: kv_cache_config = kv_cache_configs[self.rpc_rank] self.worker.initialize_from_config(kv_cache_config) # type: ignore @@ -598,8 +598,8 @@ def __getattr__(self, attr): def extract_previous_hidden_states( - data: Union[ExecuteModelRequest, dict[str, torch.Tensor]]) -> \ - dict[str, torch.Tensor]: + data: Union[ExecuteModelRequest, Dict[str, torch.Tensor]]) -> \ + Dict[str, torch.Tensor]: """If data contains previous_hidden_states, extract it. This returns a dict which can be used directly as additional kwargs in any following execute_model calls. 
This is used in draft models like EAGLE.""" diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 3fc0c9c10eebd..39957e661c474 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -5,7 +5,8 @@ import weakref from collections import defaultdict from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, + Type, TypeVar) import torch import torch.nn as nn @@ -53,11 +54,11 @@ class ModelInputForXPU(ModelRunnerInputBase): attn_metadata: Optional["AttentionMetadata"] = None multi_modal_kwargs: Optional[BatchedTensorInputs] = None virtual_engine: Optional[int] = None - seq_lens: Optional[list[int]] = None - query_lens: Optional[list[int]] = None + seq_lens: Optional[List[int]] = None + query_lens: Optional[List[int]] = None async_callback: Optional[Callable] = None - def as_broadcastable_tensor_dict(self) -> dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -68,8 +69,8 @@ def as_broadcastable_tensor_dict(self) -> dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( - cls: type[TModelInputForXPU], - tensor_dict: dict[str, Any], + cls: Type[TModelInputForXPU], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> TModelInputForXPU: if attn_backend is not None: @@ -85,7 +86,7 @@ class ModelInputForXPUWithSamplingMetadata(ModelInputForXPU): """ sampling_metadata: Optional["SamplingMetadata"] = None - def as_broadcastable_tensor_dict(self) -> dict[str, Any]: + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -98,7 +99,7 @@ def as_broadcastable_tensor_dict(self) -> dict[str, Any]: @classmethod def from_broadcasted_tensor_dict( cls, - tensor_dict: dict[str, Any], + tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, ) -> "ModelInputForXPUWithSamplingMetadata": tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) @@ -112,7 +113,7 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): def __init__(self, runner: "XPUModelRunner", - finished_requests_ids: Optional[list[str]] = None) -> None: + finished_requests_ids: Optional[List[str]] = None) -> None: super().__init__() self.runner = runner self.model_input_cls = self.runner._model_input_cls @@ -122,8 +123,8 @@ def __init__(self, self.device = self.runner.device def prepare(self, - finished_requests_ids: Optional[list[str]] = None) -> None: - self.seq_group_metadata_list: list[SequenceGroupMetadata] = [] + finished_requests_ids: Optional[List[str]] = None) -> None: + self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) @@ -153,16 +154,16 @@ def build(self) -> ModelInputForXPU: def _prepare_prompt( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - ) -> tuple[torch.Tensor, torch.Tensor, AttentionMetadata, list[int], + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], BatchedTensorInputs]: assert len(seq_group_metadata_list) > 0 - input_tokens: list[int] = [] - input_positions: list[int] = [] - slot_mapping: list[int] = [] - 
seq_lens: list[int] = [] - multi_modal_kwargs_list: list[MultiModalKwargs] = [] - multi_modal_placeholder_maps: dict[ + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + seq_lens: List[int] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -281,14 +282,14 @@ def _prepare_prompt( def _prepare_decode( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - ) -> tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: assert len(seq_group_metadata_list) > 0 - input_tokens: list[int] = [] - input_positions: list[int] = [] - slot_mapping: list[int] = [] - seq_lens: list[int] = [] - block_tables: list[list[int]] = [] + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + seq_lens: List[int] = [] + block_tables: List[List[int]] = [] for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt @@ -366,9 +367,9 @@ def _prepare_decode( class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): - _model_input_cls: type[ModelInputForXPUWithSamplingMetadata] = ( + _model_input_cls: Type[ModelInputForXPUWithSamplingMetadata] = ( ModelInputForXPUWithSamplingMetadata) - _builder_cls: type[ModelInputForXPUBuilder] = ModelInputForXPUBuilder + _builder_cls: Type[ModelInputForXPUBuilder] = ModelInputForXPUBuilder def __init__( self, @@ -440,7 +441,7 @@ def profile_run(self) -> None: # Profile memory usage with max_num_sequences sequences and the total # number of tokens equal to max_num_batched_tokens. - seqs: list[SequenceGroupMetadata] = [] + seqs: List[SequenceGroupMetadata] = [] # Additional GPU memory may be needed for multi-modal encoding, which # needs to be accounted for when calculating the GPU blocks for # vLLM blocker manager. @@ -498,7 +499,7 @@ def profile_run(self) -> None: def make_model_input_from_broadcasted_tensor_dict( self, - tensor_dict: dict[str, + tensor_dict: Dict[str, Any]) -> ModelInputForXPUWithSamplingMetadata: return ( ModelInputForXPUWithSamplingMetadata.from_broadcasted_tensor_dict( @@ -508,8 +509,8 @@ def make_model_input_from_broadcasted_tensor_dict( def _prepare_model_input_tensors( self, - seq_group_metadata_list: list[SequenceGroupMetadata], - finished_requests_ids: Optional[list[str]] = None + seq_group_metadata_list: List[SequenceGroupMetadata], + finished_requests_ids: Optional[List[str]] = None ) -> ModelInputForXPUWithSamplingMetadata: """Helper method to prepare the model input based on a given sequence group. Prepares metadata needed for the base model forward pass but not @@ -525,9 +526,9 @@ def _prepare_model_input_tensors( def prepare_model_input( self, - seq_group_metadata_list: list[SequenceGroupMetadata], + seq_group_metadata_list: List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[list[str]] = None + finished_requests_ids: Optional[List[str]] = None ) -> ModelInputForXPUWithSamplingMetadata: """Prepare the model input based on a given sequence group, including metadata for the sampling step. 
@@ -554,10 +555,10 @@ def prepare_model_input( def execute_model( self, model_input: ModelInputForXPUWithSamplingMetadata, - kv_caches: list[torch.Tensor], + kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[list[SamplerOutput]]: + ) -> Optional[List[SamplerOutput]]: if num_steps > 1: raise ValueError( "XPUModelRunner does not support multi-step execution.") diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index af76419a7e3b2..3aea0d7419d02 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -2,7 +2,7 @@ """A XPU worker class.""" import gc import os -from typing import Optional +from typing import List, Optional, Tuple import intel_extension_for_pytorch # noqa: F401 import oneccl_bindings_for_pytorch # noqa: F401 @@ -64,8 +64,8 @@ def __init__( ) # Uninitialized cache engine. Will be initialized by # initialize_cache. - self.cache_engine: list[CacheEngine] - self.gpu_cache: Optional[list[list[torch.Tensor]]] + self.cache_engine: List[CacheEngine] + self.gpu_cache: Optional[List[List[torch.Tensor]]] def init_device(self) -> None: if self.device_config.device.type == "xpu" and current_platform.is_xpu( @@ -85,7 +85,7 @@ def init_device(self) -> None: # keep this method for `empty_cache` and `synchronize` api @torch.inference_mode() - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """Profiles the peak memory usage of the model to determine how many KV blocks may be allocated without OOMs. From 83d605a91f3a93d7af0aa78efa33f72fb4d570a6 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 28 Feb 2025 11:48:40 +0100 Subject: [PATCH 5/5] Fix new files Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/v1/attention/backends/mla/common.py | 21 ++++++++++---------- vllm/v1/attention/backends/mla/flashmla.py | 14 ++++++------- vllm/v1/attention/backends/mla/triton_mla.py | 8 ++++---- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 30bce5cc8b68c..824ffcfd61ba2 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -195,8 +195,7 @@ import functools from abc import abstractmethod from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple, - Type, TypeVar) +from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar import torch from compressed_tensors.quantization import QuantizationStrategy @@ -250,11 +249,11 @@ def get_name() -> str: return "TRITON_MLA_VLLM_V1" @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return MLACommonMetadata @staticmethod - def get_builder_cls() -> Type["MLACommonMetadataBuilder"]: + def get_builder_cls() -> type["MLACommonMetadataBuilder"]: return MLACommonMetadataBuilder @staticmethod @@ -263,11 +262,11 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, # assumed to be 1 for MLA head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (num_blocks, block_size, head_size) @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return [576] @staticmethod @@ -317,8 +316,8 @@ class MLACommonMetadata: has_context: bool = False context_chunk_cu_seq_lens: 
Optional[torch.Tensor] = None context_chunk_starts: Optional[torch.Tensor] = None - context_chunk_seq_tot: Optional[List[int]] = None - context_chunk_max_seq_lens: Optional[List[int]] = None + context_chunk_seq_tot: Optional[list[int]] = None + context_chunk_max_seq_lens: Optional[list[int]] = None chunked_prefill_workspace: Optional[torch.Tensor] = None def __post_init__(self): @@ -538,10 +537,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], + blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, # MLA Specific Arguments @@ -634,7 +633,7 @@ def process_weights_after_loading(self, act_dtype: torch.dtype): # # returns input_group_shape, weight_group_shape def get_scale_group_shapes_for_fp8(layer: LinearBase) -> \ - Tuple[Tuple[int, int], Tuple[int, int]]: + tuple[tuple[int, int], tuple[int, int]]: if isinstance(layer.quant_method, Fp8LinearMethod): if layer.quant_method.block_quant: weight_block_size = \ diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 8a7b7b974e364..b357d71424104 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import torch @@ -25,21 +25,21 @@ def get_name() -> str: return "FLASHMLA_VLLM_V1" @staticmethod - def get_metadata_cls() -> Type["FlashMLAMetadata"]: + def get_metadata_cls() -> type["FlashMLAMetadata"]: return FlashMLAMetadata @staticmethod - def get_builder_cls() -> Type["FlashMLAMetadataBuilder"]: + def get_builder_cls() -> type["FlashMLAMetadataBuilder"]: return FlashMLAMetadataBuilder @staticmethod - def get_impl_cls() -> Type["FlashMLAImpl"]: + def get_impl_cls() -> type["FlashMLAImpl"]: return FlashMLAImpl @dataclass class FlashMLAMetadata(MLACommonMetadata): - decode_tile_scheduler_metadata: Optional[Tuple[torch.Tensor, + decode_tile_scheduler_metadata: Optional[tuple[torch.Tensor, torch.Tensor]] = None decode_num_splits: Optional[torch.Tensor] = None @@ -76,10 +76,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], + blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, # MLA Specific Arguments diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index 7747509f1a4bf..3f9b349a5f04f 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional, Type +from typing import Any, Optional import torch @@ -21,7 +21,7 @@ def get_name() -> str: return "TRITON_MLA_VLLM_V1" @staticmethod - def get_impl_cls() -> Type["TritonMLAImpl"]: + def get_impl_cls() -> type["TritonMLAImpl"]: return TritonMLAImpl @@ -33,10 +33,10 @@ def __init__( head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - 
blocksparse_params: Optional[Dict[str, Any]], + blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, # MLA Specific Arguments