
vllm.renderers

Modules:

- base
- hf
- inputs
- params
- registry

BaseRenderer

Bases: ABC, Generic[_T]

Source code in vllm/renderers/base.py
class BaseRenderer(ABC, Generic[_T]):
    def __init__(self, config: "VllmConfig", tokenizer: _T | None) -> None:
        super().__init__()

        self.config = config
        self.model_config = config.model_config
        self.api_process_rank = config.parallel_config._api_process_rank

        self.tokenizer = tokenizer

        # Shared thread pool executor for blocking tokenizer and
        # multimodal preprocessing operations.  The multimodal processor
        # receives a deep-copied tokenizer (see #36557) so it is safe to
        # run tokenization and MM preprocessing concurrently.
        pool_workers = config.model_config.renderer_num_workers
        self._executor = ThreadPoolExecutor(max_workers=pool_workers)

        # Multimodal preprocessing is always offloaded to the thread pool
        # to keep the asyncio event loop responsive under concurrent load.
        self._mm_executor: Executor = self._executor

        # Lazy initialization since offline LLM doesn't use async
        self._async_tokenizer: AsyncMicrobatchTokenizer | None = None

        self.mm_processor: BaseMultiModalProcessor | None = None
        self._readonly_mm_processor: BaseMultiModalProcessor | None = None
        self._mm_cache_stats: MultiModalCacheStats | None = None
        self._clear_mm_cache_async = make_async(
            self.clear_mm_cache, executor=self._executor
        )
        self._process_multimodal_async = make_async(
            self._process_multimodal, executor=self._mm_executor
        )
        if config.model_config.is_multimodal_model:
            mm_processor_cache = mm_registry.processor_cache_from_config(config)

            # Deep-copy the tokenizer so the multimodal processor gets its
            # own Rust tokenizer backend.  Without this, concurrent access
            # from AsyncMicrobatchTokenizer and call_hf_processor causes
            # "RuntimeError: Already borrowed" from the Rust RefCell.
            # See: https://github.com/huggingface/tokenizers/issues/537
            mm_tokenizer = copy.deepcopy(tokenizer)

            with set_default_torch_num_threads():
                self.mm_processor = mm_registry.create_processor(
                    config.model_config,
                    tokenizer=mm_tokenizer,
                    cache=mm_processor_cache,
                )

            if mm_processor_cache:
                self._mm_cache_stats = MultiModalCacheStats()

            # A second processor with its own processor-only cache.
            # Used by the tokenize endpoint so that tokenize-only
            # requests don't pollute the sender cache.
            ro_cache = mm_registry.processor_only_cache_from_config(config)
            if ro_cache is not None:
                ro_tokenizer = copy.deepcopy(tokenizer)
                with set_default_torch_num_threads():
                    self._readonly_mm_processor = mm_registry.create_processor(
                        config.model_config,
                        tokenizer=ro_tokenizer,
                        cache=ro_cache,
                    )

            # This is used to generate internal request ID for MM processing
            # It has no relation to the request ID for engine core
            self._mm_req_counter = AtomicCounter()
            self._mm_timing_registry = MultiModalTimingRegistry(
                config.observability_config
            )

    def get_tokenizer(self) -> _T:
        tokenizer = self.tokenizer
        if tokenizer is None:
            raise ValueError("Tokenizer not available when `skip_tokenizer_init=True`")

        return tokenizer

    def get_async_tokenizer(self) -> AsyncMicrobatchTokenizer:
        if self._async_tokenizer is None:
            self._async_tokenizer = AsyncMicrobatchTokenizer(
                self.get_tokenizer(), executor=self._executor
            )

        return self._async_tokenizer

    def get_mm_processor(self) -> "BaseMultiModalProcessor":
        if self.mm_processor is None:
            raise ValueError("Multi-modal processor not available for text-only models")

        return self.mm_processor

    @property
    def mm_processor_cache(self) -> "BaseMultiModalProcessorCache | None":
        if self.mm_processor is None:
            return None

        return self.mm_processor.cache

    def stat_mm_cache(self) -> MultiModalCacheStats | None:
        mm_cache_stats = self._mm_cache_stats
        if mm_cache_stats is None:
            return None

        self._mm_cache_stats = MultiModalCacheStats()

        return mm_cache_stats

    def update_mm_cache_stats(self) -> None:
        mm_processor_cache = self.mm_processor_cache
        mm_cache_stats = self._mm_cache_stats

        if mm_processor_cache and mm_cache_stats:
            delta = mm_processor_cache.make_stats(delta=True)
            mm_cache_stats.record(delta.total, delta.hits)

    def clear_mm_cache(self) -> None:
        mm_processor_cache = self.mm_processor_cache
        if mm_processor_cache is not None:
            mm_processor_cache.clear_cache()

        if self._mm_cache_stats is not None:
            self._mm_cache_stats.reset = True

    def warmup(self, chat_params: ChatParams) -> None:
        """
        Warm up this renderer to avoid first-request latency.

        For chat requests:
        - Jinja2 template compilation
        """
        from vllm.entrypoints.chat_utils import ChatTemplateResolutionError

        try:
            logger.debug("Warming up chat template processing...")
            start_time = time.perf_counter()

            self.render_chat([[{"role": "user", "content": "warmup"}]], chat_params)

            elapsed = time.perf_counter() - start_time
            logger.debug("Chat template warmup completed in %.3fs", elapsed)
        except ChatTemplateResolutionError:
            logger.debug("This model does not support chat template.")
        except Exception:
            logger.warning("Chat template warmup failed", exc_info=True)

        if self.mm_processor:
            from vllm.multimodal.processing import TimingContext

            model_config = self.model_config
            mm_config = model_config.get_multimodal_config()
            processor = self.mm_processor
            mm_limits = processor.info.allowed_mm_limits

            try:
                logger.debug("Warming up multi-modal processing...")
                start_time = time.perf_counter()

                processor_inputs = processor.dummy_inputs.get_dummy_processor_inputs(
                    seq_len=model_config.max_model_len,
                    mm_counts=dict.fromkeys(mm_limits, 1),
                    mm_options=mm_config.limit_per_prompt,
                )
                _ = processor.apply(
                    processor_inputs, timing_ctx=TimingContext(enabled=False)
                )

                elapsed = time.perf_counter() - start_time
                logger.info("Multi-modal warmup completed in %.3fs", elapsed)
            except Exception:
                logger.warning("Multi-modal warmup failed")
            finally:
                self.clear_mm_cache()

    async def clear_mm_cache_async(self) -> None:
        """Serialize clear_mm_cache through the shared executor to avoid
        races with concurrent process_inputs on the mm_processor_cache."""
        await self._clear_mm_cache_async()

    def shutdown(self) -> None:
        mm_processor_cache = self.mm_processor_cache
        if mm_processor_cache is not None:
            mm_processor_cache.close()

        if executor := getattr(self, "_executor", None):
            executor.shutdown(wait=False)

        if (
            mm_executor := getattr(self, "_mm_executor", None)
        ) is not None and mm_executor is not executor:
            mm_executor.shutdown(wait=False)

    def get_bos_token_id(self) -> int | None:
        if self.tokenizer is None:
            logger.warning_once(
                "Using None for BOS token id because tokenizer is not initialized"
            )
            return None

        return self.tokenizer.bos_token_id

    def get_eos_token_id(self) -> int | None:
        if self.tokenizer is None:
            logger.warning_once(
                "Using None for EOS token id because tokenizer is not initialized"
            )
            return None

        return self.tokenizer.eos_token_id

    def get_dec_start_token_id(self) -> int:
        """
        Obtain the decoder start token id employed by an encoder/decoder model,
        raising an error if it is not available.
        """
        dec_start_token_id = getattr(
            self.model_config.hf_config, "decoder_start_token_id", None
        )

        if dec_start_token_id is None:
            logger.warning_once(
                "Falling back on <BOS> for decoder start token id "
                "because decoder start token id is not available."
            )
            dec_start_token_id = self.get_bos_token_id()

        if dec_start_token_id is None:
            raise RuntimeError("Cannot find decoder start token id or <BOS>")

        return dec_start_token_id

    @cached_property
    def default_cmpl_tok_params(self) -> TokenizeParams:
        mm_processor = self.mm_processor
        if mm_processor is not None:
            return mm_processor.info.default_tok_params

        model_config = self.model_config
        encoder_config = model_config.encoder_config or {}

        return TokenizeParams(
            max_total_tokens=model_config.max_model_len,
            do_lower_case=encoder_config.get("do_lower_case", False),
            add_special_tokens=True,
        )

    @cached_property
    def default_chat_tok_params(self) -> TokenizeParams:
        mm_processor = self.mm_processor
        if mm_processor is not None:
            return mm_processor.info.default_tok_params

        model_config = self.model_config
        encoder_config = model_config.encoder_config or {}

        return TokenizeParams(
            max_total_tokens=model_config.max_model_len,
            do_lower_case=encoder_config.get("do_lower_case", False),
            add_special_tokens=False,
        )

    # Step 1: Convert raw inputs to prompts
    def render_prompt(
        self,
        prompt: DictPrompt | bytes,
    ) -> DictPrompt:
        if isinstance(prompt, bytes):
            embeds = safe_load_prompt_embeds(self.model_config, prompt)
            prompt = EmbedsPrompt(prompt_embeds=embeds)

        return prompt

    def render_prompts(
        self,
        prompts: Sequence[DictPrompt | bytes],
    ) -> list[DictPrompt]:
        if len(prompts) == 0:
            raise ValueError("You must pass at least one prompt")

        return [self.render_prompt(prompt) for prompt in prompts]

    async def render_prompts_async(
        self,
        prompts: Sequence[DictPrompt | bytes],
    ) -> list[DictPrompt]:
        return self.render_prompts(prompts)

    @abstractmethod
    def render_messages(
        self,
        messages: list["ChatCompletionMessageParam"],
        params: ChatParams,
    ) -> tuple[list["ConversationMessage"], DictPrompt]:
        raise NotImplementedError

    async def render_messages_async(
        self,
        messages: list["ChatCompletionMessageParam"],
        params: ChatParams,
    ) -> tuple[list["ConversationMessage"], DictPrompt]:
        return self.render_messages(messages, params)

    # Step 2: Tokenize prompts if necessary
    def _tokenize_prompt(
        self,
        prompt: TextPrompt,
        params: TokenizeParams,
    ) -> TokensPrompt:
        tokenizer = self.get_tokenizer()
        prompt_token_ids = tokenizer.encode(
            prompt["prompt"],
            **params.get_encode_kwargs(),
        )

        return TokensPrompt(prompt_token_ids=prompt_token_ids, **prompt)

    async def _tokenize_prompt_async(
        self,
        prompt: TextPrompt,
        params: TokenizeParams,
    ) -> TokensPrompt:
        tokenizer = self.get_async_tokenizer()
        prompt_token_ids = await tokenizer.encode(
            prompt["prompt"],
            **params.get_encode_kwargs(),
        )

        return TokensPrompt(prompt_token_ids=prompt_token_ids, **prompt)

    def _detokenize_prompt(self, prompt: TokensPrompt) -> TokensPrompt:
        tokenizer = self.get_tokenizer()
        prompt["prompt"] = tokenizer.decode(prompt["prompt_token_ids"])

        return prompt

    async def _detokenize_prompt_async(self, prompt: TokensPrompt) -> TokensPrompt:
        tokenizer = self.get_async_tokenizer()
        prompt["prompt"] = await tokenizer.decode(prompt["prompt_token_ids"])

        return prompt

    @overload
    def _tokenize_singleton_prompt(
        self,
        prompt: TextPrompt | TokensPrompt,
        params: TokenizeParams,
    ) -> TokensPrompt: ...

    @overload
    def _tokenize_singleton_prompt(  # type: ignore[misc]
        self,
        prompt: EmbedsPrompt,
        params: TokenizeParams,
    ) -> EmbedsPrompt: ...

    def _tokenize_singleton_prompt(
        self,
        prompt: SingletonDictPrompt,
        params: TokenizeParams,
    ) -> SingletonTokPrompt:
        if "prompt_token_ids" not in prompt and "prompt_embeds" not in prompt:
            prompt = params.apply_pre_tokenization(self.tokenizer, prompt)  # type: ignore[arg-type]
            prompt = self._tokenize_prompt(prompt, params)

        if params.needs_detokenization and "prompt" not in prompt:
            if "prompt_token_ids" not in prompt:
                raise RuntimeError("Cannot run detokenization on embeddings")

            prompt = self._detokenize_prompt(prompt)  # type: ignore[arg-type]

        return params.apply_post_tokenization(self.tokenizer, prompt)  # type: ignore[arg-type]

    @overload
    async def _tokenize_singleton_prompt_async(
        self,
        prompt: TextPrompt | TokensPrompt,
        params: TokenizeParams,
    ) -> TokensPrompt: ...

    @overload
    async def _tokenize_singleton_prompt_async(  # type: ignore[misc]
        self,
        prompt: EmbedsPrompt,
        params: TokenizeParams,
    ) -> EmbedsPrompt: ...

    async def _tokenize_singleton_prompt_async(
        self,
        prompt: SingletonDictPrompt,
        params: TokenizeParams,
    ) -> SingletonTokPrompt:
        if "prompt_token_ids" not in prompt and "prompt_embeds" not in prompt:
            prompt = params.apply_pre_tokenization(self.tokenizer, prompt)  # type: ignore[arg-type]
            prompt = await self._tokenize_prompt_async(prompt, params)

        if params.needs_detokenization and "prompt" not in prompt:
            if "prompt_token_ids" not in prompt:
                raise RuntimeError("Cannot run detokenization on embeddings")

            prompt = await self._detokenize_prompt_async(prompt)  # type: ignore[arg-type]

        return params.apply_post_tokenization(self.tokenizer, prompt)  # type: ignore[arg-type]

    def _tokenize_enc_dec_prompt(
        self,
        prompt: EncoderDecoderDictPrompt,
        params: TokenizeParams,
    ) -> EncoderDecoderTokPrompt:
        enc_prompt, dec_prompt = (
            self._tokenize_singleton_prompt(prompt["encoder_prompt"], params),
            (
                None
                if prompt["decoder_prompt"] is None
                else self._tokenize_singleton_prompt(prompt["decoder_prompt"], params)
            ),
        )

        return EncoderDecoderTokPrompt(
            encoder_prompt=enc_prompt,
            decoder_prompt=dec_prompt,
        )

    async def _tokenize_enc_dec_prompt_async(
        self,
        prompt: EncoderDecoderDictPrompt,
        params: TokenizeParams,
    ) -> EncoderDecoderTokPrompt:
        enc_prompt, dec_prompt = await asyncio.gather(
            self._tokenize_singleton_prompt_async(prompt["encoder_prompt"], params),
            (
                asyncio.sleep(0)
                if prompt["decoder_prompt"] is None
                else self._tokenize_singleton_prompt_async(
                    prompt["decoder_prompt"], params
                )
            ),
        )

        return EncoderDecoderTokPrompt(
            encoder_prompt=enc_prompt,
            decoder_prompt=dec_prompt,
        )

    def tokenize_prompt(
        self,
        prompt: DictPrompt,
        params: TokenizeParams,
    ) -> TokPrompt:
        if "encoder_prompt" in prompt:
            return self._tokenize_enc_dec_prompt(prompt, params)  # type: ignore[arg-type]

        return self._tokenize_singleton_prompt(prompt, params)

    def tokenize_prompts(
        self,
        prompts: Sequence[DictPrompt],
        params: TokenizeParams,
    ) -> list[TokPrompt]:
        return [self.tokenize_prompt(prompt, params) for prompt in prompts]

    async def tokenize_prompt_async(
        self,
        prompt: DictPrompt,
        params: TokenizeParams,
    ) -> TokPrompt:
        if "encoder_prompt" in prompt:
            return await self._tokenize_enc_dec_prompt_async(prompt, params)  # type: ignore[arg-type]

        return await self._tokenize_singleton_prompt_async(prompt, params)

    async def tokenize_prompts_async(
        self,
        prompts: Sequence[DictPrompt],
        params: TokenizeParams,
    ) -> list[TokPrompt]:
        return await asyncio.gather(
            *(self.tokenize_prompt_async(prompt, params) for prompt in prompts)
        )

    # Step 3: Add extra keys to the prompts
    def _apply_prompt_extras(
        self,
        prompts: Sequence[TokPrompt],
        prompt_extras: dict[str, Any] | None,
    ):
        if not prompt_extras:
            return

        for prompt in prompts:
            target_prompt = extract_target_prompt(self.model_config, prompt)
            target_prompt.update(prompt_extras)  # type: ignore[arg-type]

    # Step 4: Convert to engine inputs
    def _validate_mm_uuids(
        self,
        mm_data: MultiModalDataDict,
        mm_data_items: MultiModalDataItems,
        mm_uuid_items: MultiModalUUIDItems,
    ) -> None:
        # NOTE: Keys corresponding to `None` in `mm_data` don't appear in
        # `mm_data_items`
        modalities = mm_data.keys() | mm_uuid_items.keys()

        for modality in modalities:
            data_items = mm_data_items.get(modality)
            uuid_items = mm_uuid_items.get(modality)

            if data_items is None:
                if uuid_items is None:
                    raise ValueError(
                        f"multi_modal_data[{modality!r}] is empty but "
                        f"multi_modal_uuids[{modality!r}] is missing."
                    )

            elif uuid_items is not None:
                if len(data_items) != len(uuid_items):
                    raise ValueError(
                        f"If given, multi_modal_uuids[{modality!r}] must have "
                        f"same length as multi_modal_data[{modality!r}], but "
                        f"got {len(uuid_items)} vs {len(data_items)}."
                    )

                for i, item in enumerate(data_items):
                    if item is None and uuid_items[i] is None:
                        raise ValueError(
                            f"multi_modal_data[{modality!r}][{i}] is empty but "
                            f"multi_modal_uuids[{modality!r}][{i}] is missing."
                        )

    def _process_mm_uuids(
        self,
        mm_data: MultiModalDataDict,
        mm_data_items: MultiModalDataItems,
        mm_uuid_items: MultiModalUUIDItems,
        mm_req_id: str,
    ) -> MultiModalUUIDItems:
        model_config = self.model_config

        # NOTE: When users explicitly turn off BOTH prefix caching and input
        # processing caching, no multimodal features or embeddings will be
        # reused across requests, therefore identifying multimodal data items
        # by their content is no longer necessary, and we create uuids with
        # `<mm_req_id>-<modality>-<index>`, overriding even user-provided ones.
        if (
            model_config.multimodal_config
            and model_config.multimodal_config.mm_processor_cache_gb == 0
            and not self.config.cache_config.enable_prefix_caching
        ):
            mm_uuid_items = {
                modality: [f"{mm_req_id}-{modality}-{i}" for i in range(data_count)]
                for modality, data_count in mm_data_items.get_all_counts().items()
            }

        self._validate_mm_uuids(mm_data, mm_data_items, mm_uuid_items)

        return mm_uuid_items

    # TODO: Remove str and tokenization_kwargs after deprecating InputPreprocessor
    def _process_multimodal(
        self,
        prompt: list[int] | str,
        mm_data: MultiModalDataDict,
        mm_uuids: MultiModalUUIDDict | None,
        mm_processor_kwargs: Mapping[str, object] | None,
        tokenization_kwargs: dict[str, Any] | None,
        *,
        skip_mm_cache: bool = False,
    ) -> "MultiModalInput":
        mm_req_id = f"renderer{self.api_process_rank}-mm-{self._mm_req_counter.inc(1)}"

        if skip_mm_cache and self._readonly_mm_processor is not None:
            mm_processor = self._readonly_mm_processor
        else:
            mm_processor = self.get_mm_processor()

        mm_data_items = mm_processor.info.parse_mm_data(mm_data)
        mm_uuid_items = parse_mm_uuids(mm_uuids)

        mm_uuid_items = self._process_mm_uuids(
            mm_data, mm_data_items, mm_uuid_items, mm_req_id
        )

        mm_processor_inputs = MMProcessorInputs(
            prompt,
            mm_data_items,
            mm_uuid_items,
            hf_processor_mm_kwargs=mm_processor_kwargs or {},
            tokenization_kwargs=tokenization_kwargs or {},
        )
        mm_timing_ctx = self._mm_timing_registry.get(mm_req_id)

        with set_default_torch_num_threads():
            mm_inputs = mm_processor.apply(mm_processor_inputs, mm_timing_ctx)

        self.update_mm_cache_stats()

        return mm_inputs

    def _process_tokens(
        self,
        prompt: TokensPrompt,
        *,
        skip_mm_cache: bool = False,
    ) -> TokensInput | MultiModalInput:
        """Process token inputs, with multimodal preprocessing offloaded
        to the shared thread pool in the async variant.
        """
        prompt_token_ids = prompt["prompt_token_ids"]

        engine_input: TokensInput | MultiModalInput
        if multi_modal_data := prompt.get("multi_modal_data"):
            engine_input = self._process_multimodal(
                prompt_token_ids,
                multi_modal_data,
                mm_processor_kwargs=prompt.get("mm_processor_kwargs"),
                tokenization_kwargs=None,  # Tokenization already done in Step 2
                mm_uuids=prompt.get("multi_modal_uuids"),
                skip_mm_cache=skip_mm_cache,
            )
        else:
            engine_input = tokens_input(prompt_token_ids)

        if prompt_text := prompt.get("prompt"):
            engine_input["prompt"] = prompt_text
        if cache_salt := prompt.get("cache_salt"):
            engine_input["cache_salt"] = cache_salt

        return engine_input

    def _process_embeds(self, prompt: EmbedsPrompt) -> EmbedsInput:
        if not self.model_config.enable_prompt_embeds:
            raise ValueError(
                "You must set `--enable-prompt-embeds` to input `prompt_embeds`."
            )

        prompt_embeds = prompt["prompt_embeds"]

        # prompt_embeds must be (seq_len, hidden_size), but if the user
        # passes in a batch of size 1, i.e. (1, seq_len, hidden_size),
        # we can unambiguously process the intent by squeezing the batch
        # dimension.
        if prompt_embeds.ndim == 3:
            prompt_embeds = prompt_embeds.squeeze(dim=0)

        if prompt_embeds.ndim != 2:
            raise ValueError("prompt_embeds must be of shape (seq_len, hidden_size).")

        # Tensors must be on CPU for serialization between processes
        # in the MsgpackEncoder. Casting to CPU here ensures that there is no
        # hidden device transfer in the critical path of generation.
        prompt_embeds = prompt_embeds.cpu()

        return embeds_input(
            prompt_embeds=prompt_embeds,
            cache_salt=prompt.get("cache_salt"),
        )

    async def _process_tokens_async(
        self,
        prompt: TokensPrompt,
        *,
        skip_mm_cache: bool = False,
    ) -> TokensInput | MultiModalInput:
        prompt_token_ids = prompt["prompt_token_ids"]

        engine_input: TokensInput | MultiModalInput
        if multi_modal_data := prompt.get("multi_modal_data"):
            engine_input = await self._process_multimodal_async(
                prompt_token_ids,
                multi_modal_data,
                mm_processor_kwargs=prompt.get("mm_processor_kwargs"),
                tokenization_kwargs=None,
                mm_uuids=prompt.get("multi_modal_uuids"),
                skip_mm_cache=skip_mm_cache,
            )
        else:
            engine_input = tokens_input(prompt_token_ids)

        if prompt_text := prompt.get("prompt"):
            engine_input["prompt"] = prompt_text
        if cache_salt := prompt.get("cache_salt"):
            engine_input["cache_salt"] = cache_salt

        return engine_input

    def _process_singleton(
        self,
        prompt: SingletonTokPrompt,
        *,
        skip_mm_cache: bool = False,
    ) -> SingletonInput:
        if "prompt_embeds" in prompt:
            return self._process_embeds(prompt)  # type: ignore[arg-type]

        return self._process_tokens(prompt, skip_mm_cache=skip_mm_cache)  # type: ignore[arg-type]

    async def _process_singleton_async(
        self,
        prompt: SingletonTokPrompt,
        *,
        skip_mm_cache: bool = False,
    ) -> SingletonInput:
        if "prompt_embeds" in prompt:
            return self._process_embeds(prompt)  # type: ignore[arg-type]

        return await self._process_tokens_async(prompt, skip_mm_cache=skip_mm_cache)  # type: ignore[arg-type]

    def _process_enc_dec(
        self,
        prompt: EncoderDecoderTokPrompt,
        *,
        skip_mm_cache: bool = False,
    ) -> EncoderDecoderInput:
        enc_prompt = prompt["encoder_prompt"]
        dec_prompt = prompt["decoder_prompt"]

        skip_decoder_start_token = False
        if self.mm_processor is not None:
            from vllm.multimodal.processing import EncDecMultiModalProcessor

            if isinstance(self.mm_processor, EncDecMultiModalProcessor):
                skip_decoder_start_token = self.mm_processor.skip_decoder_start_token

        return build_enc_dec_input(
            encoder_input=self._process_singleton(
                enc_prompt, skip_mm_cache=skip_mm_cache
            ),
            decoder_input=(
                None
                if dec_prompt is None
                else self._process_singleton(dec_prompt, skip_mm_cache=skip_mm_cache)
            ),
            decoder_start_token_id=self.get_dec_start_token_id(),
            skip_decoder_start_token=skip_decoder_start_token,
        )

    async def _process_enc_dec_async(
        self,
        prompt: EncoderDecoderTokPrompt,
        *,
        skip_mm_cache: bool = False,
    ) -> EncoderDecoderInput:
        enc_prompt = prompt["encoder_prompt"]
        dec_prompt = prompt["decoder_prompt"]

        encoder_input, decoder_input = await asyncio.gather(
            self._process_singleton_async(enc_prompt, skip_mm_cache=skip_mm_cache),
            (
                asyncio.sleep(0)
                if dec_prompt is None
                else self._process_singleton_async(
                    dec_prompt, skip_mm_cache=skip_mm_cache
                )
            ),
        )

        return build_enc_dec_input(
            encoder_input=encoder_input,
            decoder_input=decoder_input,
            decoder_start_token_id=self.get_dec_start_token_id(),
        )

    def process_for_engine(
        self,
        prompt: TokPrompt,
        arrival_time: float,
        *,
        skip_mm_cache: bool = False,
    ) -> EngineInput:
        engine_input: EngineInput
        if "encoder_prompt" in prompt:
            engine_input = self._process_enc_dec(prompt, skip_mm_cache=skip_mm_cache)  # type: ignore[arg-type]
        else:
            engine_input = self._process_singleton(prompt, skip_mm_cache=skip_mm_cache)

        engine_input["arrival_time"] = arrival_time

        return engine_input

    async def process_for_engine_async(
        self,
        prompt: TokPrompt,
        arrival_time: float,
        *,
        skip_mm_cache: bool = False,
    ) -> EngineInput:
        engine_input: EngineInput
        if "encoder_prompt" in prompt:
            engine_input = await self._process_enc_dec_async(
                prompt,  # type: ignore[arg-type]
                skip_mm_cache=skip_mm_cache,
            )
        else:
            engine_input = await self._process_singleton_async(
                prompt, skip_mm_cache=skip_mm_cache
            )

        engine_input["arrival_time"] = arrival_time

        return engine_input

    # Top-level methods
    def render_cmpl(
        self,
        prompts: Sequence[DictPrompt | bytes],
        tok_params: TokenizeParams | None = None,
        *,
        prompt_extras: dict[str, Any] | None = None,
        skip_mm_cache: bool = False,
    ):
        arrival_time = time.time()

        if tok_params is None:
            tok_params = self.default_cmpl_tok_params

        dict_prompts = self.render_prompts(prompts)
        tok_prompts = self.tokenize_prompts(dict_prompts, tok_params)

        self._apply_prompt_extras(tok_prompts, prompt_extras)

        return [
            self.process_for_engine(prompt, arrival_time, skip_mm_cache=skip_mm_cache)
            for prompt in tok_prompts
        ]

    async def render_cmpl_async(
        self,
        prompts: Sequence[DictPrompt | bytes],
        tok_params: TokenizeParams | None = None,
        *,
        prompt_extras: dict[str, Any] | None = None,
        skip_mm_cache: bool = False,
    ):
        arrival_time = time.time()

        if tok_params is None:
            tok_params = self.default_cmpl_tok_params

        dict_prompts = await self.render_prompts_async(prompts)
        tok_prompts = await self.tokenize_prompts_async(dict_prompts, tok_params)

        self._apply_prompt_extras(tok_prompts, prompt_extras)

        return await asyncio.gather(
            *(
                self.process_for_engine_async(
                    p, arrival_time, skip_mm_cache=skip_mm_cache
                )
                for p in tok_prompts
            )
        )

    def render_chat(
        self,
        conversations: Sequence[list["ChatCompletionMessageParam"]],
        chat_params: ChatParams,
        tok_params: TokenizeParams | None = None,
        *,
        prompt_extras: dict[str, Any] | None = None,
        skip_mm_cache: bool = False,
    ):
        arrival_time = time.time()

        if tok_params is None:
            tok_params = self.default_chat_tok_params

        rendered = [
            self.render_messages(conversation, chat_params)
            for conversation in conversations
        ]

        out_conversations = list[list["ConversationMessage"]]()
        dict_prompts = list[DictPrompt]()
        for conv, prompt in rendered:
            out_conversations.append(conv)
            dict_prompts.append(prompt)

        tok_prompts = self.tokenize_prompts(dict_prompts, tok_params)

        self._apply_prompt_extras(tok_prompts, prompt_extras)

        eng_prompts = [
            self.process_for_engine(prompt, arrival_time, skip_mm_cache=skip_mm_cache)
            for prompt in tok_prompts
        ]

        return out_conversations, eng_prompts

    async def render_chat_async(
        self,
        conversations: Sequence[list["ChatCompletionMessageParam"]],
        chat_params: ChatParams,
        tok_params: TokenizeParams | None = None,
        *,
        prompt_extras: dict[str, Any] | None = None,
        skip_mm_cache: bool = False,
    ):
        arrival_time = time.time()

        if tok_params is None:
            tok_params = self.default_chat_tok_params

        rendered = [
            self.render_messages_async(conversation, chat_params)
            for conversation in conversations
        ]

        out_conversations = list[list["ConversationMessage"]]()
        dict_prompts = list[DictPrompt]()
        for conv, prompt in await asyncio.gather(*rendered):
            out_conversations.append(conv)
            dict_prompts.append(prompt)

        tok_prompts = await self.tokenize_prompts_async(dict_prompts, tok_params)

        self._apply_prompt_extras(tok_prompts, prompt_extras)

        eng_prompts = await asyncio.gather(
            *(
                self.process_for_engine_async(
                    p, arrival_time, skip_mm_cache=skip_mm_cache
                )
                for p in tok_prompts
            )
        )

        return out_conversations, eng_prompts
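
The top-level entry points tie the pipeline together: Step 1 converts raw inputs to dict prompts, Step 2 tokenizes them, Step 3 applies extra keys, and Step 4 builds the engine inputs. The sketch below is illustrative only and not part of vllm/renderers/base.py; it assumes `renderer` is an already-constructed concrete BaseRenderer subclass with a tokenizer available, and that plain TextPrompt dicts are accepted as DictPrompt inputs.

# Illustrative sketch, assuming `renderer` is a concrete BaseRenderer instance.
from vllm.renderers.params import ChatParams, TokenizeParams

# Completion-style rendering: dict prompts in, engine inputs out.
prompts = [{"prompt": "Hello, world!"}]
tok_params = TokenizeParams(max_total_tokens=4096, max_output_tokens=256)
engine_inputs = renderer.render_cmpl(prompts, tok_params)

# Chat-style rendering: returns the parsed conversations alongside
# the engine inputs.
conversations = [[{"role": "user", "content": "Hi!"}]]
convs, chat_inputs = renderer.render_chat(conversations, ChatParams())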

_process_tokens

_process_tokens(
    prompt: TokensPrompt, *, skip_mm_cache: bool = False
) -> TokensInput | MultiModalInput

Process token inputs, with multimodal preprocessing offloaded to the shared thread pool in the async variant.

Source code in vllm/renderers/base.py
def _process_tokens(
    self,
    prompt: TokensPrompt,
    *,
    skip_mm_cache: bool = False,
) -> TokensInput | MultiModalInput:
    """Process token inputs, with multimodal preprocessing offloaded
    to the shared thread pool in the async variant.
    """
    prompt_token_ids = prompt["prompt_token_ids"]

    engine_input: TokensInput | MultiModalInput
    if multi_modal_data := prompt.get("multi_modal_data"):
        engine_input = self._process_multimodal(
            prompt_token_ids,
            multi_modal_data,
            mm_processor_kwargs=prompt.get("mm_processor_kwargs"),
            tokenization_kwargs=None,  # Tokenization already done in Step 2
            mm_uuids=prompt.get("multi_modal_uuids"),
            skip_mm_cache=skip_mm_cache,
        )
    else:
        engine_input = tokens_input(prompt_token_ids)

    if prompt_text := prompt.get("prompt"):
        engine_input["prompt"] = prompt_text
    if cache_salt := prompt.get("cache_salt"):
        engine_input["cache_salt"] = cache_salt

    return engine_input
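
A minimal sketch of how this dispatch behaves, assuming `renderer` is a concrete BaseRenderer instance and `img` is an image object loaded elsewhere (illustrative only, not part of the source):

# Text-only: the token ids become a plain tokens_input, and the
# optional "prompt" and "cache_salt" keys are carried over.
engine_input = renderer._process_tokens(
    {"prompt_token_ids": [1, 2, 3], "prompt": "abc", "cache_salt": "s1"}
)

# With multimodal data: the prompt goes through the multimodal
# processor; skip_mm_cache=True uses the secondary read-only processor
# (when available) so the main processor cache is not affected.
engine_input = renderer._process_tokens(
    {"prompt_token_ids": [1, 2, 3], "multi_modal_data": {"image": [img]}},
    skip_mm_cache=True,
)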

clear_mm_cache_async async

clear_mm_cache_async() -> None

Serialize clear_mm_cache through the shared executor to avoid races with concurrent process_inputs on the mm_processor_cache.

Source code in vllm/renderers/base.py
async def clear_mm_cache_async(self) -> None:
    """Serialize clear_mm_cache through the shared executor to avoid
    races with concurrent process_inputs on the mm_processor_cache."""
    await self._clear_mm_cache_async()
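
In async serving code, prefer this coroutine over calling clear_mm_cache() directly: the clear runs on the same shared executor as in-flight multimodal preprocessing, so it does not race with it. A tiny illustrative sketch (the handler name is hypothetical):

async def handle_clear_cache(renderer: BaseRenderer) -> None:
    # Serialized through the renderer's shared executor.
    await renderer.clear_mm_cache_async()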

get_dec_start_token_id

get_dec_start_token_id() -> int

Obtain the decoder start token id employed by an encoder/decoder model, raising an error if it is not available.

Source code in vllm/renderers/base.py
def get_dec_start_token_id(self) -> int:
    """
    Obtain the decoder start token id employed by an encoder/decoder model,
    raising an error if it is not available.
    """
    dec_start_token_id = getattr(
        self.model_config.hf_config, "decoder_start_token_id", None
    )

    if dec_start_token_id is None:
        logger.warning_once(
            "Falling back on <BOS> for decoder start token id "
            "because decoder start token id is not available."
        )
        dec_start_token_id = self.get_bos_token_id()

    if dec_start_token_id is None:
        raise RuntimeError("Cannot find decoder start token id or <BOS>")

    return dec_start_token_id

warmup

warmup(chat_params: ChatParams) -> None

Warm up this renderer to avoid first-request latency.

For chat requests:

- Jinja2 template compilation

Source code in vllm/renderers/base.py
def warmup(self, chat_params: ChatParams) -> None:
    """
    Warm up this renderer to avoid first-request latency.

    For chat requests:
    - Jinja2 template compilation
    """
    from vllm.entrypoints.chat_utils import ChatTemplateResolutionError

    try:
        logger.debug("Warming up chat template processing...")
        start_time = time.perf_counter()

        self.render_chat([[{"role": "user", "content": "warmup"}]], chat_params)

        elapsed = time.perf_counter() - start_time
        logger.debug("Chat template warmup completed in %.3fs", elapsed)
    except ChatTemplateResolutionError:
        logger.debug("This model does not support chat template.")
    except Exception:
        logger.warning("Chat template warmup failed", exc_info=True)

    if self.mm_processor:
        from vllm.multimodal.processing import TimingContext

        model_config = self.model_config
        mm_config = model_config.get_multimodal_config()
        processor = self.mm_processor
        mm_limits = processor.info.allowed_mm_limits

        try:
            logger.debug("Warming up multi-modal processing...")
            start_time = time.perf_counter()

            processor_inputs = processor.dummy_inputs.get_dummy_processor_inputs(
                seq_len=model_config.max_model_len,
                mm_counts=dict.fromkeys(mm_limits, 1),
                mm_options=mm_config.limit_per_prompt,
            )
            _ = processor.apply(
                processor_inputs, timing_ctx=TimingContext(enabled=False)
            )

            elapsed = time.perf_counter() - start_time
            logger.info("Multi-modal warmup completed in %.3fs", elapsed)
        except Exception:
            logger.warning("Multi-modal warmup failed")
        finally:
            self.clear_mm_cache()
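
Warmup is intended to run once before serving traffic, e.g. during server startup. A minimal illustrative sketch (assumes `renderer` is a constructed renderer instance):

from vllm.renderers.params import ChatParams

# Compiles the chat template and, for multimodal models, runs one dummy
# multimodal processing pass; the processor cache is cleared afterwards.
renderer.warmup(ChatParams())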

ChatParams dataclass

Configuration to control how to parse chat messages.

Source code in vllm/renderers/params.py
@dataclass(frozen=True)
class ChatParams:
    """Configuration to control how to parse chat messages."""

    chat_template: str | None = None
    """The chat template to apply."""

    chat_template_content_format: "ChatTemplateContentFormatOption" = "auto"
    """The format of the chat template."""

    chat_template_kwargs: dict[str, Any] = field(default_factory=dict)
    """The kwargs to pass to the chat template."""

    media_io_kwargs: dict[str, dict[str, Any]] | None = None
    """Per-modality kwargs for media I/O (loading/decoding images, videos, etc.)."""

    mm_processor_kwargs: dict[str, Any] | None = None
    """The kwargs to pass to the multi-modal processor."""

    def with_defaults(
        self,
        default_chat_template_kwargs: dict[str, Any] | None = None,
        default_media_io_kwargs: dict[str, dict[str, Any]] | None = None,
        default_mm_processor_kwargs: dict[str, Any] | None = None,
    ):
        if (
            not default_chat_template_kwargs
            and not default_media_io_kwargs
            and not default_mm_processor_kwargs
        ):
            return self

        return ChatParams(
            chat_template=self.chat_template,
            chat_template_content_format=self.chat_template_content_format,
            chat_template_kwargs=merge_kwargs(
                default_chat_template_kwargs,
                self.chat_template_kwargs,
            ),
            media_io_kwargs=merge_media_io_kwargs(
                default_media_io_kwargs,
                self.media_io_kwargs,
            ),
            mm_processor_kwargs=recursively_merge_kwargs(
                default_mm_processor_kwargs,
                self.mm_processor_kwargs,
            ),
        )

    def get_apply_chat_template_kwargs(self) -> dict[str, Any]:
        """The arguments to pass to `tokenizer.apply_chat_template`."""
        return merge_kwargs(
            self.chat_template_kwargs,
            dict(chat_template=self.chat_template, return_dict=False),
        )
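
A minimal illustrative sketch of constructing ChatParams and layering server-wide defaults underneath per-request values via with_defaults (the specific chat-template kwargs below are hypothetical and template-dependent):

from vllm.renderers.params import ChatParams

params = ChatParams(
    chat_template_kwargs={"enable_thinking": False},
)

# Fold in server-side defaults; request-level kwargs are merged with them.
params = params.with_defaults(
    default_chat_template_kwargs={"add_generation_prompt": True},
)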

chat_template class-attribute instance-attribute

chat_template: str | None = None

The chat template to apply.

chat_template_content_format class-attribute instance-attribute

chat_template_content_format: ChatTemplateContentFormatOption = "auto"

The format of the chat template.

chat_template_kwargs class-attribute instance-attribute

chat_template_kwargs: dict[str, Any] = field(
    default_factory=dict
)

The kwargs to pass to the chat template.

media_io_kwargs class-attribute instance-attribute

media_io_kwargs: dict[str, dict[str, Any]] | None = None

Per-modality kwargs for media I/O (loading/decoding images, videos, etc.).

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: dict[str, Any] | None = None

The kwargs to pass to the multi-modal processor.

get_apply_chat_template_kwargs

get_apply_chat_template_kwargs() -> dict[str, Any]

The arguments to pass to tokenizer.apply_chat_template.

Source code in vllm/renderers/params.py
def get_apply_chat_template_kwargs(self) -> dict[str, Any]:
    """The arguments to pass to `tokenizer.apply_chat_template`."""
    return merge_kwargs(
        self.chat_template_kwargs,
        dict(chat_template=self.chat_template, return_dict=False),
    )

TokenizeParams dataclass

Configuration to control how prompts are tokenized.

Source code in vllm/renderers/params.py
@dataclass(frozen=True)
class TokenizeParams:
    """Configuration to control how prompts are tokenized."""

    max_total_tokens: int | None
    """
    Maximum allowed number of input + output tokens.

    Usually, this refers to the model's context length.
    """

    max_output_tokens: int = 0
    """Maximum requested number of output tokens."""

    pad_prompt_tokens: int | None = None
    """
    Number of tokens to pad to:
    - `None` means no padding.
    - `-1` maps to `max_input_tokens`.
    """

    truncate_prompt_tokens: int | None = None
    """
    Number of tokens to keep:
    - `None` means no truncation.
    - `-1` maps to `max_input_tokens`.
    """

    truncation_side: Literal["left", "right"] | None = None
    """
    Which side to truncate from when ``truncate_prompt_tokens`` is active:
    - ``"right"`` keeps the first N tokens (truncate from the end).
    - ``"left"``  keeps the last  N tokens (truncate from the start).
    - ``None``    falls back to the tokenizer default.
    """

    do_lower_case: bool = False
    """Whether to normalize text to lower case before tokenization."""

    add_special_tokens: bool = True
    """Whether to add special tokens."""

    needs_detokenization: bool = False
    """
    Whether the tokenized prompt needs to contain the original text.

    Not to be confused with `SamplingParams.detokenize` which deals
    with the output generated by the model.
    """

    max_total_tokens_param: str = "max_total_tokens"
    """Override this to edit the message for validation errors."""

    max_output_tokens_param: str = "max_output_tokens"
    """Override this to edit the message for validation errors."""

    truncate_prompt_tokens_param: str = "truncate_prompt_tokens"
    """Override this to edit the message for validation errors."""

    @property
    def max_input_tokens(self) -> int | None:
        """Maximum allowed number of input tokens."""
        if self.max_total_tokens is None:
            return None

        return self.max_total_tokens - self.max_output_tokens

    def __post_init__(self) -> None:
        max_total_tokens = self.max_total_tokens
        max_output_tokens = self.max_output_tokens
        max_input_tokens = self.max_input_tokens
        truncate_prompt_tokens = self.truncate_prompt_tokens

        if (
            max_output_tokens is not None
            and max_total_tokens is not None
            and max_output_tokens > max_total_tokens
        ):
            raise VLLMValidationError(
                f"{self.max_output_tokens_param}={max_output_tokens}"
                f"cannot be greater than "
                f"{self.max_total_tokens_param}={max_total_tokens=}. "
                f"Please request fewer output tokens.",
                parameter=self.max_output_tokens_param,
                value=max_output_tokens,
            )

        if (
            max_input_tokens is not None
            and truncate_prompt_tokens is not None
            and truncate_prompt_tokens > max_input_tokens
        ):
            raise VLLMValidationError(
                f"{self.truncate_prompt_tokens_param}={truncate_prompt_tokens} "
                f"cannot be greater than {self.max_total_tokens_param} - "
                f"{self.max_output_tokens_param} = {max_input_tokens}. "
                f"Please request a smaller truncation size.",
                parameter=self.truncate_prompt_tokens_param,
                value=truncate_prompt_tokens,
            )

    def with_kwargs(self, **tokenization_kwargs: Any):
        max_length = tokenization_kwargs.pop("max_length", self.max_input_tokens)
        pad_prompt_tokens = tokenization_kwargs.pop(
            "pad_prompt_tokens", self.pad_prompt_tokens
        )
        truncate_prompt_tokens = tokenization_kwargs.pop(
            "truncate_prompt_tokens", self.truncate_prompt_tokens
        )
        do_lower_case = tokenization_kwargs.pop("do_lower_case", self.do_lower_case)
        add_special_tokens = tokenization_kwargs.pop(
            "add_special_tokens", self.add_special_tokens
        )
        needs_detokenization = tokenization_kwargs.pop(
            "needs_detokenization", self.needs_detokenization
        )

        # https://huggingface.co/docs/transformers/en/pad_truncation
        if padding := tokenization_kwargs.pop("padding", None):
            if padding == "max_length":
                pad_prompt_tokens = max_length
            elif padding in (False, "do_not_pad"):
                pad_prompt_tokens = None
            else:
                # To emit the below warning
                tokenization_kwargs["padding"] = padding

        if truncation := tokenization_kwargs.pop("truncation", None):
            if truncation in (True, "longest_first"):
                truncate_prompt_tokens = max_length
            elif truncation in (False, "do_not_truncate"):
                truncate_prompt_tokens = None
            else:
                # To emit the below warning
                tokenization_kwargs["truncation"] = truncation

        if tokenization_kwargs:
            logger.warning(
                "The following tokenization arguments are not supported "
                "by vLLM Renderer and will be ignored: %s",
                tokenization_kwargs,
            )

        max_total_tokens = self.max_total_tokens

        return TokenizeParams(
            max_total_tokens=max_total_tokens,
            max_output_tokens=(
                0
                if max_total_tokens is None or max_length is None
                else max_total_tokens - max_length
            ),
            pad_prompt_tokens=pad_prompt_tokens,
            truncate_prompt_tokens=truncate_prompt_tokens,
            truncation_side=self.truncation_side,
            do_lower_case=do_lower_case,
            add_special_tokens=add_special_tokens,
            needs_detokenization=needs_detokenization,
        )

    def get_encode_kwargs(self) -> dict[str, Any]:
        """The arguments to pass to `tokenizer.encode`."""
        max_length = self.truncate_prompt_tokens
        if max_length is not None and max_length < 0:
            max_length = self.max_input_tokens
        elif max_length is None and self.max_input_tokens is not None:
            # This prevents tokenization from taking up more resources than necessary
            # while still failing `self._token_len_check` as expected by users
            max_length = self.max_input_tokens + 1

        # Left-side truncation requires the full token sequence so we can
        # slice from the end in _token_truncation.  Disable HF-level
        # truncation (which would incorrectly truncate from the right for
        # pooling models) and let _token_truncation handle it.
        if self.truncation_side == "left":
            return dict(
                truncation=False,
                add_special_tokens=self.add_special_tokens,
            )

        return dict(
            truncation=max_length is not None,
            max_length=max_length,
            add_special_tokens=self.add_special_tokens,
        )

    def _text_len_check(self, tokenizer: TokenizerLike | None, text: str) -> str:
        """Apply length checks to prompt text if necessary."""
        max_input_tokens = self.max_input_tokens
        if max_input_tokens is None:
            return text

        if self.truncate_prompt_tokens is None and tokenizer is not None:
            max_input_chars = max_input_tokens * tokenizer.max_chars_per_token

            if len(text) > max_input_chars:
                # To save resources, fail the request outright without even
                # attempting tokenization
                raise VLLMValidationError(
                    f"This model's maximum context length is "
                    f"{self.max_total_tokens} tokens. However, you requested "
                    f"{self.max_output_tokens} output tokens and your prompt "
                    f"contains {len(text)} characters (more than "
                    f"{max_input_chars} characters, which is the upper bound "
                    f"for {max_input_tokens} input tokens). "
                    f"Please reduce the length of the input prompt or the "
                    f"number of requested output tokens.",
                    parameter="input_text",
                    value=len(text),
                )

        return text

    def _text_lowercase(self, tokenizer: TokenizerLike | None, text: str) -> str:
        """Apply lowercase to prompt text if necessary."""
        return text.lower() if self.do_lower_case else text

    def _validate_text(self, tokenizer: TokenizerLike | None, text: str) -> str:
        """Apply all validators to prompt text."""
        for validator in (
            self._text_len_check,
            self._text_lowercase,
        ):
            text = validator(tokenizer, text)

        return text

    def apply_pre_tokenization(
        self,
        tokenizer: TokenizerLike | None,
        prompt: TextPrompt,
    ) -> TextPrompt:
        """
        Ensure that the prompt meets the requirements set out by this config.
        If that is not possible, raise a `VLLMValidationError`.

        This method is run before tokenization occurs.
        """
        prompt["prompt"] = self._validate_text(tokenizer, prompt["prompt"])

        return prompt

    def _token_padding(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
        """Apply padding to prompt tokens if necessary."""
        pad_length = self.pad_prompt_tokens
        if pad_length is not None and pad_length < 0:
            pad_length = self.max_input_tokens

        if pad_length is None or pad_length <= len(tokens):
            return tokens

        if tokenizer is None:
            raise ValueError("Cannot pad tokens when `skip_tokenizer_init=True`")
        if not isinstance(tokens, list):
            raise ValueError("Cannot pad tokens for embedding inputs")

        return tokens + [tokenizer.pad_token_id] * (pad_length - len(tokens))

    def _token_truncation(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
        """Apply truncation to prompt tokens if necessary."""
        max_length = self.truncate_prompt_tokens
        if max_length is not None and max_length < 0:
            max_length = self.max_input_tokens

        if max_length is None or max_length >= len(tokens):
            return tokens
        if max_length == 0:
            return tokens[:0]

        side = self.truncation_side or (
            tokenizer.truncation_side if tokenizer is not None else None
        )
        if side == "left":
            return tokens[-max_length:]

        return tokens[:max_length]

    def _token_len_check(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
        """Apply length checks to prompt tokens if necessary."""
        max_input_tokens = self.max_input_tokens
        if max_input_tokens is None:
            return tokens

        if len(tokens) > max_input_tokens:
            token_count = len(tokens)
            # The tokenizer may have truncated the prompt to
            # max_input_tokens + 1 (see get_encode_kwargs), so the
            # actual prompt length could be larger.
            qualifier = "at least " if token_count == max_input_tokens + 1 else ""
            total = token_count + self.max_output_tokens
            raise VLLMValidationError(
                f"This model's maximum context length is "
                f"{self.max_total_tokens} tokens. However, you requested "
                f"{self.max_output_tokens} output tokens and your prompt "
                f"contains {qualifier}{token_count} input tokens, "
                f"for a total of {qualifier}{total} tokens. "
                f"Please reduce the length of the input prompt or the "
                f"number of requested output tokens.",
                parameter="input_tokens",
                value=token_count,
            )

        return tokens

    def _validate_tokens(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
        """Apply all validators to a token sequence."""
        for validator in (
            self._token_padding,
            self._token_truncation,
            self._token_len_check,
        ):
            tokens = validator(tokenizer, tokens)

        return tokens

    def apply_post_tokenization(
        self,
        tokenizer: TokenizerLike | None,
        prompt: TokensPrompt | EmbedsPrompt,
    ) -> TokensPrompt | EmbedsPrompt:
        """
        Ensure that the prompt meets the requirements set out by this config.
        If that is not possible, raise a `VLLMValidationError`.

        This method is run after tokenization occurs.
        """
        if "prompt_token_ids" in prompt:
            prompt["prompt_token_ids"] = self._validate_tokens(  # type: ignore[typeddict-unknown-key]
                tokenizer,
                prompt["prompt_token_ids"],  # type: ignore[typeddict-item]
            )
        if "prompt_embeds" in prompt:
            prompt["prompt_embeds"] = self._validate_tokens(  # type: ignore[typeddict-unknown-key]
                tokenizer,
                prompt["prompt_embeds"],  # type: ignore[typeddict-item]
            )

        return prompt
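
These hooks are designed to bracket a single call to the tokenizer: validate the raw text, tokenize with the kwargs produced by get_encode_kwargs, then validate the resulting token sequence. A minimal sketch of that flow, assuming params is an instance of this config class and tokenizer is any TokenizerLike object; the helper name render_text_prompt is made up for illustration:

def render_text_prompt(params, tokenizer, text: str) -> dict:
    # Illustrative only: `params` and `tokenizer` stand in for real objects;
    # this is not a fixed public API.
    # 1. Validate/normalize the raw text (length heuristic, lowercasing).
    prompt = params.apply_pre_tokenization(tokenizer, {"prompt": text})

    # 2. Tokenize with the kwargs derived from the config
    #    (truncation, max_length, add_special_tokens).
    token_ids = tokenizer.encode(prompt["prompt"], **params.get_encode_kwargs())

    # 3. Pad/truncate/length-check the resulting token sequence.
    return params.apply_post_tokenization(tokenizer, {"prompt_token_ids": token_ids})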

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = True

Whether to add special tokens.

do_lower_case class-attribute instance-attribute

do_lower_case: bool = False

Whether to normalize text to lower case before tokenization.

max_input_tokens property

max_input_tokens: int | None

Maximum allowed number of input tokens.

max_output_tokens class-attribute instance-attribute

max_output_tokens: int = 0

Maximum requested number of output tokens.

max_output_tokens_param class-attribute instance-attribute

max_output_tokens_param: str = 'max_output_tokens'

Override this to edit the message for validation errors.

max_total_tokens instance-attribute

max_total_tokens: int | None

Maximum allowed number of input + output tokens.

Usually, this refers to the model's context length.

max_total_tokens_param class-attribute instance-attribute

max_total_tokens_param: str = 'max_total_tokens'

Override this to edit the message for validation errors.

needs_detokenization class-attribute instance-attribute

needs_detokenization: bool = False

Whether the tokenized prompt needs to contain the original text.

Not to be confused with SamplingParams.detokenize, which deals with the output generated by the model.

pad_prompt_tokens class-attribute instance-attribute

pad_prompt_tokens: int | None = None

Number of tokens to pad to:

- None means no padding.
- -1 maps to max_input_tokens.

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: int | None = None

Number of tokens to keep:

- None means no truncation.
- -1 maps to max_input_tokens.

truncate_prompt_tokens_param class-attribute instance-attribute

truncate_prompt_tokens_param: str = 'truncate_prompt_tokens'

Override this to edit the message for validation errors.

truncation_side class-attribute instance-attribute

truncation_side: Literal['left', 'right'] | None = None

Which side to truncate from when truncate_prompt_tokens is active:

- "right" keeps the first N tokens (truncate from the end).
- "left" keeps the last N tokens (truncate from the start).
- None falls back to the tokenizer default.
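
Taken together, pad_prompt_tokens, truncate_prompt_tokens, and truncation_side determine how a token sequence is reshaped before the final length check. The sketch below restates the sentinel resolution outside the class; the function name resolve_lengths is made up for illustration:

def resolve_lengths(
    pad_prompt_tokens: int | None,
    truncate_prompt_tokens: int | None,
    max_input_tokens: int | None,
) -> tuple[int | None, int | None]:
    # Negative values (in practice -1) map to max_input_tokens;
    # None disables the corresponding step entirely.
    pad_to = pad_prompt_tokens
    if pad_to is not None and pad_to < 0:
        pad_to = max_input_tokens
    keep = truncate_prompt_tokens
    if keep is not None and keep < 0:
        keep = max_input_tokens
    return pad_to, keep

# With max_input_tokens=4096:
#   resolve_lengths(-1, None, 4096) -> (4096, None): pad to the full budget, never truncate.
#   resolve_lengths(None, -1, 4096) -> (None, 4096): no padding, keep at most 4096 tokens.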

_text_len_check

_text_len_check(
    tokenizer: TokenizerLike | None, text: str
) -> str

Apply length checks to prompt text if necessary.

Source code in vllm/renderers/params.py
def _text_len_check(self, tokenizer: TokenizerLike | None, text: str) -> str:
    """Apply length checks to prompt text if necessary."""
    max_input_tokens = self.max_input_tokens
    if max_input_tokens is None:
        return text

    if self.truncate_prompt_tokens is None and tokenizer is not None:
        max_input_chars = max_input_tokens * tokenizer.max_chars_per_token

        if len(text) > max_input_chars:
            # To save resources, fail the request outright without even
            # attempting tokenization
            raise VLLMValidationError(
                f"This model's maximum context length is "
                f"{self.max_total_tokens} tokens. However, you requested "
                f"{self.max_output_tokens} output tokens and your prompt "
                f"contains {len(text)} characters (more than "
                f"{max_input_chars} characters, which is the upper bound "
                f"for {max_input_tokens} input tokens). "
                f"Please reduce the length of the input prompt or the "
                f"number of requested output tokens.",
                parameter="input_text",
                value=len(text),
            )

    return text
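
The character bound is a cheap pre-check: a single token decodes to at most tokenizer.max_chars_per_token characters, so max_input_tokens tokens can never cover more than max_input_tokens * max_chars_per_token characters. A standalone sketch of that inequality (the function name is illustrative):

def text_certainly_too_long(
    text: str, max_input_tokens: int, max_chars_per_token: int
) -> bool:
    # If the text exceeds this bound, it cannot possibly fit within
    # max_input_tokens tokens, so tokenization can be skipped entirely.
    max_input_chars = max_input_tokens * max_chars_per_token
    return len(text) > max_input_chars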

_text_lowercase

_text_lowercase(
    tokenizer: TokenizerLike | None, text: str
) -> str

Apply lowercase to prompt text if necessary.

Source code in vllm/renderers/params.py
def _text_lowercase(self, tokenizer: TokenizerLike | None, text: str) -> str:
    """Apply lowercase to prompt text if necessary."""
    return text.lower() if self.do_lower_case else text

_token_len_check

_token_len_check(
    tokenizer: TokenizerLike | None, tokens: _S
) -> _S

Apply length checks to prompt tokens if necessary.

Source code in vllm/renderers/params.py
def _token_len_check(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
    """Apply length checks to prompt tokens if necessary."""
    max_input_tokens = self.max_input_tokens
    if max_input_tokens is None:
        return tokens

    if len(tokens) > max_input_tokens:
        token_count = len(tokens)
        # The tokenizer may have truncated the prompt to
        # max_input_tokens + 1 (see get_encode_kwargs), so the
        # actual prompt length could be larger.
        qualifier = "at least " if token_count == max_input_tokens + 1 else ""
        total = token_count + self.max_output_tokens
        raise VLLMValidationError(
            f"This model's maximum context length is "
            f"{self.max_total_tokens} tokens. However, you requested "
            f"{self.max_output_tokens} output tokens and your prompt "
            f"contains {qualifier}{token_count} input tokens, "
            f"for a total of {qualifier}{total} tokens. "
            f"Please reduce the length of the input prompt or the "
            f"number of requested output tokens.",
            parameter="input_tokens",
            value=token_count,
        )

    return tokens
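
The qualifier accounts for the interaction with get_encode_kwargs: when the tokenizer was asked to truncate at max_input_tokens + 1, a count of exactly max_input_tokens + 1 only proves the original prompt was at least that long. A quick numeric illustration (values made up):

# Suppose max_input_tokens = 8, so the tokenizer was called with max_length = 9.
# A 50-token prompt comes back as exactly 9 tokens, and the error message
# reports "at least 9 input tokens" rather than a precise count.
max_input_tokens = 8
token_count = 9  # truncated by the tokenizer; the real prompt may be far longer
qualifier = "at least " if token_count == max_input_tokens + 1 else ""
assert qualifier == "at least "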

_token_padding

_token_padding(
    tokenizer: TokenizerLike | None, tokens: _S
) -> _S

Apply padding to prompt tokens if necessary.

Source code in vllm/renderers/params.py
def _token_padding(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
    """Apply padding to prompt tokens if necessary."""
    pad_length = self.pad_prompt_tokens
    if pad_length is not None and pad_length < 0:
        pad_length = self.max_input_tokens

    if pad_length is None or pad_length <= len(tokens):
        return tokens

    if tokenizer is None:
        raise ValueError("Cannot pad tokens when `skip_tokenizer_init=True`")
    if not isinstance(tokens, list):
        raise ValueError("Cannot pad tokens for embedding inputs")

    return tokens + [tokenizer.pad_token_id] * (pad_length - len(tokens))
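
Padding only ever extends the sequence on the right, and only when the target length exceeds the current length. A self-contained sketch (0 stands in for the tokenizer's pad token id; the function name is made up):

def pad_right(tokens: list[int], pad_to: int | None, pad_token_id: int) -> list[int]:
    # No-op when padding is disabled or the prompt is already long enough.
    if pad_to is None or pad_to <= len(tokens):
        return tokens
    return tokens + [pad_token_id] * (pad_to - len(tokens))

assert pad_right([1, 2, 3], 5, 0) == [1, 2, 3, 0, 0]
assert pad_right([1, 2, 3], 2, 0) == [1, 2, 3]  # already longer than the target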

_token_truncation

_token_truncation(
    tokenizer: TokenizerLike | None, tokens: _S
) -> _S

Apply truncation to prompt tokens if necessary.

Source code in vllm/renderers/params.py
def _token_truncation(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
    """Apply truncation to prompt tokens if necessary."""
    max_length = self.truncate_prompt_tokens
    if max_length is not None and max_length < 0:
        max_length = self.max_input_tokens

    if max_length is None or max_length >= len(tokens):
        return tokens
    if max_length == 0:
        return tokens[:0]

    side = self.truncation_side or (
        tokenizer.truncation_side if tokenizer is not None else None
    )
    if side == "left":
        return tokens[-max_length:]

    return tokens[:max_length]
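
Which end of the sequence survives depends on the resolved truncation side: "left" keeps the trailing tokens, while any other value keeps the leading ones. A standalone illustration (the function name is made up):

def truncate(tokens: list[int], keep: int, side: str | None) -> list[int]:
    if keep >= len(tokens):
        return tokens
    if keep == 0:
        return tokens[:0]
    # "left" keeps the last `keep` tokens; the default keeps the first `keep`.
    return tokens[-keep:] if side == "left" else tokens[:keep]

assert truncate([1, 2, 3, 4, 5], 3, "left") == [3, 4, 5]
assert truncate([1, 2, 3, 4, 5], 3, "right") == [1, 2, 3]
assert truncate([1, 2, 3, 4, 5], 3, None) == [1, 2, 3]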

_validate_text

_validate_text(
    tokenizer: TokenizerLike | None, text: str
) -> str

Apply all validators to prompt text.

Source code in vllm/renderers/params.py
def _validate_text(self, tokenizer: TokenizerLike | None, text: str) -> str:
    """Apply all validators to prompt text."""
    for validator in (
        self._text_len_check,
        self._text_lowercase,
    ):
        text = validator(tokenizer, text)

    return text

_validate_tokens

_validate_tokens(
    tokenizer: TokenizerLike | None, tokens: _S
) -> _S

Apply all validators to a token sequence.

Source code in vllm/renderers/params.py
def _validate_tokens(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
    """Apply all validators to a token sequence."""
    for validator in (
        self._token_padding,
        self._token_truncation,
        self._token_len_check,
    ):
        tokens = validator(tokenizer, tokens)

    return tokens
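
The order matters: padding runs first, truncation second, and the length check last, so the check sees the sequence the model will actually receive. A compact trace of that ordering (configuration values are made up; 0 stands in for the pad token id):

# max_input_tokens = 8, pad_prompt_tokens = -1 (pad to 8), truncate_prompt_tokens = 4.
tokens = [1, 2, 3]
tokens = tokens + [0] * (8 - len(tokens))  # _token_padding    -> 8 tokens
tokens = tokens[:4]                        # _token_truncation -> 4 tokens
assert len(tokens) <= 8                    # _token_len_check passes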

apply_post_tokenization

apply_post_tokenization(
    tokenizer: TokenizerLike | None,
    prompt: TokensPrompt | EmbedsPrompt,
) -> TokensPrompt | EmbedsPrompt

Ensure that the prompt meets the requirements set out by this config. If that is not possible, raise a VLLMValidationError.

This method is run after tokenization occurs.

Source code in vllm/renderers/params.py
def apply_post_tokenization(
    self,
    tokenizer: TokenizerLike | None,
    prompt: TokensPrompt | EmbedsPrompt,
) -> TokensPrompt | EmbedsPrompt:
    """
    Ensure that the prompt meets the requirements set out by this config.
    If that is not possible, raise a `VLLMValidationError`.

    This method is run after tokenization occurs.
    """
    if "prompt_token_ids" in prompt:
        prompt["prompt_token_ids"] = self._validate_tokens(  # type: ignore[typeddict-unknown-key]
            tokenizer,
            prompt["prompt_token_ids"],  # type: ignore[typeddict-item]
        )
    if "prompt_embeds" in prompt:
        prompt["prompt_embeds"] = self._validate_tokens(  # type: ignore[typeddict-unknown-key]
            tokenizer,
            prompt["prompt_embeds"],  # type: ignore[typeddict-item]
        )

    return prompt

apply_pre_tokenization

apply_pre_tokenization(
    tokenizer: TokenizerLike | None, prompt: TextPrompt
) -> TextPrompt

Ensure that the prompt meets the requirements set out by this config. If that is not possible, raise a VLLMValidationError.

This method is run before tokenization occurs.

Source code in vllm/renderers/params.py
def apply_pre_tokenization(
    self,
    tokenizer: TokenizerLike | None,
    prompt: TextPrompt,
) -> TextPrompt:
    """
    Ensure that the prompt meets the requirements set out by this config.
    If that is not possible, raise a `VLLMValidationError`.

    This method is run before tokenization occurs.
    """
    prompt["prompt"] = self._validate_text(tokenizer, prompt["prompt"])

    return prompt

get_encode_kwargs

get_encode_kwargs() -> dict[str, Any]

The arguments to pass to tokenizer.encode.

Source code in vllm/renderers/params.py
def get_encode_kwargs(self) -> dict[str, Any]:
    """The arguments to pass to `tokenizer.encode`."""
    max_length = self.truncate_prompt_tokens
    if max_length is not None and max_length < 0:
        max_length = self.max_input_tokens
    elif max_length is None and self.max_input_tokens is not None:
        # This prevents tokenization from taking up more resources than necessary
        # while still failing `self._token_len_check` as expected by users
        max_length = self.max_input_tokens + 1

    # Left-side truncation requires the full token sequence so we can
    # slice from the end in _token_truncation.  Disable HF-level
    # truncation (which would incorrectly truncate from the right for
    # pooling models) and let _token_truncation handle it.
    if self.truncation_side == "left":
        return dict(
            truncation=False,
            add_special_tokens=self.add_special_tokens,
        )

    return dict(
        truncation=max_length is not None,
        max_length=max_length,
        add_special_tokens=self.add_special_tokens,
    )
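
The resulting kwargs vary by configuration: left-side truncation disables tokenizer-level truncation entirely, while the default path caps max_length at either the explicit truncation limit or max_input_tokens + 1. The standalone mirror below (the function name is illustrative) reproduces a few outcomes:

def encode_kwargs(
    truncate_prompt_tokens: int | None,
    max_input_tokens: int | None,
    truncation_side: str | None,
    add_special_tokens: bool = True,
) -> dict:
    max_length = truncate_prompt_tokens
    if max_length is not None and max_length < 0:
        max_length = max_input_tokens
    elif max_length is None and max_input_tokens is not None:
        # Tokenize one extra token so the length check can say "at least".
        max_length = max_input_tokens + 1

    if truncation_side == "left":
        return dict(truncation=False, add_special_tokens=add_special_tokens)
    return dict(
        truncation=max_length is not None,
        max_length=max_length,
        add_special_tokens=add_special_tokens,
    )

assert encode_kwargs(None, 4096, None) == dict(
    truncation=True, max_length=4097, add_special_tokens=True
)
assert encode_kwargs(None, 4096, "left") == dict(
    truncation=False, add_special_tokens=True
)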