Skip to content

vllm.transformers_utils.processors.cheers

Cheers (UMM) processor for image and text inputs.

CheersProcessor

Bases: ProcessorMixin

Constructs a Cheers processor which wraps a SigLIP image processor and a Qwen2 tokenizer.

Source code in vllm/transformers_utils/processors/cheers.py
class CheersProcessor(ProcessorMixin):
    """
    Constructs a Cheers processor which wraps a
    SigLIP image processor and a Qwen2 tokenizer.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __call__(
        self,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput]
        | None = None,
        images: ImageInput | None = None,
        **kwargs: Unpack[CheersProcessorKwargs],
    ):
        """
        Preprocess image and/or text inputs into a single ``BatchFeature``.

        Args:
            text: Raw or pre-tokenized text (single item or list), or None
                to skip tokenization.
            images: Image input(s) for the image processor, or None to skip
                image preprocessing.
            **kwargs: Per-modality kwargs, merged with the tokenizer's init
                kwargs via ``self._merge_kwargs``.

        Returns:
            A ``BatchFeature`` combining the image features
            (``pixel_values`` and, when the image processor emits it,
            ``grid_hws``) with the tokenizer outputs.
        """
        output_kwargs = self._merge_kwargs(
            CheersProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if images is not None:
            # Local import keeps torch out of module-import time.
            import torch

            if isinstance(images, (list, tuple)):
                # Run the image processor per image, then concatenate along
                # the batch dimension so downstream code sees one tensor.
                all_pv = []
                all_ghw = []
                for img in images:
                    result = self.image_processor(img, **output_kwargs["images_kwargs"])
                    all_pv.append(result["pixel_values"])
                    if "grid_hws" in result:
                        all_ghw.append(result["grid_hws"])
                if all_pv:
                    pixel_values = {
                        "pixel_values": torch.cat(all_pv, dim=0),
                    }
                    # Only emit grid_hws when the image processor produced it.
                    if all_ghw:
                        pixel_values["grid_hws"] = torch.cat(all_ghw, dim=0)
                else:
                    # An empty list of images yields no image features;
                    # previously torch.cat([]) would raise here.
                    pixel_values = {}
            else:
                pixel_values = self.image_processor(
                    images, **output_kwargs["images_kwargs"]
                )
        else:
            pixel_values = {}

        text_inputs = (
            self.tokenizer(text, **output_kwargs["text_kwargs"])
            if text is not None
            else {}
        )

        return BatchFeature(data={**pixel_values, **text_inputs})

    def batch_decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Order-preserving, deduplicated union of the tokenizer's and the
        image processor's model input names."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))