class CheersProcessor(ProcessorMixin):
    """
    Constructs a Cheers processor which wraps a
    SigLIP image processor and a Qwen2 tokenizer.

    Calling the processor runs the image processor on ``images`` and the
    tokenizer on ``text`` and merges both results into one ``BatchFeature``.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __call__(
        self,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput]
        | None = None,
        images: ImageInput | None = None,
        **kwargs: Unpack[CheersProcessorKwargs],
    ):
        """
        Prepare text and/or images as model inputs.

        Args:
            text: Text to tokenize. Skipped when ``None``.
            images: A single image input, or a ``list``/``tuple`` of images.
                A list/tuple is processed one image at a time and the
                per-image outputs are concatenated along dim 0. ``None`` or
                an empty list/tuple yields no image features.
            **kwargs: Extra options, split by ``_merge_kwargs`` into
                ``"images_kwargs"`` and ``"text_kwargs"``.

        Returns:
            A ``BatchFeature`` holding the image processor outputs and the
            tokenizer outputs. On a key collision the tokenizer value wins,
            because text outputs are merged last.
        """
        output_kwargs = self._merge_kwargs(
            CheersProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if images is None:
            image_inputs = {}
        elif isinstance(images, (list, tuple)):
            image_inputs = self._process_image_sequence(
                images, output_kwargs["images_kwargs"]
            )
        else:
            # Single (non list/tuple) input: forward it untouched and keep
            # every key the image processor returns.
            image_inputs = self.image_processor(
                images, **output_kwargs["images_kwargs"]
            )

        if text is None:
            text_inputs = {}
        else:
            text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        return BatchFeature(data={**image_inputs, **text_inputs})

    def _process_image_sequence(self, images, images_kwargs):
        """
        Run the image processor on each image and concatenate the outputs.

        Args:
            images: ``list`` or ``tuple`` of individual images.
            images_kwargs: Keyword arguments forwarded to the image processor
                on every call.

        Returns:
            A dict with ``"pixel_values"`` and, when at least one per-image
            result provides it, ``"grid_hws"``; both concatenated along
            dim 0. An empty dict when ``images`` is empty.
        """
        # ``torch.cat`` rejects an empty list, so an empty batch is treated
        # the same as "no images" instead of raising.
        if not images:
            return {}

        # Imported lazily: torch is only needed for the concatenation below,
        # not for the single-image or text-only paths.
        import torch

        pixel_chunks = []
        grid_chunks = []
        for image in images:
            processed = self.image_processor(image, **images_kwargs)
            pixel_chunks.append(processed["pixel_values"])
            if "grid_hws" in processed:
                grid_chunks.append(processed["grid_hws"])

        # NOTE(review): torch.cat needs torch tensors, so this path assumes
        # the image processor is configured to return them - confirm that
        # CheersProcessorKwargs defaults guarantee this.
        features = {"pixel_values": torch.cat(pixel_chunks, dim=0)}
        if grid_chunks:
            features["grid_hws"] = torch.cat(grid_chunks, dim=0)
        return features

    def batch_decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """
        Tokenizer input names followed by image processor input names,
        with duplicates removed and first-seen order preserved.
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))