vllm.model_executor.models.gemma4_utils ¶

Gemma4 output parsing utilities for offline inference.

Standalone functions that parse decoded model text to extract structured thinking content and tool calls from Gemma4 models. These are pure-Python utilities with zero heavy dependencies — they work on raw decoded strings from any inference backend (vLLM, HuggingFace, TGI, etc.).

Usage with vLLM offline inference::

from vllm import LLM, SamplingParams
from vllm.model_executor.models.gemma4_utils import (
    parse_thinking_output,
    parse_tool_calls,
)

llm = LLM(model="google/gemma-4-it")
outputs = llm.generate(prompt, SamplingParams(...))
text = tokenizer.decode(outputs[0].outputs[0].token_ids, skip_special_tokens=False)

# Extract thinking / answer (works with or without enable_thinking)
result = parse_thinking_output(text)
print(result["thinking"])  # chain-of-thought or None
print(result["answer"])  # final answer

# Extract tool calls
tool_calls = parse_tool_calls(text)
for tc in tool_calls:
    print(f"{tc['name']}({tc['arguments']})")

Ported from transformers.models.gemma4.utils_gemma4 so that vLLM users do not need a transformers dependency for output parsing.

_clean_answer ¶

_clean_answer(text: str) -> str

Clean trailing sentinel tokens from the answer text.

Strips <turn|>, <eos>, and surrounding whitespace that the model appends at the end of its response.

Source code in vllm/model_executor/models/gemma4_utils.py

def _clean_answer(text: str) -> str:
    """Clean trailing sentinel tokens from the answer text.

    Strips ``<turn|>``, ``<eos>``, and surrounding whitespace that the
    model appends at the end of its response. The sentinels are removed
    regardless of the order in which they appear (``<eos><turn|>`` and
    ``<turn|><eos>`` are both fully stripped), and repeated sentinels are
    removed as well.

    Args:
        text: Answer text, possibly followed by sentinel tokens.

    Returns:
        The text with surrounding whitespace and trailing sentinels removed.
    """
    sentinels = (_TURN_END_TAG, "<eos>")
    text = text.strip()

    # Keep peeling sentinels off the end until none is left. Each successful
    # strip shortens the text, so the loop always terminates.
    stripped_any = True
    while stripped_any:
        stripped_any = False
        for sentinel in sentinels:
            # Guard against an empty sentinel, which would match forever.
            if sentinel and text.endswith(sentinel):
                text = text[: -len(sentinel)].rstrip()
                stripped_any = True
    return text

_parse_tool_arguments ¶

_parse_tool_arguments(args_str: str) -> dict[str, str]

Parse tool call arguments from the Gemma4 compact format.

Handles the key:<|"|>value<|"|> format used by Gemma4, with fallback to heuristic key-value extraction. Also tolerates the slightly different key: "value" format (space + plain quotes) that some chat templates produce.

Parameters:

Name	Type	Description	Default
`args_str`	`str`	Raw argument string from inside `call:name{...}`.	required

Returns:

Type	Description
`dict[str, str]`	Dictionary of argument name → value.

Source code in vllm/model_executor/models/gemma4_utils.py

def _parse_tool_arguments(args_str: str) -> dict[str, str]:
    """Parse tool call arguments from the Gemma4 compact format.

    Handles the ``key:<|"|>value<|"|>`` format used by Gemma4, with fallback
    to heuristic key-value extraction. Also tolerates the slightly different
    ``key: "value"`` format (space + plain quotes) that some chat templates
    produce.

    Args:
        args_str: Raw argument string from inside ``call:name{...}``.

    Returns:
        Dictionary of argument name → value.
    """
    if not args_str or not args_str.strip():
        return {}

    # Replace Gemma4 escape tokens with standard quotes.
    cleaned = args_str.replace(_ESCAPE_TOKEN, '"')

    # Try JSON parsing first (handles nested values, arrays, etc.).
    try:
        parsed = json.loads("{" + cleaned + "}")
        # Ensure all values are strings for consistency.
        return {k: str(v) if not isinstance(v, str) else v for k, v in parsed.items()}
    except (json.JSONDecodeError, ValueError):
        pass

    # Fallback: extract key:"value" pairs (allow optional space after colon).
    arguments = {}
    for key, value in re.findall(r'(\w+):\s*"([^"]*)"', cleaned):
        arguments[key] = value

    if not arguments:
        # Last resort: extract key:value pairs (unquoted).
        for key, value in re.findall(r"(\w+):\s*([^,}]+)", args_str):
            arguments[key] = value.strip().strip('"').replace(_ESCAPE_TOKEN, "")

    return arguments

_strip_thought_label ¶

_strip_thought_label(text: str) -> str

Strip the spurious thought\n label from the start of text.

Only strips when thought appears as the very first word followed by a newline — preserving the word thought in any other context.

Source code in vllm/model_executor/models/gemma4_utils.py

def _strip_thought_label(text: str) -> str:
    """Strip the spurious ``thought\\n`` label from the start of text.

    Only strips when ``thought`` appears as the very first word followed by
    a newline — preserving the word ``thought`` in any other context.
    """
    if text.startswith("thought\n"):
        return text[len("thought\n") :]
    return text

has_tool_response_tag ¶

has_tool_response_tag(text: str) -> bool

Check if model output properly ends with a tool response tag.

Gemma4 models sometimes emit <eos> instead of <|tool_response> after a tool call. This helper detects whether the model used the proper termination, so callers can decide whether to inject <|tool_response> into the next prompt.

Parameters:

Name	Type	Description	Default
`text`	`str`	Decoded model output text.	required

Returns:

Type	Description
`bool`	`True` if the output ends with `<\|tool_response>` (proper behavior), `False` otherwise.

Example::

>>> from vllm.model_executor.models.gemma4_utils import (
...     has_tool_response_tag
... )
>>> if not has_tool_response_tag(model_output):
...     # Model used <eos> instead — inject <|tool_response> manually
...     next_prompt = "<|tool_response>" + tool_result

Source code in vllm/model_executor/models/gemma4_utils.py

def has_tool_response_tag(text: str) -> bool:
    """Report whether the model output terminates with the tool response tag.

    After a tool call, Gemma4 models are expected to finish with
    ``<|tool_response>``, but they occasionally emit ``<eos>`` instead.
    Callers can use this check to decide whether ``<|tool_response>`` has
    to be injected into the next prompt manually.

    Args:
        text: Decoded model output text.

    Returns:
        ``True`` when the output, ignoring trailing whitespace, ends with
        ``<|tool_response>``; ``False`` otherwise.

    Example::

        >>> from vllm.model_executor.models.gemma4_utils import (
        ...     has_tool_response_tag
        ... )
        >>> if not has_tool_response_tag(model_output):
        ...     # Model used <eos> instead — inject <|tool_response> manually
        ...     next_prompt = "<|tool_response>" + tool_result
    """
    return text.rstrip().endswith(_TOOL_RESPONSE_START_TAG)

parse_thinking_output ¶

parse_thinking_output(text: str) -> dict[str, str | None]

Parse decoded Gemma4 model output.

Use this on all Gemma4 output regardless of whether thinking mode was enabled. It handles three cases:

Thinking enabled, tags present — splits on <|channel>/ <channel|> to separate chain-of-thought from the answer and strips the thought\n role label.
Thinking disabled, spurious label — strips the bare thought\n prefix that some Gemma4 models emit even without thinking mode.
Clean output — returns the text unchanged.

The answer text is always cleaned of trailing sentinel tokens (<turn|>, <eos>, etc.).

Parameters:

Name	Type	Description	Default
`text`	`str`	Decoded model output text (from `tokenizer.decode(...)`).	required

Returns:

Type	Description
`dict[str, str \| None]`	A dict with keys: - `"thinking"`: The chain-of-thought text, or `None` if no thinking delimiters were found. - `"answer"`: The final answer text.

Example::

>>> from vllm.model_executor.models.gemma4_utils import parse_thinking_output
>>> output_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
>>> result = parse_thinking_output(output_text)
>>> print(result["thinking"])  # chain-of-thought reasoning or None
>>> print(result["answer"])    # final answer

Source code in vllm/model_executor/models/gemma4_utils.py

def parse_thinking_output(text: str) -> dict[str, str | None]:
    """Parse decoded Gemma4 model output.

    Use this on **all** Gemma4 output regardless of whether thinking mode
    was enabled.  It handles three cases:

    1. **Thinking enabled, tags present** — splits on ``<|channel>``/
       ``<channel|>`` to separate chain-of-thought from the answer and
       strips the ``thought\\n`` role label. An empty thinking block
       (label only) yields an empty string.
    2. **Thinking disabled, spurious label** — strips the bare
       ``thought\\n`` prefix that some Gemma4 models emit even
       without thinking mode.
    3. **Clean output** — returns the text with only trailing sentinel
       tokens removed.

    The answer text is always cleaned of trailing sentinel tokens
    (``<turn|>``, ``<eos>``, etc.).

    Args:
        text: Decoded model output text (from ``tokenizer.decode(...)``).

    Returns:
        A dict with keys:
            - ``"thinking"``: The chain-of-thought text, or ``None`` if no
              thinking delimiters were found.
            - ``"answer"``: The final answer text.

    Example::

        >>> from vllm.model_executor.models.gemma4_utils import parse_thinking_output
        >>> output_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
        >>> result = parse_thinking_output(output_text)
        >>> print(result["thinking"])  # chain-of-thought reasoning or None
        >>> print(result["answer"])    # final answer
    """
    if _THINKING_END_TAG in text:
        parts = text.split(_THINKING_END_TAG, 1)
        thinking_block = parts[0]
        answer = _clean_answer(parts[1])

        # Extract thinking content: strip the start tag if present
        if _THINKING_START_TAG in thinking_block:
            thinking = thinking_block.split(_THINKING_START_TAG, 1)[1]
        else:
            thinking = thinking_block

        # Strip the "thought\n" channel role label the model emits inside
        # <|channel>thought\n...<channel|> (analogous to "user\n" in
        # <|turn>user\n...<turn|>).
        # Only leading whitespace is removed before the label check: a full
        # strip would eat the label's newline when the block is empty
        # ("thought\n"), leaving the bare word "thought" as the content.
        thinking = _strip_thought_label(thinking.lstrip())
        thinking = thinking.strip()

        return {"thinking": thinking, "answer": answer}

    # No thinking delimiters found.
    # Strip spurious "thought\n" role label that some Gemma4 models sometimes
    # emit even without thinking mode enabled, then clean trailing tokens.
    answer = _strip_thought_label(text)
    answer = _clean_answer(answer)
    return {"thinking": None, "answer": answer}

parse_tool_calls ¶

parse_tool_calls(
    text: str, *, strict: bool = False
) -> list[dict]

Parse tool calls from decoded Gemma4 model output.

Uses a tiered parsing strategy to handle known output variations in Gemma4 models, which may emit non-standard tool call formats.

Parsing tiers

Standard: <|tool_call>call:name{args}<tool_call|> (special token IDs 48/49 in decoded text)
Fallback (when strict=False): bare call:name{args} patterns, including <call>name{args} (fragmented tokens from multimodal inputs)

Parameters:

Name	Type	Description	Default
`text`	`str`	Decoded model output text (from `tokenizer.decode(..., skip_special_tokens=False)`).	required
`strict`	`bool`	If `True`, only match the standard `<\|tool_call>` format. If `False` (default), also try fallback patterns for known Gemma4 output variations.	`False`

Returns:

Type	Description
`list[dict]`	A list of dicts, each with keys: - `"name"`: The tool function name (e.g. `"get_weather"`). - `"arguments"`: A dict of argument name → value.

Example::

>>> from vllm.model_executor.models.gemma4_utils import (
...     parse_tool_calls
... )
>>> output = tokenizer.decode(outputs[0], skip_special_tokens=False)
>>> tool_calls = parse_tool_calls(output)
>>> for tc in tool_calls:
...     print(f"Call: {tc['name']}({tc['arguments']})")

Source code in vllm/model_executor/models/gemma4_utils.py

def parse_tool_calls(text: str, *, strict: bool = False) -> list[dict]:
    """Extract tool calls from decoded Gemma4 model output.

    Gemma4 models may emit non-standard tool call formats, so parsing is
    done in tiers; the second tier runs only if the first finds nothing.

    Parsing tiers:
        1. **Standard**: ``<|tool_call>call:name{args}<tool_call|>``
           (special token IDs 48/49 in decoded text). A closing ``<turn|>``
           is accepted in place of ``<tool_call|>``.
        2. **Fallback** (when ``strict=False``): bare ``call:name{args}``
           patterns, including ``<call>name{args}`` (fragmented tokens from
           multimodal inputs)

    Args:
        text: Decoded model output text (from ``tokenizer.decode(...,
            skip_special_tokens=False)``).
        strict: If ``True``, only match the standard ``<|tool_call>`` format.
            If ``False`` (default), also try fallback patterns for
            known Gemma4 output variations.

    Returns:
        A list of dicts, each with keys:
            - ``"name"``: The tool function name (e.g. ``"get_weather"``).
            - ``"arguments"``: A dict of argument name → value.

    Example::

        >>> from vllm.model_executor.models.gemma4_utils import (
        ...     parse_tool_calls
        ... )
        >>> output = tokenizer.decode(outputs[0], skip_special_tokens=False)
        >>> tool_calls = parse_tool_calls(output)
        >>> for tc in tool_calls:
        ...     print(f"Call: {tc['name']}({tc['arguments']})")
    """

    def _collect(pattern: str) -> list[dict]:
        # Group 1 is the function name, group 2 the raw argument string.
        return [
            {
                "name": found.group(1),
                "arguments": _parse_tool_arguments(found.group(2)),
            }
            for found in re.finditer(pattern, text, re.DOTALL)
        ]

    # Tier 1: standard format delimited by special tokens.
    # Some Gemma4 models close the call with <turn|> instead of <tool_call|>.
    standard_pattern = r"<\|tool_call\>call:(\w+)\{(.*?)\}(?:<tool_call\|>|<turn\|>)"
    calls = _collect(standard_pattern)
    if calls or strict:
        return calls

    # Tier 2: known variations — <call>name{args}, call:name{args}, or a
    # bare call:name{args}<eos>.
    fallback_pattern = r"(?:<call>|(?:^|\s)call:)(\w+)\{(.*?)\}"
    return _collect(fallback_pattern)