base

`optimus_dl.modules.tokenizer.base` ¶

`BaseTokenizer` ¶

Bases: ABC

Abstract base class for all tokenizers.

Defines the standard interface for encoding strings to token IDs and decoding IDs back to text.

Attributes:

Name	Type	Description
`config`	`BaseTokenizerConfig`	Configuration object for the tokenizer.

Source code in optimus_dl/modules/tokenizer/base.py

class BaseTokenizer(ABC):
    """Common interface that every tokenizer implementation derives from.

    Subclasses provide the text-to-IDs and IDs-to-text conversions together
    with the vocabulary size. The special-token properties, configuration
    saving, and chat templating come with default implementations here.

    Attributes:
        config: Configuration object for the tokenizer, stored exactly as it
            was passed to the constructor.
    """

    def __init__(self, config: BaseTokenizerConfig):
        # Kept on the instance; ``save_pretrained`` serializes this object.
        self.config = config

    @abstractmethod
    def encode(self, text: str) -> list[int]:
        """Turn ``text`` into its sequence of integer token IDs."""

    @abstractmethod
    def decode(self, ids: list[int]) -> str:
        """Turn a sequence of token IDs back into a text string."""

    @property
    @abstractmethod
    def vocab_size(self) -> int:
        """Number of entries in the tokenizer's vocabulary."""

    @property
    def bos_token_id(self) -> int | None:
        """Beginning-of-Sequence token ID; this base version returns ``None``."""
        return None

    @property
    def eos_token_id(self) -> int | None:
        """End-of-Sequence token ID; this base version returns ``None``."""
        return None

    def save_pretrained(self, save_directory: str):
        """Write the tokenizer configuration into ``save_directory``.

        The directory, including any missing parents, is created when it does
        not exist yet. ``self.config`` is then serialized with ``yaml.dump``
        into a file called ``tokenizer_config.json`` inside it.

        Args:
            save_directory: Path of the directory that receives the file.
        """
        target_dir = pathlib.Path(save_directory)
        target_dir.mkdir(exist_ok=True, parents=True)
        config_file = target_dir / "tokenizer_config.json"
        # NOTE: the payload is YAML-serialized despite the ``.json`` file name.
        with config_file.open("w") as stream:
            yaml.dump(self.config, stream)

    def apply_chat_template(
        self,
        conversation: list[dict[str, str]],
        tokenize: bool = True,
        add_generation_prompt: bool = True,
    ) -> str | list[int]:
        """Render a conversation history through a chat template.

        This base implementation does no formatting and always raises.

        Args:
            conversation: Messages to format, e.g.
                ``[{"role": "user", "content": "..."}]``.
            tokenize: ``True`` to get token IDs, ``False`` for the raw string.
            add_generation_prompt: Whether the assistant's response prefix is
                appended.

        Returns:
            The formatted string or the list of token IDs.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        message = "apply_chat_template not implemented for this tokenizer"
        raise NotImplementedError(message)

`bos_token_id` `property` ¶

ID of the Beginning-of-Sequence token, if any.

`eos_token_id` `property` ¶

ID of the End-of-Sequence token, if any.

`vocab_size` `abstractmethod` `property` ¶

Total size of the tokenizer's vocabulary.

`apply_chat_template(conversation, tokenize=True, add_generation_prompt=True)` ¶

Apply a chat template (e.g., Llama-2-chat) to a conversation history.

Parameters:

Name	Type	Description	Default
`conversation`	`list[dict[str, str]]`	List of messages (e.g., [{"role": "user", "content": "..."}]).	required
`tokenize`	`bool`	Whether to return token IDs (True) or the raw string (False).	`True`
`add_generation_prompt`	`bool`	Whether to append the assistant's response prefix.	`True`

Returns:

Type	Description
`str \| list[int]`	Formatted string or list of token IDs.

Source code in optimus_dl/modules/tokenizer/base.py

def apply_chat_template(
    self,
    conversation: list[dict[str, str]],
    tokenize: bool = True,
    add_generation_prompt: bool = True,
) -> str | list[int]:
    """Apply a chat template (e.g., Llama-2-chat) to a conversation history.

    Args:
        conversation: List of messages (e.g., [{"role": "user", "content": "..."}]).
        tokenize: Whether to return token IDs (True) or the raw string (False).
        add_generation_prompt: Whether to append the assistant's response prefix.

    Returns:
        Formatted string or list of token IDs.
    """
    raise NotImplementedError(
        "apply_chat_template not implemented for this tokenizer"
    )

`decode(ids)` `abstractmethod` ¶

Convert a list of token IDs back into a text string.

Source code in optimus_dl/modules/tokenizer/base.py

@abstractmethod
def decode(self, ids: list[int]) -> str:
    """Turn a sequence of token IDs back into a text string.

    Abstract: the body is intentionally empty here.
    """

`encode(text)` `abstractmethod` ¶

Convert a text string into a list of integer token IDs.

Source code in optimus_dl/modules/tokenizer/base.py

@abstractmethod
def encode(self, text: str) -> list[int]:
    """Turn ``text`` into its sequence of integer token IDs.

    Abstract: the body is intentionally empty here.
    """

`save_pretrained(save_directory)` ¶

Save tokenizer configuration to a directory.

Source code in optimus_dl/modules/tokenizer/base.py

def save_pretrained(self, save_directory: str):
    """Write the tokenizer configuration into ``save_directory``.

    The directory, including any missing parents, is created when it does not
    exist yet. ``self.config`` is then serialized with ``yaml.dump`` into a
    file called ``tokenizer_config.json`` inside it.

    Args:
        save_directory: Path of the directory that receives the file.
    """
    target_dir = pathlib.Path(save_directory)
    target_dir.mkdir(exist_ok=True, parents=True)
    config_file = target_dir / "tokenizer_config.json"
    # NOTE: the payload is YAML-serialized despite the ``.json`` file name.
    with config_file.open("w") as stream:
        yaml.dump(self.config, stream)

base

optimus_dl.modules.tokenizer.base ¶

BaseTokenizer ¶

bos_token_id property ¶

eos_token_id property ¶

vocab_size abstractmethod property ¶

apply_chat_template(conversation, tokenize=True, add_generation_prompt=True) ¶

decode(ids) abstractmethod ¶

encode(text) abstractmethod ¶

save_pretrained(save_directory) ¶

`optimus_dl.modules.tokenizer.base` ¶

`BaseTokenizer` ¶

`bos_token_id` `property` ¶

`eos_token_id` `property` ¶

`vocab_size` `abstractmethod` `property` ¶

`apply_chat_template(conversation, tokenize=True, add_generation_prompt=True)` ¶

`decode(ids)` `abstractmethod` ¶

`encode(text)` `abstractmethod` ¶

`save_pretrained(save_directory)` ¶