char

`optimus_dl.modules.tokenizer.implementations.char` ¶

`CharTokenizer` ¶

Bases: BaseTokenizer

Simple byte-level UTF-8 tokenizer.

Converts text to raw UTF-8 bytes and adds optional BOS/EOS tokens. Detokenization skips the special token IDs and decodes the remainder as UTF-8.

Parameters:

Name	Type	Description	Default
`config`	`CharTokenizerConfig`	Character tokenizer configuration.	required

Source code in optimus_dl/modules/tokenizer/implementations/char.py

@register_tokenizer("char_tokenize", CharTokenizerConfig)
class CharTokenizer(BaseTokenizer):
    """Simple byte-level UTF-8 tokenizer.

    Every UTF-8 byte of the input text becomes one token ID in the range
    0-255. ``encode`` optionally wraps the byte IDs with the configured
    BOS/EOS IDs; ``decode`` drops every ID outside the byte range and
    decodes the remaining bytes as UTF-8.

    Args:
        config: Character tokenizer configuration.
        **kwargs: Accepted but ignored; they are not forwarded to the
            base class.
    """

    def __init__(self, config: CharTokenizerConfig, **kwargs):
        super().__init__(config)

    def encode(self, text: str) -> list[int]:
        """Convert text to UTF-8 byte IDs and add the requested special tokens.

        Args:
            text: String to tokenize.

        Returns:
            The UTF-8 bytes of ``text`` as integers, preceded by the BOS ID
            when ``config.add_bos`` is set and followed by the EOS ID when
            ``config.add_eos`` is set.

        Raises:
            ValueError: If a special token is requested but its ID is None.
        """
        input_ids = list(text.encode("utf-8"))

        if self.config.add_bos:
            if self.bos_token_id is None:
                raise ValueError(
                    "Tokenizer does not have a BOS token ID, but add_bos is True."
                )
            input_ids.insert(0, self.bos_token_id)
        if self.config.add_eos:
            if self.eos_token_id is None:
                raise ValueError(
                    "Tokenizer does not have an EOS token ID, but add_eos is True."
                )
            input_ids.append(self.eos_token_id)
        return input_ids

    def decode(self, ids: list[int]) -> str:
        """Decode token IDs back to text.

        Only IDs in the byte range 0-255 are kept. Anything else is dropped,
        which is how special tokens are removed as long as their IDs lie
        outside that range (the config defaults do).

        Args:
            ids: Token IDs to decode.

        Returns:
            The kept bytes decoded as UTF-8. Invalid byte sequences are
            replaced with U+FFFD rather than raising.
        """
        byte_values = [token_id for token_id in ids if 0 <= token_id < 256]
        return bytes(byte_values).decode("utf-8", errors="replace")

    @property
    def vocab_size(self) -> int:
        """Vocabulary size including BOS/EOS tokens.

        Returns the larger of ``config.vocab_size`` and one past the highest
        configured special token ID, so that both BOS and EOS are always
        valid indices regardless of which of the two is larger.
        """
        special_ids = (self.config.bos_token_id, self.config.eos_token_id)
        # default=0 keeps the result at max(config.vocab_size, 1) when
        # neither special token is configured.
        highest_special = max(
            (token_id for token_id in special_ids if token_id is not None),
            default=0,
        )
        return max(self.config.vocab_size, highest_special + 1)

    @property
    def bos_token_id(self):
        """BOS token ID from config."""
        return self.config.bos_token_id

    @property
    def eos_token_id(self):
        """EOS token ID from config."""
        return self.config.eos_token_id

`bos_token_id` `property` ¶

BOS token ID from config.

`eos_token_id` `property` ¶

EOS token ID from config.

`vocab_size` `property` ¶

Vocabulary size including BOS/EOS tokens.

`decode(ids)` ¶

Drop IDs outside the byte range 0–255 (such as the default BOS/EOS IDs) and decode the remaining bytes as UTF-8 text.

Source code in optimus_dl/modules/tokenizer/implementations/char.py

def decode(self, ids: list[int]) -> str:
    """Turn token IDs back into text.

    Only IDs in the byte range 0-255 are kept; every other ID is dropped.
    The kept bytes are decoded as UTF-8, with invalid sequences replaced
    instead of raising.
    """
    kept = [token for token in ids if 0 <= token < 256]
    return bytes(kept).decode("utf-8", errors="replace")

`encode(text)` ¶

Convert text to UTF-8 bytes and add special tokens.

Source code in optimus_dl/modules/tokenizer/implementations/char.py

def encode(self, text: str) -> list[int]:
    """Encode text as UTF-8 byte IDs, wrapped with BOS/EOS when configured.

    The BOS ID is placed first when ``config.add_bos`` is set and the EOS ID
    last when ``config.add_eos`` is set. A ValueError is raised if a special
    token is requested but its ID is None.
    """
    token_ids = list(text.encode("utf-8"))
    cfg = self.config

    if cfg.add_bos:
        bos = self.bos_token_id
        if bos is None:
            raise ValueError(
                "Tokenizer does not have a BOS token ID, but add_bos is True."
            )
        token_ids = [bos, *token_ids]

    if cfg.add_eos:
        eos = self.eos_token_id
        if eos is None:
            raise ValueError(
                "Tokenizer does not have an EOS token ID, but add_eos is True."
            )
        token_ids = [*token_ids, eos]

    return token_ids

`CharTokenizerConfig` `dataclass` ¶

Bases: BaseTokenizerConfig

Configuration for character/byte-level tokenizer.

Attributes:

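Name	Type	Description
`vocab_size`	`int`	Number of unique byte values (usually 256).
`bos_token_id`	`int`	ID for the Beginning-of-Sequence token.
`eos_token_id`	`int`	ID for the End-of-Sequence token.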

Parameters:

Name	Type	Default
`vocab_size`	`int`	`256`
`bos_token_id`	`int`	`256`
`eos_token_id`	`int`	`257`

Source code in optimus_dl/modules/tokenizer/implementations/char.py

@dataclass
class CharTokenizerConfig(BaseTokenizerConfig):
    """Configuration for character/byte-level tokenizer.

    The defaults reserve IDs 0-255 for raw byte values and place the
    BOS/EOS IDs directly after that range.

    Attributes:
        vocab_size: Number of unique byte values (usually 256). With the
            defaults this count does not include the BOS/EOS IDs.
        bos_token_id: ID for the Beginning-of-Sequence token.
        eos_token_id: ID for the End-of-Sequence token.
    """

    # One ID per byte value 0-255; the default special-token IDs below lie
    # outside this range.
    vocab_size: int = 256
    bos_token_id: int = 256  # first ID after the byte range
    eos_token_id: int = 257  # second ID after the byte range

char

optimus_dl.modules.tokenizer.implementations.char ¶

CharTokenizer ¶

bos_token_id property ¶

eos_token_id property ¶

vocab_size property ¶

decode(ids) ¶

encode(text) ¶

CharTokenizerConfig dataclass ¶

`optimus_dl.modules.tokenizer.implementations.char` ¶

`CharTokenizer` ¶

`bos_token_id` `property` ¶

`eos_token_id` `property` ¶

`vocab_size` `property` ¶

`decode(ids)` ¶

`encode(text)` ¶

`CharTokenizerConfig` `dataclass` ¶