Skip to content

hf_qwen3

optimus_dl.modules.model.presets.hf_qwen3

Preset for loading Hugging Face Qwen3 models.

HFQwen3Config dataclass

Bases: Qwen3Config

HFQwen3Config(_name: str | None = None, block_size: int = 1024, vocab_size: int = 50304, n_layer: int = 12, n_head: int = 12, n_embd: int = 768, head_dim: int | None = None, dropout: float = 0.0, bias: bool = False, tie_word_embeddings: bool = True, shard_every_ith_layer: int = 1, padding_token_id: int | None = None, sequence_length: int = 32768, rmsnorm_eps: float = 1e-06, rope_theta: float = 1000000.0, rope_scaling: dict | None = None, attention_bias: bool = True, n_kv_head: int | None = None, intermediate_size: int | None = None, multiple_of: int = 256, sliding_window: int | None = None, use_liger_rmsnorm: bool | None = None, use_liger_swiglu: bool | None = None, hf_model_name: str = 'Qwen/Qwen3-4B-Thinking-2507', load_weights: bool = True)

Parameters:

Name Type Description Default
hf_model_name str Hugging Face Hub identifier of the Qwen3 checkpoint to load (passed to AutoConfig / AutoModelForCausalLM.from_pretrained).
'Qwen/Qwen3-4B-Thinking-2507'
load_weights bool If True, download the pretrained HF weights and copy them into the local model; if False, return the locally initialized model using only the HF-derived configuration.
True
Source code in optimus_dl/modules/model/presets/hf_qwen3.py
@dataclass
class HFQwen3Config(Qwen3Config):
    """Qwen3 config extended with Hugging Face checkpoint-loading options.

    Inherits all architecture hyperparameters from ``Qwen3Config``; the two
    fields below only control which HF checkpoint is fetched and whether its
    weights are copied into the local model.
    """

    # Hugging Face Hub identifier of the checkpoint to load.
    hf_model_name: str = "Qwen/Qwen3-4B-Thinking-2507"
    # If False, build the model from the HF-derived config but skip the
    # (slow) pretrained-weight download/copy.
    load_weights: bool = True

make_hf_qwen3_model(cfg, **_)

Create a Qwen3 model loaded with weights from Hugging Face.

Source code in optimus_dl/modules/model/presets/hf_qwen3.py
@register_model("preset_hfqwen3", HFQwen3Config)
def make_hf_qwen3_model(cfg: HFQwen3Config, **_):
    """Create a Qwen3 model loaded with weights from Hugging Face.

    Fetches the HF config for ``cfg.hf_model_name``, mirrors it into the
    local ``cfg``, builds the local ``Qwen3`` model, and (unless
    ``cfg.load_weights`` is False) downloads the pretrained checkpoint and
    copies its tensors into the local state dict via ``WeightMapper``.

    Args:
        cfg: Preset config; mutated in place with values from the HF config.
        **_: Ignored extra keyword arguments from the model registry.

    Returns:
        The local ``Qwen3`` model, with HF weights if ``cfg.load_weights``.
    """
    logger.info(f"Loading HF model: {cfg.hf_model_name}")

    # Load HF config and mirror it into the local config.
    hf_config = AutoConfig.from_pretrained(cfg.hf_model_name, trust_remote_code=True)
    update_config_from_hf(cfg, hf_config)

    # Qwen models often use attention_bias in config, but sometimes it is
    # implied by presence of biases.
    cfg.attention_bias = getattr(
        hf_config, "attention_bias", getattr(hf_config, "use_bias", False)
    )
    cfg.sliding_window = getattr(hf_config, "sliding_window", None)
    cfg.rmsnorm_eps = getattr(
        hf_config, "rms_norm_eps", getattr(hf_config, "layer_norm_epsilon", 1e-6)
    )

    # Initialize local Qwen3 model.
    model = Qwen3(cfg)

    if not cfg.load_weights:
        return model

    # Load HF model weights in fp32 on CPU.
    logger.info("Loading HF model weights...")
    hf_model = AutoModelForCausalLM.from_pretrained(
        cfg.hf_model_name,
        dtype=torch.float32,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    hf_sd = hf_model.state_dict()
    mapper = WeightMapper(hf_sd, model.state_dict())

    logger.info("Copying weights...")

    # Embeddings, per-layer tensors, final norm, LM head.
    mapper.copy("model.embed_tokens.weight", "transformer.wte.weight")
    for i in range(cfg.n_layer):
        _copy_layer_weights(mapper, cfg, i)
    mapper.copy("model.norm.weight", "transformer.ln_f.weight")
    mapper.copy("lm_head.weight", "lm_head.weight")

    # Validation (tied embeddings may legitimately leave lm_head unmapped).
    mapper.validate(tie_word_embeddings=cfg.tie_word_embeddings)

    # Drop the HF model promptly so peak memory stays ~1x model size.
    del hf_model, hf_sd
    import gc

    gc.collect()

    return model


def _copy_layer_weights(mapper, cfg: HFQwen3Config, i: int) -> None:
    """Copy all tensors of transformer layer ``i`` from HF to local naming."""
    hf = f"model.layers.{i}"
    loc = f"transformer.h.{i}"

    # Q/K projections and their per-head norms are copied with a head-wise
    # permutation (performed by WeightMapper) — presumably to match the local
    # RoPE layout; V/O need no permutation. NOTE(review): confirm against
    # WeightMapper.copy.
    q_kw = dict(permute=True, n_heads=cfg.n_head, head_dim=cfg.head_dim)
    kv_kw = dict(permute=True, n_heads=cfg.n_kv_head, head_dim=cfg.head_dim)

    # Attention projections.
    mapper.copy(f"{hf}.self_attn.q_proj.weight", f"{loc}.attn.wq.weight", **q_kw)
    mapper.copy(f"{hf}.self_attn.q_proj.bias", f"{loc}.attn.wq.bias", **q_kw)
    mapper.copy(f"{hf}.self_attn.k_proj.weight", f"{loc}.attn.wk.weight", **kv_kw)
    mapper.copy(f"{hf}.self_attn.k_proj.bias", f"{loc}.attn.wk.bias", **kv_kw)
    mapper.copy(f"{hf}.self_attn.v_proj.weight", f"{loc}.attn.wv.weight")
    mapper.copy(f"{hf}.self_attn.v_proj.bias", f"{loc}.attn.wv.bias")
    mapper.copy(f"{hf}.self_attn.o_proj.weight", f"{loc}.attn.wo.weight")
    mapper.copy(f"{hf}.self_attn.o_proj.bias", f"{loc}.attn.wo.bias")

    # Q/K norms (per-head, hence the same permutation as the projections).
    mapper.copy(f"{hf}.self_attn.q_norm.weight", f"{loc}.attn.q_norm.weight", **q_kw)
    mapper.copy(f"{hf}.self_attn.k_norm.weight", f"{loc}.attn.k_norm.weight", **kv_kw)

    # MLP: gate_proj -> w1, up_proj -> w2, down_proj -> c_proj.
    mapper.copy(f"{hf}.mlp.gate_proj.weight", f"{loc}.mlp.w1.weight")
    mapper.copy(f"{hf}.mlp.gate_proj.bias", f"{loc}.mlp.w1.bias")
    mapper.copy(f"{hf}.mlp.up_proj.weight", f"{loc}.mlp.w2.weight")
    mapper.copy(f"{hf}.mlp.up_proj.bias", f"{loc}.mlp.w2.bias")
    mapper.copy(f"{hf}.mlp.down_proj.weight", f"{loc}.mlp.c_proj.weight")
    mapper.copy(f"{hf}.mlp.down_proj.bias", f"{loc}.mlp.c_proj.bias")

    # Pre-attention / pre-MLP norms.
    mapper.copy(f"{hf}.input_layernorm.weight", f"{loc}.ln_1.weight")
    mapper.copy(f"{hf}.post_attention_layernorm.weight", f"{loc}.ln_2.weight")