Inference-only GLM-4-0414 model compatible with HuggingFace weights.
ALL_DECODER_LAYER_TYPES module-attribute ¶
 ALL_DECODER_LAYER_TYPES = {'attention': Glm4DecoderLayer}
 
Glm4Attention ¶

Bases: Module
Source code in vllm/model_executor/models/glm4.py
attn instance-attribute ¶
 attn = Attention(
    num_heads,
    head_dim,
    scaling,
    num_kv_heads=num_kv_heads,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.attn",
    attn_type=attn_type,
)
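The head counts passed to Attention are per tensor-parallel rank, not model totals. A minimal sketch of how such per-rank values and the softmax scale are commonly derived; the names and sizes below are assumptions for illustration, not this module's actual code.

# Illustrative sketch: per-rank head counts and attention scale.
# All concrete values here are assumed for the example.
total_num_heads = 32           # attention heads across all ranks
total_num_kv_heads = 2         # grouped-query attention KV heads
head_dim = 128
tp_size = 4                    # assumed tensor-parallel world size

assert total_num_heads % tp_size == 0
num_heads = total_num_heads // tp_size              # heads on this rank
num_kv_heads = max(1, total_num_kv_heads // tp_size)
scaling = head_dim ** -0.5                          # 1/sqrt(d) softmax scale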
o_proj instance-attribute ¶
 o_proj = RowParallelLinear(
    total_num_heads * head_dim,
    hidden_size,
    bias=False,
    quant_config=quant_config,
    prefix=f"{prefix}.o_proj",
)
qkv_proj instance-attribute ¶
 qkv_proj = QKVParallelLinear(
    hidden_size,
    head_dim,
    total_num_heads,
    total_num_kv_heads,
    bias=qkv_bias,
    quant_config=quant_config,
    prefix=f"{prefix}.qkv_proj",
)
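qkv_proj emits one fused tensor that the attention forward splits back into query, key, and value slices. A hedged sketch of that split with assumed per-rank sizes:

import torch

# Sketch only: q_size = num_heads * head_dim and kv_size = num_kv_heads * head_dim
# on the local rank; the sizes below are illustrative.
num_heads, num_kv_heads, head_dim = 8, 1, 128
q_size, kv_size = num_heads * head_dim, num_kv_heads * head_dim

qkv = torch.randn(4, q_size + 2 * kv_size)          # [num_tokens, fused_dim]
q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1)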
rotary_emb instance-attribute ¶
 rotary_emb = get_rope(
    head_dim,
    rotary_dim=rotary_dim,
    max_position=max_position,
    base=rope_theta,
    rope_scaling=rope_scaling,
    partial_rotary_factor=partial_rotary_factor,
    is_neox_style=False,
)
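is_neox_style=False selects the interleaved (GPT-J-style) rotation, and partial_rotary_factor restricts rotary embedding to a fraction of each head dimension. A brief illustrative sketch; the values are assumptions, and the commented call follows vLLM's usual rotary_emb(positions, q, k) usage:

# Illustrative only: with a partial rotary factor, only the first
# rotary_dim dimensions of each head are rotated; the rest pass through.
head_dim = 128
partial_rotary_factor = 0.5                           # assumed value
rotary_dim = int(head_dim * partial_rotary_factor)    # 64 of 128 dims rotated

# Applied inside the attention forward as:
#   q, k = rotary_emb(positions, q, k)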
 
 __init__(
    config: Glm4Config,
    hidden_size: int,
    num_heads: int,
    num_kv_heads: int,
    max_position: int = 4096 * 32,
    head_dim: int | None = None,
    qkv_bias: bool = False,
    rope_theta: float = 10000,
    cache_config: CacheConfig | None = None,
    quant_config: QuantizationConfig | None = None,
    rope_scaling: tuple | None = None,
    prefix: str = "",
    attn_type: str = DECODER,
) -> None
Source code in vllm/model_executor/models/glm4.py
  
Glm4DecoderLayer ¶

Bases: Module
Source code in vllm/model_executor/models/glm4.py
mlp instance-attribute ¶
 mlp = LlamaMLP(
    hidden_size=hidden_size,
    intermediate_size=intermediate_size,
    hidden_act=hidden_act,
    quant_config=quant_config,
    prefix=f"{prefix}.mlp",
)
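GLM-4 reuses LlamaMLP, a gated feed-forward block; with hidden_act="silu" it computes down_proj(silu(gate_proj(x)) * up_proj(x)). A self-contained functional sketch with illustrative sizes, using plain nn.Linear in place of the tensor-parallel layers:

import torch
import torch.nn.functional as F
from torch import nn

# Sketch of the gated (SwiGLU-style) MLP; sizes are illustrative.
hidden_size, intermediate_size = 4096, 13696
gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

x = torch.randn(4, hidden_size)
out = down_proj(F.silu(gate_proj(x)) * up_proj(x))   # [4, hidden_size]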
post_attention_layernorm instance-attribute ¶
 post_attention_layernorm = RMSNorm(
    hidden_size, eps=rms_norm_eps
)
post_self_attn_layernorm instance-attribute ¶
 post_self_attn_layernorm = RMSNorm(
    hidden_size, eps=rms_norm_eps
)
self_attn instance-attribute ¶
 self_attn = Glm4Attention(
    config=config,
    hidden_size=hidden_size,
    num_heads=num_attention_heads,
    max_position=max_position_embeddings,
    num_kv_heads=num_key_value_heads,
    rope_theta=rope_theta,
    qkv_bias=getattr(config, "attention_bias", False),
    head_dim=getattr(config, "head_dim", None),
    cache_config=cache_config,
    quant_config=quant_config,
    rope_scaling=rope_scaling,
    prefix=f"{prefix}.self_attn",
    attn_type=DECODER,
)
 
 __init__(
    vllm_config: VllmConfig,
    prefix: str = "",
    config: Glm4Config | None = None,
) -> None
Source code in vllm/model_executor/models/glm4.py
  
 forward(
    positions: Tensor,
    hidden_states: Tensor,
    residual: Tensor | None,
) -> tuple[Tensor, Tensor]
Source code in vllm/model_executor/models/glm4.py
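Together with an input layernorm and a post-MLP layernorm (assumed counterparts not listed on this page), the extra post_self_attn_layernorm gives GLM-4 its sandwich-style normalization. A rough, non-authoritative sketch of the ordering these attributes imply:

import torch
from torch import nn

def layer_forward_sketch(x, self_attn, mlp, input_ln, post_self_attn_ln,
                         post_attention_ln, post_mlp_ln):
    # Sketch only: norm -> attention -> post-attn norm -> residual add,
    # then norm -> MLP -> post-MLP norm -> residual add.
    residual = x
    x = residual + post_self_attn_ln(self_attn(input_ln(x)))
    residual = x
    return residual + post_mlp_ln(mlp(post_attention_ln(x)))

# Smoke test with identity stand-ins for the real submodules.
x = torch.randn(2, 8)
out = layer_forward_sketch(x, nn.Identity(), nn.Identity(), nn.Identity(),
                           nn.Identity(), nn.Identity(), nn.Identity())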
  
Glm4ForCausalLM ¶

Bases: Module, SupportsLoRA, SupportsPP
Source code in vllm/model_executor/models/glm4.py
model instance-attribute ¶
 model = Glm4Model(
    vllm_config=vllm_config,
    prefix=maybe_prefix(prefix, "model"),
)
packed_modules_mapping class-attribute instance-attribute ¶
 packed_modules_mapping = {
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
}
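packed_modules_mapping tells the weight loader and LoRA machinery that separate checkpoint projections are fused into a single vLLM module. A purely illustrative sketch of remapping HuggingFace parameter names onto the fused names; the helper below is hypothetical, not vLLM's loader:

packed_modules_mapping = {
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
}

def fused_name(checkpoint_name: str) -> str:
    # Map e.g. '...self_attn.q_proj.weight' to '...self_attn.qkv_proj.weight'.
    for fused, parts in packed_modules_mapping.items():
        for part in parts:
            if f".{part}." in checkpoint_name:
                return checkpoint_name.replace(f".{part}.", f".{fused}.")
    return checkpoint_name

print(fused_name("model.layers.0.self_attn.k_proj.weight"))
# model.layers.0.self_attn.qkv_proj.weight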
 
 __init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/glm4.py
  
    
 forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: Tensor | None = None,
) -> Tensor | IntermediateTensors
Source code in vllm/model_executor/models/glm4.py
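The model is normally driven through vLLM's entry points rather than called directly. A usage sketch with the offline LLM API; the model id is an example checkpoint from the GLM-4-0414 family and should be replaced with the one you actually serve:

from vllm import LLM, SamplingParams

# Example model id (assumed); any GLM-4-0414 checkpoint works the same way.
llm = LLM(model="zai-org/GLM-4-9B-0414")
params = SamplingParams(temperature=0.7, max_tokens=128)

outputs = llm.generate(["Explain rotary position embeddings briefly."], params)
print(outputs[0].outputs[0].text)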
  
    
Glm4Model ¶

Bases: LlamaModel