gm.nn.Gemma3_27B

gm.nn.Gemma3_27B

class gemma.gm.nn.Gemma3_27B(
config: _config.TransformerConfig = TransformerConfig(num_embed=262144,
embed_dim=5376,
hidden_dim=21504,
num_heads=32,
head_dim=128,
num_kv_heads=16,
final_logit_softcap=None,
use_post_attn_norm=True,
use_post_ffw_norm=True,
attention_types=(<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.GLOBAL: 1>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.GLOBAL: 1>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.GLOBAL: 1>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.GLOBAL: 1>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.GLOBAL: 1>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.GLOBAL: 1>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.GLOBAL: 1>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.GLOBAL: 1>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.GLOBAL: 1>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.GLOBAL: 1>,
<AttentionType.LOCAL_SLIDING: 2>,
<AttentionType.LOCAL_SLIDING: 2>),
query_pre_attn_norm=<QueryPreAttentionNormalisation.BY_ONE_OVER_SQRT_EMBED_DIM_DIV_NUM_HEADS: 3>,
attn_logits_soft_cap=None,
sliding_window_size=1024,
transpose_gating_einsum=True,
use_qk_norm=True,
local_base_frequency=10000,
global_base_frequency=1000000,
local_scale_factor=1.0,
global_scale_factor=8.0,
vision_encoder=SigLiPFromPatches(     # attributes     siglip_encoder = ViTModel(         # attributes         patch_size = (14, 14)         width = 1152         depth = 27         mlp_dim = 4304         num_heads = 16         posemb = 'learn'         dropout = 0.0         scan = False         remat_policy = 'nothing_saveable'         dtype_mm = 'float32'     )     siglip_exit = VisionExit(         # attributes         output_length = 256     )     num_mm_tokens_per_image_prepool = 4096     num_mm_tokens_per_image = 256     image_height = 896     image_width = 896     image_channels = 3     apply_stop_gradient = True )),
text_only: bool = False,
parent: flax.linen.module.Module | flax.core.scope.Scope | flax.linen.module._Sentinel | None = <flax.linen.module._Sentinel object>,
name: str | None = None,
*,
return_last_only: bool | None = None,
dtype: jnp.dtype = <class 'jax.numpy.bfloat16'>,
tokens: kontext.Key = '__KEY_REQUIRED__',
images: kontext.Key | None = None,
positions: kontext.Key | None = None,
attention_mask: kontext.Key | None = None,
)[source]

Bases: gemma.gm.nn._gemma._Gemma3Base

Gemma3 transformer architecture.

config: gemma.gm.nn._config.TransformerConfig = TransformerConfig(num_embed=262144, embed_dim=5376, hidden_dim=21504, num_heads=32, head_dim=128, num_kv_heads=16, final_logit_softcap=None, use_post_attn_norm=True, use_post_ffw_norm=True, attention_types=(<AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.GLOBAL: 1>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.GLOBAL: 1>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.GLOBAL: 1>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.GLOBAL: 1>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.GLOBAL: 1>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.GLOBAL: 1>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.GLOBAL: 1>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.GLOBAL: 1>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.GLOBAL: 1>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.GLOBAL: 1>, <AttentionType.LOCAL_SLIDING: 2>, <AttentionType.LOCAL_SLIDING: 2>), query_pre_attn_norm=<QueryPreAttentionNormalisation.BY_ONE_OVER_SQRT_EMBED_DIM_DIV_NUM_HEADS: 3>, attn_logits_soft_cap=None, sliding_window_size=1024, transpose_gating_einsum=True, use_qk_norm=True, local_base_frequency=10000, global_base_frequency=1000000, local_scale_factor=1.0, global_scale_factor=8.0, vision_encoder=SigLiPFromPatches(     # attributes     siglip_encoder = ViTModel(         # attributes         patch_size = (14, 14)         width = 1152         depth = 27         mlp_dim = 4304         num_heads = 16         posemb = 'learn'         dropout = 0.0         scan = False         remat_policy = 'nothing_saveable'         dtype_mm = 'float32'     )     siglip_exit = VisionExit(         # attributes         output_length = 256     )     num_mm_tokens_per_image_prepool = 4096     num_mm_tokens_per_image = 256     image_height = 896     image_width = 896     image_channels = 3     apply_stop_gradient = True ))
INFO: ClassVar[gemma.gm.nn._transformer.ModelInfo] = ModelInfo(tokenizer_version=3, default_ckpt=None)
name: str | None = None
parent: flax.linen.module.Module | flax.core.scope.Scope | flax.linen.module._Sentinel | None = None
scope: flax.core.scope.Scope | None = None