gm.nn.Attention

gm.nn.Attention

class gemma.gm.nn.Attention(
num_heads: int,
num_kv_heads: int,
features: int,
head_dim: int,
attn_type: gemma.gm.nn._modules.AttentionType,
query_pre_attn_scalar: float,
rope_base_frequency: int = 10000,
rope_scale_factor: float = 1.0,
attn_logits_soft_cap: float | None = None,
sliding_window_size: int | None = None,
use_qk_norm: bool = False,
parent: flax.linen.module.Module | flax.core.scope.Scope | flax.linen.module._Sentinel | None = <flax.linen.module._Sentinel object>,
name: str | None = None,
)[source]

Bases: flax.linen.module.Module

Attention module.

num_heads: int
num_kv_heads: int
features: int
head_dim: int
attn_type: gemma.gm.nn._modules.AttentionType
query_pre_attn_scalar: float
rope_base_frequency: int = 10000
rope_scale_factor: float = 1.0
attn_logits_soft_cap: float | None = None
sliding_window_size: int | None = None
use_qk_norm: bool = False
property use_qkv_einsum
property use_gqa
setup()[source]

Initializes a Module lazily (similar to a lazy __init__).

setup is called once lazily on a module instance when a module is bound, immediately before any other methods like __call__ are invoked, or before a setup-defined attribute on self is accessed.

This can happen in three cases:

  1. Immediately when invoking apply(), init() or init_and_output().

  2. Once the module is given a name by being assigned to an attribute of another module inside the other module’s setup method (see __setattr__()):

    >>> class MyModule(nn.Module):
    ...   def setup(self):
    ...     submodule = nn.Conv(...)
    
    ...     # Accessing `submodule` attributes does not yet work here.
    
    ...     # The following line invokes `self.__setattr__`, which gives
    ...     # `submodule` the name "conv1".
    ...     self.conv1 = submodule
    
    ...     # Accessing `submodule` attributes or methods is now safe and
    ...     # causes setup() to be called once.
    
  3. Once a module is constructed inside a method wrapped with compact(), immediately before another method is called or a setup-defined attribute is accessed.

classmethod init_cache(
cache_size: int,
num_heads: int,
head_dim: int,
batch_size: int,
dtype: numpy.dtype = <class 'jax.numpy.bfloat16'>,
) → dict[str, jax.Array][source]
name: str | None = None
parent: flax.linen.module.Module | flax.core.scope.Scope | flax.linen.module._Sentinel | None = None
scope: flax.core.scope.Scope | None = None