Source code for akida.layers.attention

from akida.core import (Layer, LayerParams, LayerType)


class Attention(Layer):
    """Multi-head attention layer.

    From A. Vaswani et al., "Attention is All You Need" (arXiv:1706.03762):
    "Self-attention, sometimes called intra-attention is an attention
    mechanism relating different positions of a single sequence in order to
    compute a representation of the sequence."

    This layer takes three inputs, Query, Key and Value, and performs these
    actions on each head:

    * Multiply Query and Key to obtain a vector of attention scores expressing
      how tokens/patches relate to one another.
    * Divide by a scale factor.
    * Convert the scores to a probability mask using a Softmax function
      (replaced by a Shiftmax in our implementation).
    * Multiply the mask by the Values.

    Note that outputs and masks will be saturated on the range that can be
    represented with output_bits.

    Args:
        num_heads (int): number of heads.
        output_bits (int, optional): output bitwidth. Defaults to 8.
        buffer_bits (int, optional): internal bitwidth. Defaults to 32.
        post_op_buffer_bits (int, optional): internal bitwidth for post
            operations. Defaults to 32.
        shiftmax_output_bits (int, optional): output bitwidth for shiftmax,
            must be no more than 1/2 of buffer_bits. Defaults to 10.
        name (str, optional): name of the layer. Defaults to empty string.

    """

    def __init__(self,
                 num_heads,
                 output_bits=8,
                 buffer_bits=32,
                 post_op_buffer_bits=32,
                 shiftmax_output_bits=10,
                 name=""):
        try:
            params = LayerParams(
                LayerType.Attention, {
                    "num_heads": num_heads,
                    "output_bits": output_bits,
                    "buffer_bits": buffer_bits,
                    "post_op_buffer_bits": post_op_buffer_bits,
                    "shiftmax_output_bits": shiftmax_output_bits
                })

            # Call parent constructor to initialize C++ bindings
            # Note that we invoke directly __init__ instead of using super, as
            # specified in pybind documentation
            Layer.__init__(self, params, name)
        except BaseException:
            self = None
            raise
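

# Illustrative sketch (not part of the akida API): a floating-point NumPy
# analogue of the per-head computation described in the docstring above.
# The actual layer operates on quantized integers, replaces Softmax with
# Shiftmax and saturates its outputs to output_bits; the names below
# (reference_head_attention, query, key, value) are hypothetical and for
# illustration only.
def reference_head_attention(query, key, value):
    """Float reference of one head: softmax(query @ key.T / sqrt(d)) @ value."""
    import numpy as np
    d = query.shape[-1]
    # Attention scores relating tokens/patches to one another, divided by a
    # scale factor.
    scores = query @ key.T / np.sqrt(d)
    # Convert the scores to a probability mask (Softmax here, Shiftmax in the
    # quantized implementation).
    mask = np.exp(scores - scores.max(axis=-1, keepdims=True))
    mask /= mask.sum(axis=-1, keepdims=True)
    # Multiply the mask by the Values.
    return mask @ value


# Example layer construction (an 8-head configuration chosen for illustration):
#     attention = Attention(num_heads=8)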