[
{
"name": "T-DMCA",
"arxiv_id": "1801.10198",
"code": "https://github.com/lucidrains/memory-compressed-attention",
"comment": "compresses key and value + blocked attention",
"complexity": "{b}*\\frac{N}{b}*\\frac{N}{{b}*{k}}*{D}",
"causal": true
},
{
"name": "CBAM",
"arxiv_id": "1807.06521",
"code": "https://github.com/Jongchan/attention-module",
"comment": "combines the SE attention with a per pixel(local) weight",
"complexity": "({N}*{D}+\\frac{{D}^2}{r})+({N}*{D}*{k}^2)",
"causal": false
},
{
"name": "Set Transformer",
"arxiv_id": "1810.00825",
"code": "https://github.com/juho-lee/set_transformer",
"comment": "uses K relay nodes",
"complexity": "{N}*{K}*{D}",
"causal": false
},
{
"name": "Criss Cross Attention",
"arxiv_id": "1811.11721",
"code": "https://github.com/speedinghzl/CCNet",
"comment": "each pixel attends to its row and column simultaneously",
"complexity": "{N}*({H}+{W})*{D}",
"causal": false
},
{
"name": "Efficient Attention",
"arxiv_id": "1812.01243",
"code": "https://github.com/cmsflash/efficient-attention",
"comment": "Softmax(Q)*(Softmax(K^T)*V)",
"complexity": "{N}*{D}^2",
"causal": false
},
{
"name": "StarTransformer",
"arxiv_id": "1902.09113",
"code": "https://github.com/fastnlp/fastNLP/blob/master/fastNLP/modules/encoder/star_transformer.py",
"comment": "uses a relay(global) node and attends to/from that node",
"complexity": "{N}*{D}",
"causal": false
},
{
"name": "Sparse Transformer",
"arxiv_id": "1904.10509",
"code": "https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/ops/sparse_attention/sparse_self_attention.py",
"comment": "sparse block based attention",
"complexity": "{N}*\\sqrt{N}*{D}",
"causal": true
},
{
"name": "GCNet",
"arxiv_id": "1904.11492",
"code": "https://github.com/xvjiarui/GCNet",
"comment": "squeeze and excitation with an attention pooling (instead of a GAP)",
"complexity": "{N}*{D}^2",
"causal": false
},
{
"name": "SCRAM",
"arxiv_id": "1905.10308",
"comment": "uses PatchMatch to find close keys",
"complexity": "{N}*\\log({N})*{D}",
"causal": true
},
{
"name": "Permutohedral Attention",
"arxiv_id": "1907.00641",
"code": "https://github.com/SamuelJoutard/Permutohedral_attention_module",
"comment": "uses permutohedral lattice approximation algorithm to approximate the attention output",
"complexity": "{N}*{D}^2",
"causal": false
},
{
"name": "Product Key Attention",
"arxiv_id": "1907.05242",
"code": "https://github.com/facebookresearch/XLM",
"comment": "search for nearest neighbor keys",
"complexity": "{Q}*({K}+{k}^2)*{D}",
"causal": true
},
{
"name": "Interlaced Attention",
"arxiv_id": "1907.12273",
"code": "IN_PAPER",
"comment": "combination of a short length and then long range(dilated) attention",
"complexity": "{N}*{D}^2+{N}*\\sqrt{N}*{D}",
"causal": true
},
{
"name": "EMA",
"arxiv_id": "1907.13426",
"code": "https://github.com/XiaLiPKU/EMANet",
"comment": "applys expectation maximization to cluster keys into k clusters",
"complexity": "{N}*{k}*{D}",
"causal": false
},
{
"name": "BP-Transformer",
"arxiv_id": "1911.04070",
"code": "https://github.com/yzh119/BPT",
"comment": "attends to distant tokens coarsely and attends to close tokens in a more fine-grained manner",
"complexity": "{N}*{k}*\\log(\\frac{N}{k})*{D}",
"causal": true
},
{
"name": "Compressive Transformer",
"arxiv_id": "1911.05507",
"code": "https://github.com/lucidrains/compressive-transformer-pytorch",
"comment": "compresses distant tokens instead of just stop_grad() ing them, more efficient version of transformerXL",
"complexity": "{N}^2*{D}",
"causal": true
},
{
"name": "Axial Attention",
"arxiv_id": "1912.12180",
"code": "https://github.com/lucidrains/axial-attention",
"comment": "apply attention on each axis separately",
"complexity": "{N}*({H}+{W})*{D}",
"causal": true
},
{
"name": "Reformer",
"arxiv_id": "2001.04451",
"code": "https://github.com/google/trax/tree/master/trax/models/reformer",
"comment": "uses LSH to find close keys",
"complexity": "{N}*\\log({N})*{D}^2",
"causal": true
},
{
"name": "TaLK",
"arxiv_id": "2002.03184",
"code": "https://github.com/lioutasb/TaLKConvolutions",
"comment": "calculate mean over a dynamic subsequence around each token with the help of summed-area table",
"complexity": "{N}*{D}",
"causal": true
},
{
"name": "Transformer on a Diet",
"arxiv_id": "2002.06170",
"code": "https://github.com/cgraywang/transformer-on-diet",
"comment": "dilated transformer like wavenet",
"complexity": "{N}*{k}*{D}",
"causal": true
},
{
"name": "Sinkhorn Attention",
"arxiv_id": "2002.11296",
"code": "https://github.com/lucidrains/sinkhorn-transformer",
"comment": "uses a cost matrix to limit attention between buckets",
"complexity": "\\frac{{N}^2}{n_b}+{n_b}^2",
"causal": true
},
{
"name": "Routing Transformer",
"arxiv_id": "2003.05997",
"code": "https://github.com/lucidrains/routing-transformer",
"comment": "computes attention with same-cluster tokens (computed by online k-means)",
"complexity": "{N}*\\sqrt{N}*{D}",
"causal": true
},
{
"name": "SAC",
"arxiv_id": "2003.09833",
"comment": "learns the q, k connections == dynamically creates a sparse attention matrix",
"complexity": "{N}*{k}*{D}",
"causal": true
},
{
"name": "AutoNL",
"arxiv_id": "2004.01961",
"code": "https://github.com/LiYingwei/AutoNL",
"comment": "computes Q(KV) and also down samples q, k, v both in spatial and channel dimensions",
"complexity": "(\\frac{H}{h}*\\frac{W}{w})*(\\frac{D}{k})^2",
"causal": false
},
{
"name": "Longformer",
"arxiv_id": "2004.05150",
"code": "https://github.com/allenai/longformer",
"comment": "global + blocked attention",
"complexity": "{N}*({k}+{g})*{D}",
"causal": true
},
{
"name": "ETC",
"arxiv_id": "2004.08483",
"comment": "combines global attention (star transformer with multiple global tokens) with local attention",
"complexity": "({N}*{g}+{g}^2+{N}*{k})*{D}",
"causal": false
},
{
"name": "Jukebox",
"arxiv_id": "2005.00341",
"code": "https://github.com/openai/jukebox",
"comment": "better attention patterns from Sparse Transformer",
"complexity": "{N}*\\sqrt{N}*{D}",
"causal": true
},
{
"name": "MultiScale Transformer",
"arxiv_id": "2005.00581",
"code": "IN_PAPER",
"comment": "UNet like + retina attetion is something close to BP-Transformer",
"complexity": "{N}^2*{D}",
"causal": true
},
{
"name": "Synthesizer",
"arxiv_id": "2005.00743",
"code": "https://github.com/leaderj1001/Synthesizer-Rethinking-Self-Attention-Transformer-Models",
"comment": "does not compute pairwise interactions",
"complexity": "{N}^2*{D}",
"causal": true
},
{
"name": "GMAT",
"arxiv_id": "2006.03274",
"code": "https://github.com/ag1988/gmat",
"comment": "adds global tokens",
"complexity": "{m}*({N}+{m})*{D}",
"causal": false
},
{
"name": "Performer(FAVOR)",
"arxiv_id": "2006.03555",
"code": "https://github.com/google-research/google-research/tree/master/performer/fast_attention",
"comment": "calculate an unbiased stochastic approximation of the attention matrix",
"complexity": "{N}*{D}^2*\\log({D})",
"causal": true
},
{
"name": "Linformer",
"arxiv_id": "2006.04768",
"code": "https://github.com/tatp22/linformer-pytorch",
"comment": "project key and value from n*d to k*d",
"complexity": "{N}*{k}*{D}",
"causal": false
},
{
"name": "Hand-Crafted Attention",
"arxiv_id": "2006.05174",
"comment": "does not compute pairwise interactions and uses fixed mask patters",
"complexity": "{N}^2*{D}",
"causal": true
},
{
"name": "Linear Transformer",
"arxiv_id": "2006.16236",
"code": "https://github.com/idiap/fast-transformers",
"comment": "uses phi(q)(phi(k)v) and also improves the sequential sampling step",
"complexity": "{N}*{D}^2",
"causal": true
},
{
"name": "FANet",
"arxiv_id": "2007.03815",
"comment": "l2_norm(q)*(l2_norm(k)*v)",
"complexity": "{N}*{D}^2",
"causal": false
},
{
"name": "Clustered Attention",
"arxiv_id": "2007.04825",
"code": "https://github.com/idiap/fast-transformers",
"comment": "groups queries together with LSH",
"complexity": "{N}*{k}*{D}",
"causal": false
},
{
"name": "Kronecker Attention",
"arxiv_id": "2007.08442",
"code": "https://github.com/lucidrains/kronecker-attention-pytorch",
"comment": "uses horizontal and lateral average matrices",
"complexity": "({H}+{W})^2*{D}",
"causal": false
},
{
"name": "BigBird",
"arxiv_id": "2007.14062",
"code": "https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/ops/sparse_attention/sparse_self_attention.py",
"comment": "ETC with random connections",
"complexity": "({g}^2+{N}*({k}+{g}+{r}))*{D}",
"causal": false
},
{
"name": "TGM+TRM",
"arxiv_id": "2008.00490",
"comment": "decompose the full attention tensor into rank one tensors (CP decomposition)",
"complexity": "({D}*{H}*{W}+{D}^2+{H}^2+{W}^2)*{r}",
"causal": false
},
{
"name": "FracTAL",
"arxiv_id": "2009.02062",
"code": "IN_PAPER",
"comment": "uses the fractal tanimoto similarity to compare queries with keys inside the attention module",
"complexity": "{H}*{W}*{D}",
"causal": false
},
{
"name": "Performer(FAVOR+)",
"arxiv_id": "2009.14794",
"code": "https://github.com/google-research/google-research/tree/master/performer",
"comment": "unbiased approximation of the attention matrix with softmax kernel",
"complexity": "{N}*{m}*{D}",
"causal": true
},
{
"name": "SMYRF",
"arxiv_id": "2010.05315",
"code": "https://github.com/giannisdaras/smyrf",
"comment": "LSH with balanced clusters",
"complexity": "{N}*\\log({N})*{D}",
"causal": false
},
{
"name": "Memformer",
"arxiv_id": "2010.06891",
"code": "https://github.com/lucidrains/memformer",
"comment": "attend to memory slots + Memory-Replay BackPropagation",
"complexity": "{N}*{D}",
"causal": true
},
{
"name": "Informer",
"arxiv_id": "2012.07436",
"code": "https://github.com/zhouhaoyi/Informer2020",
"comment": "sparse attention + funnel like encoder",
"complexity": "{N}*\\log({N})*{D}",
"causal": true
},
{
"name": "SLiM",
"arxiv_id": "2012.11346",
"code": "https://github.com/google-research/google-research/tree/master/performer/models/slim_performer",
"comment": "Performer but with sublinear Memory usage",
"complexity": "{N}*{m}*{D}",
"causal": true
},
{
"name": "Nystromformer",
"arxiv_id": "2102.03902",
"code": "https://github.com/mlpen/Nystromformer",
"comment": "uses Nystrom method to approximate the attention matrix",
"complexity": "{N}*{D}",
"causal": false
},
{
"name": "LambdaNetworks",
"arxiv_id": "2102.08602",
"code": "https://github.com/lucidrains/lambda-networks",
"comment": "generates a linear layer based on context + decouple pos/context",
"complexity": "{N}^2*{k}*\\frac{v}{h}",
"causal": true
},
{
"name": "SecretlyFastWeight",
"arxiv_id": "2102.11174",
"code": "https://github.com/ischlag/fast-weight-transformers",
"comment": "show that linear transformers are basically fast weight networks + propose a new kernel function to linearise attention, balancing simplicity and effectiveness",
"complexity": "{N}*{m}*{D}",
"causal": true
},
{
"name": "RFA",
"arxiv_id": "2103.02143",
"comment": "kernel approximation and also transformers are rnn",
"complexity": "{N}*{D}",
"causal": true
}
]
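Several of the {N}*{D}^2 entries above ("Efficient Attention", "Linear Transformer", "FANet") share the same factorization trick: normalize queries and keys separately so the key-value product can be computed first and the N x N attention matrix is never materialized. Below is a minimal NumPy sketch of the Efficient Attention variant, Softmax(Q)*(Softmax(K^T)*V); the function names and test shapes are illustrative and not taken from any of the linked repositories.

import numpy as np

def softmax(x, axis):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def quadratic_attention(Q, K, V):
    # Standard attention: softmax(Q K^T) V builds an N x N matrix -> O(N^2 * D).
    return softmax(Q @ K.T, axis=-1) @ V

def efficient_attention(Q, K, V):
    # Factorized form: softmax over features for Q, softmax over positions for K,
    # then (K^T V) is a D x D matrix -> O(N * D^2), no N x N matrix is formed.
    return softmax(Q, axis=-1) @ (softmax(K, axis=0).T @ V)

# Illustrative shape check with random inputs.
N, D = 1024, 64
rng = np.random.default_rng(0)
Q, K, V = rng.standard_normal((3, N, D))
print(efficient_attention(Q, K, V).shape)  # (1024, 64)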