@article{https://doi.org/10.48550/arxiv.1706.03762,
  doi = {10.48550/ARXIV.1706.03762},
  url = {https://arxiv.org/abs/1706.03762},
  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  title = {Attention Is All You Need},
  publisher = {arXiv},
  year = {2017},
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@article{https://doi.org/10.48550/arxiv.1912.12180,
  doi = {10.48550/ARXIV.1912.12180},
  url = {https://arxiv.org/abs/1912.12180},
  author = {Ho, Jonathan and Kalchbrenner, Nal and Weissenborn, Dirk and Salimans, Tim},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  title = {Axial Attention in Multidimensional Transformers},
  publisher = {arXiv},
  year = {2019},
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@article{https://doi.org/10.48550/arxiv.2004.05150,
  doi = {10.48550/ARXIV.2004.05150},
  url = {https://arxiv.org/abs/2004.05150},
  author = {Beltagy, Iz and Peters, Matthew E. and Cohan, Arman},
  keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  title = {Longformer: The Long-Document Transformer},
  publisher = {arXiv},
  year = {2020},
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@article{https://doi.org/10.48550/arxiv.1901.02860,
  doi = {10.48550/ARXIV.1901.02860},
  url = {https://arxiv.org/abs/1901.02860},
  author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan},
  keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences},
  title = {Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context},
  publisher = {arXiv},
  year = {2019},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}
}

@inproceedings{devlin-etal-2019-bert,
  title = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  month = jun,
  year = {2019},
  address = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/N19-1423},
  doi = {10.18653/v1/N19-1423},
  pages = {4171--4186},
  abstract = {We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5 (7.7 point absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%} absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).}
}

@article{https://doi.org/10.48550/arxiv.2007.04825,
  doi = {10.48550/ARXIV.2007.04825},
  url = {https://arxiv.org/abs/2007.04825},
  author = {Vyas, Apoorv and Katharopoulos, Angelos and Fleuret, François},
  keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  title = {Fast Transformers with Clustered Attention},
  publisher = {arXiv},
  year = {2020},
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@inproceedings{bastings-filippova-2020-elephant,
  title = {The elephant in the interpretability room: Why use attention as explanation when we have saliency methods?},
  author = {Bastings, Jasmijn and Filippova, Katja},
  booktitle = {Proceedings of the Third BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP},
  month = nov,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.blackboxnlp-1.14},
  doi = {10.18653/v1/2020.blackboxnlp-1.14},
  pages = {149--155},
  abstract = {There is a recent surge of interest in using attention as explanation of model predictions, with mixed evidence on whether attention can be used as such. While attention conveniently gives us one weight per input token and is easily extracted, it is often unclear toward what goal it is used as explanation. We find that often that goal, whether explicitly stated or not, is to find out what input tokens are the most relevant to a prediction, and that the implied user for the explanation is a model developer. For this goal and user, we argue that input saliency methods are better suited, and that there are no compelling reasons to use attention, despite the coincidence that it provides a weight for each input. With this position paper, we hope to shift some of the recent focus on attention to saliency methods, and for authors to clearly state the goal and user for their explanations.}
}

@article{Rao2021.02.12.430858,
  author = {Rao, Roshan and Liu, Jason and Verkuil, Robert and Meier, Joshua and Canny, John F. and Abbeel, Pieter and Sercu, Tom and Rives, Alexander},
  title = {MSA Transformer},
  elocation-id = {2021.02.12.430858},
  year = {2021},
  doi = {10.1101/2021.02.12.430858},
  publisher = {Cold Spring Harbor Laboratory},
  journal = {bioRxiv},
  url = {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858},
  eprint = {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858.full.pdf},
  abstract = {Unsupervised protein language models trained across millions of diverse sequences learn structure and function of proteins. Protein language models studied to date have been trained to perform inference from individual sequences. The longstanding approach in computational biology has been to make inferences from a family of evolutionarily related sequences by fitting a model to each family independently. In this work we combine the two paradigms. We introduce a protein language model which takes as input a set of sequences in the form of a multiple sequence alignment. The model interleaves row and column attention across the input sequences and is trained with a variant of the masked language modeling objective across many protein families. The performance of the model surpasses current state-of-the-art unsupervised structure learning methods by a wide margin, with far greater parameter efficiency than prior state-of-the-art protein language models.}
}

@article{Jumper2021,
  author = {Jumper, John and Evans, Richard and Pritzel, Alexander and Green, Tim and Figurnov, Michael and Ronneberger, Olaf and Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'i}dek, Augustin and Potapenko, Anna and Bridgland, Alex and Meyer, Clemens and Kohl, Simon A. A. and Ballard, Andrew J. and Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov, Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor and Petersen, Stig and Reiman, David and Clancy, Ellen and Zielinski, Michal and Steinegger, Martin and Pacholska, Michalina and Berghammer, Tamas and Bodenstein, Sebastian and Silver, David and Vinyals, Oriol and Senior, Andrew W. and Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis},
  title = {Highly accurate protein structure prediction with AlphaFold},
  journal = {Nature},
  year = {2021},
  month = aug,
  day = {01},
  volume = {596},
  number = {7873},
  pages = {583--589},
  issn = {1476-4687},
  doi = {10.1038/s41586-021-03819-2},
  url = {https://doi.org/10.1038/s41586-021-03819-2},
  abstract = {Proteins are essential to life, and understanding their structure can facilitate a mechanistic understanding of their function. Through an enormous experimental effort, the structures of around 100,000 unique proteins have been determined, but this represents a small fraction of the billions of known protein sequences. Structural coverage is bottlenecked by the months to years of painstaking effort required to determine a single protein structure. Accurate computational approaches are needed to address this gap and to enable large-scale structural bioinformatics. Predicting the three-dimensional structure that a protein will adopt based solely on its amino acid sequence---the structure prediction component of the `protein folding problem'---has been an important open research problem for more than 50 years. Despite recent progress, existing methods fall far short of atomic accuracy, especially when no homologous structure is available. Here we provide the first computational method that can regularly predict protein structures with atomic accuracy even in cases in which no similar structure is known. We validated an entirely redesigned version of our neural network-based model, AlphaFold, in the challenging 14th Critical Assessment of protein Structure Prediction (CASP14), demonstrating accuracy competitive with experimental structures in a majority of cases and greatly outperforming other methods. Underpinning the latest version of AlphaFold is a novel machine learning approach that incorporates physical and biological knowledge about protein structure, leveraging multi-sequence alignments, into the design of the deep learning algorithm.}
}