@article{https://doi.org/10.48550/arxiv.1706.03762,
  doi = {10.48550/ARXIV.1706.03762},
  url = {https://arxiv.org/abs/1706.03762},
  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  title = {Attention Is All You Need},
  publisher = {arXiv},
  year = {2017},
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@article{https://doi.org/10.48550/arxiv.1912.12180,
  doi = {10.48550/ARXIV.1912.12180},
  url = {https://arxiv.org/abs/1912.12180},
  author = {Ho, Jonathan and Kalchbrenner, Nal and Weissenborn, Dirk and Salimans, Tim},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  title = {Axial Attention in Multidimensional Transformers},
  publisher = {arXiv},
  year = {2019},
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@article{https://doi.org/10.48550/arxiv.2004.05150,
  doi = {10.48550/ARXIV.2004.05150},
  url = {https://arxiv.org/abs/2004.05150},
  author = {Beltagy, Iz and Peters, Matthew E. and Cohan, Arman},
  keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  title = {Longformer: The Long-Document Transformer},
  publisher = {arXiv},
  year = {2020},
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@article{https://doi.org/10.48550/arxiv.1901.02860,
  doi = {10.48550/ARXIV.1901.02860},
  url = {https://arxiv.org/abs/1901.02860},
  author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan},
  keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences},
  title = {Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context},
  publisher = {arXiv},
  year = {2019},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}
}

@inproceedings{devlin-etal-2019-bert,
  title = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  month = jun,
  year = {2019},
  address = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/N19-1423},
  doi = {10.18653/v1/N19-1423},
  pages = {4171--4186},
  abstract = {We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5 (7.7 point absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%} absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).}
}

@article{https://doi.org/10.48550/arxiv.2007.04825,
  doi = {10.48550/ARXIV.2007.04825},
  url = {https://arxiv.org/abs/2007.04825},
  author = {Vyas, Apoorv and Katharopoulos, Angelos and Fleuret, François},
  keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  title = {Fast Transformers with Clustered Attention},
  publisher = {arXiv},
  year = {2020},
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@inproceedings{bastings-filippova-2020-elephant,
  title = {The elephant in the interpretability room: Why use attention as explanation when we have saliency methods?},
  author = {Bastings, Jasmijn and Filippova, Katja},
  booktitle = {Proceedings of the Third BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP},
  month = nov,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.blackboxnlp-1.14},
  doi = {10.18653/v1/2020.blackboxnlp-1.14},
  pages = {149--155},
  abstract = {There is a recent surge of interest in using attention as explanation of model predictions, with mixed evidence on whether attention can be used as such. While attention conveniently gives us one weight per input token and is easily extracted, it is often unclear toward what goal it is used as explanation. We find that often that goal, whether explicitly stated or not, is to find out what input tokens are the most relevant to a prediction, and that the implied user for the explanation is a model developer. For this goal and user, we argue that input saliency methods are better suited, and that there are no compelling reasons to use attention, despite the coincidence that it provides a weight for each input. With this position paper, we hope to shift some of the recent focus on attention to saliency methods, and for authors to clearly state the goal and user for their explanations.}
}

@article{Rao2021.02.12.430858,
  author = {Rao, Roshan and Liu, Jason and Verkuil, Robert and Meier, Joshua and Canny, John F. and Abbeel, Pieter and Sercu, Tom and Rives, Alexander},
  title = {MSA Transformer},
  elocation-id = {2021.02.12.430858},
  year = {2021},
  doi = {10.1101/2021.02.12.430858},
  publisher = {Cold Spring Harbor Laboratory},
  journal = {bioRxiv},
  url = {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858},
  eprint = {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858.full.pdf},
  abstract = {Unsupervised protein language models trained across millions of diverse sequences learn structure and function of proteins. Protein language models studied to date have been trained to perform inference from individual sequences. The longstanding approach in computational biology has been to make inferences from a family of evolutionarily related sequences by fitting a model to each family independently. In this work we combine the two paradigms. We introduce a protein language model which takes as input a set of sequences in the form of a multiple sequence alignment. The model interleaves row and column attention across the input sequences and is trained with a variant of the masked language modeling objective across many protein families. The performance of the model surpasses current state-of-the-art unsupervised structure learning methods by a wide margin, with far greater parameter efficiency than prior state-of-the-art protein language models.}
}

@article{Jumper2021,
  author = {Jumper, John and Evans, Richard and Pritzel, Alexander and Green, Tim and Figurnov, Michael and Ronneberger, Olaf and Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'i}dek, Augustin and Potapenko, Anna and Bridgland, Alex and Meyer, Clemens and Kohl, Simon A. A. and Ballard, Andrew J. and Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov, Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor and Petersen, Stig and Reiman, David and Clancy, Ellen and Zielinski, Michal and Steinegger, Martin and Pacholska, Michalina and Berghammer, Tamas and Bodenstein, Sebastian and Silver, David and Vinyals, Oriol and Senior, Andrew W. and Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis},
  title = {Highly accurate protein structure prediction with AlphaFold},
  journal = {Nature},
  year = {2021},
  month = aug,
  day = {01},
  volume = {596},
  number = {7873},
  pages = {583--589},
  issn = {1476-4687},
  doi = {10.1038/s41586-021-03819-2},
  url = {https://doi.org/10.1038/s41586-021-03819-2},
  abstract = {Proteins are essential to life, and understanding their structure can facilitate a mechanistic understanding of their function. Through an enormous experimental effort, the structures of around 100,000 unique proteins have been determined, but this represents a small fraction of the billions of known protein sequences. Structural coverage is bottlenecked by the months to years of painstaking effort required to determine a single protein structure. Accurate computational approaches are needed to address this gap and to enable large-scale structural bioinformatics. Predicting the three-dimensional structure that a protein will adopt based solely on its amino acid sequence---the structure prediction component of the `protein folding problem'---has been an important open research problem for more than 50 years. Despite recent progress, existing methods fall far short of atomic accuracy, especially when no homologous structure is available. Here we provide the first computational method that can regularly predict protein structures with atomic accuracy even in cases in which no similar structure is known. We validated an entirely redesigned version of our neural network-based model, AlphaFold, in the challenging 14th Critical Assessment of protein Structure Prediction (CASP14), demonstrating accuracy competitive with experimental structures in a majority of cases and greatly outperforming other methods. Underpinning the latest version of AlphaFold is a novel machine learning approach that incorporates physical and biological knowledge about protein structure, leveraging multi-sequence alignments, into the design of the deep learning algorithm.}
}