- Deep Learning
  - Transformers
    - Attention is All You Need
    - Axial Attention in Multidimensional Transformers
    - Longformer: The Long-Document Transformer
    - Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context
    - BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
    - Fast Transformers with Clustered Attention
    - The elephant in the interpretability room: Why use attention as explanation when we have saliency methods?
- Deep Learning + Biology
  - MSA Transformer
  - Highly accurate protein structure prediction with AlphaFold
- Biology
Deep Learning
Transformers
Attention is All You Need
@article{vaswani2017attention,
doi = {10.48550/ARXIV.1706.03762},
url = {https://arxiv.org/abs/1706.03762},
author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and
Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and
Kaiser, Lukasz and Polosukhin, Illia},
keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
title = {Attention Is All You Need},
publisher = {arXiv},
year = 2017,
copyright = {arXiv.org perpetual, non-exclusive license}
}
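The core operation is scaled dot-product attention: Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V. A minimal NumPy sketch of that formula (the helper names and toy shapes are mine, not from the paper):

```python
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)  # subtract max for numerical stability
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def scaled_dot_product_attention(Q, K, V):
    """Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V."""
    d_k = Q.shape[-1]
    scores = Q @ K.swapaxes(-1, -2) / np.sqrt(d_k)  # (n_queries, n_keys)
    return softmax(scores, axis=-1) @ V             # (n_queries, d_v)

# Toy usage: 4 query positions attending over 6 key/value positions.
rng = np.random.default_rng(0)
Q, K, V = rng.normal(size=(4, 8)), rng.normal(size=(6, 8)), rng.normal(size=(6, 8))
print(scaled_dot_product_attention(Q, K, V).shape)  # (4, 8)
```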
Axial Attention in Multidimensional Transformers
@article{ho2019axial,
doi = {10.48550/ARXIV.1912.12180},
url = {https://arxiv.org/abs/1912.12180},
author = {Ho, Jonathan and Kalchbrenner, Nal and Weissenborn, Dirk
and Salimans, Tim},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
title = {Axial Attention in Multidimensional Transformers},
publisher = {arXiv},
year = 2019,
copyright = {arXiv.org perpetual, non-exclusive license}
}
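Axial attention factorizes attention over a multidimensional input into a sequence of attentions, each along a single axis, so a 2D grid costs O(HW(H+W)) instead of O((HW)^2). A rough sketch reusing scaled_dot_product_attention from the note above (the per-axis Python loops are for clarity, not speed):

```python
import numpy as np

def axial_attention_2d(X):
    """Self-attention along each axis of an (H, W, d) tensor in turn.
    Reuses scaled_dot_product_attention from the previous sketch."""
    # Row attention: each of the H rows attends over its W positions.
    X = np.stack([scaled_dot_product_attention(r, r, r) for r in X])
    # Column attention: each of the W columns attends over its H positions.
    X = np.stack([scaled_dot_product_attention(c, c, c)
                  for c in X.transpose(1, 0, 2)]).transpose(1, 0, 2)
    return X

X = np.random.default_rng(0).normal(size=(3, 5, 8))  # toy 3x5 grid, d=8
print(axial_attention_2d(X).shape)  # (3, 5, 8)
```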
Longformer: The Long-Document Transformer
@article{beltagy2020longformer,
doi = {10.48550/ARXIV.2004.05150},
url = {https://arxiv.org/abs/2004.05150},
author = {Beltagy, Iz and Peters, Matthew E. and Cohan, Arman},
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences},
title = {Longformer: The Long-Document Transformer},
publisher = {arXiv},
year = 2020,
copyright = {arXiv.org perpetual, non-exclusive license}
}
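Longformer swaps full self-attention for a sliding window around each token plus a handful of task-specific global tokens (e.g. [CLS]), making attention cost linear in sequence length. A sketch of the resulting attention pattern (the helper name and parameters are mine):

```python
import numpy as np

def sliding_window_mask(n, window, global_idx=()):
    """Boolean mask, True where attention is allowed: each token sees
    +/- window neighbours; tokens in global_idx attend everywhere
    and are attended to by every token."""
    i = np.arange(n)
    mask = np.abs(i[:, None] - i[None, :]) <= window
    for g in global_idx:
        mask[g, :] = True
        mask[:, g] = True
    return mask

# 8 tokens, window of 1, token 0 global (a [CLS]-like position).
print(sliding_window_mask(8, 1, global_idx=(0,)).astype(int))
```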
Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context
@article{dai2019transformerxl,
doi = {10.48550/ARXIV.1901.02860},
url = {https://arxiv.org/abs/1901.02860},
author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and
Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan},
keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences},
title = {Transformer-XL: Attentive Language Models Beyond a
Fixed-Length Context},
publisher = {arXiv},
year = 2019,
copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0
International}
}
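Transformer-XL extends the usable context past a fixed segment length by caching the hidden states of the previous segment and letting the current segment attend over the cache as extra keys/values. A much-simplified sketch reusing scaled_dot_product_attention from above; the real model caches per layer, stops gradients through the cache, and uses relative positional encodings, all omitted here:

```python
import numpy as np

def segment_with_memory(x_t, memory):
    """One recurrence step: only the current segment produces queries,
    but keys/values span the cached previous segment plus the current one."""
    kv = x_t if memory is None else np.concatenate([memory, x_t], axis=0)
    out = scaled_dot_product_attention(x_t, kv, kv)
    return out, x_t  # current hidden states become the next segment's memory

# Stream three segments of 4 tokens each through the recurrence.
rng = np.random.default_rng(0)
mem = None
for _ in range(3):
    out, mem = segment_with_memory(rng.normal(size=(4, 8)), mem)
print(out.shape)  # (4, 8)
```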
BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
@inproceedings{devlin-etal-2019-bert,
title = "{BERT}: Pre-training of Deep Bidirectional Transformers for
Language Understanding",
author = "Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and
Toutanova, Kristina",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long and Short Papers)",
month = jun,
year = 2019,
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N19-1423",
doi = "10.18653/v1/N19-1423",
pages = "4171--4186",
abstract = "We introduce a new language representation model called
BERT, which stands for Bidirectional Encoder Representations
from Transformers. Unlike recent language representation
models (Peters et al., 2018a; Radford et al., 2018), BERT is
designed to pre-train deep bidirectional representations from
unlabeled text by jointly conditioning on both left and right
context in all layers. As a result, the pre-trained BERT model
can be fine-tuned with just one additional output layer to
create state-of-the-art models for a wide range of tasks, such
as question answering and language inference, without
substantial task-specific architecture modifications. BERT is
conceptually simple and empirically powerful. It obtains new
state-of-the-art results on eleven natural language processing
tasks, including pushing the GLUE score to 80.5 (7.7 point
absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%}
absolute improvement), SQuAD v1.1 question answering Test F1
to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test
F1 to 83.1 (5.1 point absolute improvement).",
}
A masked language model (MLM) randomly masks some of the tokens in the input; the training objective is to predict the original masked tokens from their surrounding context alone.
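In BERT, 15% of token positions are chosen as prediction targets; of those, 80% are replaced by [MASK], 10% by a random token, and 10% left unchanged. A minimal sketch of that masking step (tokenization and the vocabulary are toy stand-ins):

```python
import random

def mask_tokens(tokens, vocab, mask_prob=0.15, mask_token="[MASK]"):
    """BERT-style MLM masking: pick mask_prob of positions as targets,
    then apply the 80/10/10 rule to each picked position."""
    masked, targets = list(tokens), {}
    for i, tok in enumerate(tokens):
        if random.random() < mask_prob:
            targets[i] = tok  # the model must recover this original token
            r = random.random()
            if r < 0.8:
                masked[i] = mask_token            # 80%: replace with [MASK]
            elif r < 0.9:
                masked[i] = random.choice(vocab)  # 10%: random token
            # else 10%: leave the token unchanged
    return masked, targets

toks = "the quick brown fox jumps over the lazy dog".split()
print(mask_tokens(toks, vocab=sorted(set(toks))))
```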
Fast Transformers with Clustered Attention
@article{vyas2020clustered,
doi = {10.48550/ARXIV.2007.04825},
url = {https://arxiv.org/abs/2007.04825},
author = {Vyas, Apoorv and Katharopoulos, Angelos and Fleuret,
François},
keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
title = {Fast Transformers with Clustered Attention},
publisher = {arXiv},
year = 2020,
copyright = {arXiv.org perpetual, non-exclusive license}
}
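Clustered attention approximates full attention by grouping the queries into clusters, computing attention once per centroid, and sharing that output across each cluster's members; the paper also describes an improved variant that re-attends over the top keys per cluster. A rough sketch with plain k-means (the real method clusters far more cheaply, via hashed queries):

```python
import numpy as np

def clustered_attention(Q, K, V, n_clusters=4, iters=5):
    """Approximate attention: k-means the queries, attend once per
    centroid, broadcast each centroid's output to its cluster.
    Reuses scaled_dot_product_attention from the first sketch."""
    rng = np.random.default_rng(0)
    centroids = Q[rng.choice(len(Q), n_clusters, replace=False)]
    for _ in range(iters):  # plain k-means on the query vectors
        assign = np.argmin(((Q[:, None] - centroids[None]) ** 2).sum(-1), axis=1)
        for c in range(n_clusters):
            if (assign == c).any():
                centroids[c] = Q[assign == c].mean(axis=0)
    # n_clusters attention computations instead of len(Q).
    out_c = scaled_dot_product_attention(centroids, K, V)
    return out_c[assign]

rng = np.random.default_rng(1)
Q, K, V = rng.normal(size=(32, 8)), rng.normal(size=(16, 8)), rng.normal(size=(16, 8))
print(clustered_attention(Q, K, V).shape)  # (32, 8)
```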
The elephant in the interpretability room: Why use attention as explanation when we have saliency methods?
@inproceedings{bastings-filippova-2020-elephant,
title = "The elephant in the interpretability room: Why use
attention as explanation when we have saliency methods?",
author = "Bastings, Jasmijn and Filippova, Katja",
booktitle = "Proceedings of the Third BlackboxNLP Workshop on Analyzing
and Interpreting Neural Networks for NLP",
month = nov,
year = 2020,
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.blackboxnlp-1.14",
doi = "10.18653/v1/2020.blackboxnlp-1.14",
pages = "149--155",
abstract = "There is a recent surge of interest in using attention as
explanation of model predictions, with mixed evidence on
whether attention can be used as such. While attention
conveniently gives us one weight per input token and is easily
extracted, it is often unclear toward what goal it is used as
explanation. We find that often that goal, whether explicitly
stated or not, is to find out what input tokens are the most
relevant to a prediction, and that the implied user for the
explanation is a model developer. For this goal and user, we
argue that input saliency methods are better suited, and that
there are no compelling reasons to use attention, despite the
coincidence that it provides a weight for each input. With
this position paper, we hope to shift some of the recent focus
on attention to saliency methods, and for authors to clearly
state the goal and user for their explanations.",
}
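Among the saliency methods the paper favours over attention weights are occlusion/erasure approaches: score each input token by how much the prediction changes when that token is removed. A sketch (the predict function is an arbitrary stand-in for a trained model's scoring head):

```python
import numpy as np

def occlusion_saliency(predict, embeddings):
    """Erasure-based input saliency: the saliency of token i is the drop
    in the model's score when token i's embedding is zeroed out.
    `predict` maps an (n_tokens, d) matrix to a scalar score."""
    base = predict(embeddings)
    scores = np.empty(len(embeddings))
    for i in range(len(embeddings)):
        occluded = embeddings.copy()
        occluded[i] = 0.0
        scores[i] = base - predict(occluded)
    return scores

# Toy linear "model" over bag-of-embeddings, 5 tokens of dimension 8.
rng = np.random.default_rng(0)
w, emb = rng.normal(size=8), rng.normal(size=(5, 8))
print(occlusion_saliency(lambda e: float(e.sum(axis=0) @ w), emb))
```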
Deep Learning + Biology
MSA Transformer
@article{rao2021msa,
author = {Rao, Roshan and Liu, Jason and Verkuil, Robert and Meier,
Joshua and Canny, John F. and Abbeel, Pieter and Sercu, Tom
and Rives, Alexander},
title = {MSA Transformer},
elocation-id = {2021.02.12.430858},
year = 2021,
doi = {10.1101/2021.02.12.430858},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Unsupervised protein language models trained across
millions of diverse sequences learn structure and function of
proteins. Protein language models studied to date have been
trained to perform inference from individual sequences. The
longstanding approach in computational biology has been to
make inferences from a family of evolutionarily related
sequences by fitting a model to each family independently. In
this work we combine the two paradigms. We introduce a protein
language model which takes as input a set of sequences in the
form of a multiple sequence alignment. The model interleaves
row and column attention across the input sequences and is
trained with a variant of the masked language modeling
objective across many protein families. The performance of the
model surpasses current state-of-the-art unsupervised
structure learning methods by a wide margin, with far greater
parameter efficiency than prior state-of-the-art protein
language models.},
url = {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858},
eprint = {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858.full.pdf},
journal = {bioRxiv}
}
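The interleaved row and column attention from the abstract, sketched over an (n_sequences, seq_len, d) embedding of the alignment; the real model additionally ties the row-attention maps across sequences and stacks many such blocks:

```python
import numpy as np

def msa_row_column_attention(X):
    """One untied row/column block, reusing scaled_dot_product_attention
    from the first sketch. X has shape (n_seqs, seq_len, d)."""
    # Row attention: each aligned sequence attends over its positions.
    X = np.stack([scaled_dot_product_attention(s, s, s) for s in X])
    # Column attention: each alignment column attends across the sequences.
    X = np.stack([scaled_dot_product_attention(c, c, c)
                  for c in X.transpose(1, 0, 2)]).transpose(1, 0, 2)
    return X

msa = np.random.default_rng(0).normal(size=(6, 10, 8))  # 6 sequences, length 10
print(msa_row_column_attention(msa).shape)  # (6, 10, 8)
```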
Highly accurate protein structure prediction with AlphaFold
@article{Jumper2021,
author = {Jumper, John and Evans, Richard and Pritzel, Alexander and
Green, Tim and Figurnov, Michael and Ronneberger, Olaf and
Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'i}dek,
Augustin and Potapenko, Anna and Bridgland, Alex and Meyer,
Clemens and Kohl, Simon A. A. and Ballard, Andrew J. and
Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov,
Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor
and Petersen, Stig and Reiman, David and Clancy, Ellen and
Zielinski, Michal and Steinegger, Martin and Pacholska,
Michalina and Berghammer, Tamas and Bodenstein, Sebastian and
Silver, David and Vinyals, Oriol and Senior, Andrew W. and
Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis},
title = {Highly accurate protein structure prediction with
AlphaFold},
journal = {Nature},
year = 2021,
month = {Aug},
day = 01,
volume = 596,
number = 7873,
pages = {583--589},
abstract = {Proteins are essential to life, and understanding their
structure can facilitate a mechanistic understanding of their
function. Through an enormous experimental effort [1--4], the
structures of around 100,000 unique proteins have been determined
[5], but this represents a small fraction of the billions of known
protein sequences [6,7]. Structural coverage is bottlenecked by the
months to years of painstaking effort required to determine a
single protein structure. Accurate computational approaches are
needed to address this gap and to enable large-scale structural
bioinformatics. Predicting the three-dimensional structure that a
protein will adopt based solely on its amino acid sequence---the
structure prediction component of the `protein folding problem'
[8]---has been an important open research problem for more than 50
years [9]. Despite recent progress [10--14], existing methods fall
far short of atomic accuracy, especially when no homologous
structure is available. Here we provide the first computational
method that can regularly predict protein structures with atomic
accuracy even in cases in which no similar structure is known. We
validated an entirely redesigned version of our neural
network-based model, AlphaFold, in the challenging 14th Critical
Assessment of protein Structure Prediction (CASP14) [15],
demonstrating accuracy competitive with experimental structures in
a majority of cases and greatly outperforming other methods.
Underpinning the latest version of AlphaFold is a novel machine
learning approach that incorporates physical and biological
knowledge about protein structure, leveraging multi-sequence
alignments, into the design of the deep learning algorithm.},
issn = {1476-4687},
doi = {10.1038/s41586-021-03819-2},
url = {https://doi.org/10.1038/s41586-021-03819-2}
}