@article{https://doi.org/10.48550/arxiv.1706.03762,
  doi = {10.48550/ARXIV.1706.03762},
  url = {https://arxiv.org/abs/1706.03762},
  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and
            Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and
            Kaiser, Lukasz and Polosukhin, Illia},
  keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG),
              FOS: Computer and information sciences},
  title = {Attention Is All You Need},
  publisher = {arXiv},
  year = 2017,
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@article{https://doi.org/10.48550/arxiv.1912.12180,
  doi = {10.48550/ARXIV.1912.12180},
  url = {https://arxiv.org/abs/1912.12180},
  author = {Ho, Jonathan and Kalchbrenner, Nal and Weissenborn, Dirk and
            Salimans, Tim},
  keywords = {Computer Vision and Pattern Recognition (cs.CV),
              FOS: Computer and information sciences},
  title = {Axial Attention in Multidimensional Transformers},
  publisher = {arXiv},
  year = 2019,
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@article{https://doi.org/10.48550/arxiv.2004.05150,
  doi = {10.48550/ARXIV.2004.05150},
  url = {https://arxiv.org/abs/2004.05150},
  author = {Beltagy, Iz and Peters, Matthew E. and Cohan, Arman},
  keywords = {Computation and Language (cs.CL),
              FOS: Computer and information sciences},
  title = {Longformer: The Long-Document Transformer},
  publisher = {arXiv},
  year = 2020,
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@article{https://doi.org/10.48550/arxiv.1901.02860,
  doi = {10.48550/ARXIV.1901.02860},
  url = {https://arxiv.org/abs/1901.02860},
  author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and
            Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan},
  keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL),
              Machine Learning (stat.ML),
              FOS: Computer and information sciences},
  title = {Transformer-XL: Attentive Language Models Beyond a
           Fixed-Length Context},
  publisher = {arXiv},
  year = 2019,
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0
               International}
}

@inproceedings{devlin-etal-2019-bert,
  title = {{BERT}: Pre-training of Deep Bidirectional Transformers for
           Language Understanding},
  author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and
            Toutanova, Kristina},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican
               Chapter of the Association for Computational Linguistics:
               Human Language Technologies, Volume 1 (Long and Short Papers)},
  month = jun,
  year = 2019,
  address = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/N19-1423},
  doi = {10.18653/v1/N19-1423},
  pages = {4171--4186},
  abstract = {We introduce a new language representation model called
              BERT, which stands for Bidirectional Encoder Representations
              from Transformers. Unlike recent language representation
              models (Peters et al., 2018a; Radford et al., 2018), BERT is
              designed to pre-train deep bidirectional representations from
              unlabeled text by jointly conditioning on both left and right
              context in all layers. As a result, the pre-trained BERT model
              can be fine-tuned with just one additional output layer to
              create state-of-the-art models for a wide range of tasks, such
              as question answering and language inference, without
              substantial task-specific architecture modifications. BERT is
              conceptually simple and empirically powerful. It obtains new
              state-of-the-art results on eleven natural language processing
              tasks, including pushing the GLUE score to 80.5 (7.7 point
              absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%}
              absolute improvement), SQuAD v1.1 question answering Test F1
              to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test
              F1 to 83.1 (5.1 point absolute improvement).}
}

@article{https://doi.org/10.48550/arxiv.2007.04825,
  doi = {10.48550/ARXIV.2007.04825},
  url = {https://arxiv.org/abs/2007.04825},
  author = {Vyas, Apoorv and Katharopoulos, Angelos and Fleuret, François},
  keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML),
              FOS: Computer and information sciences},
  title = {Fast Transformers with Clustered Attention},
  publisher = {arXiv},
  year = 2020,
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@inproceedings{bastings-filippova-2020-elephant,
  title = {The elephant in the interpretability room: Why use attention
           as explanation when we have saliency methods?},
  author = {Bastings, Jasmijn and Filippova, Katja},
  booktitle = {Proceedings of the Third BlackboxNLP Workshop on Analyzing
               and Interpreting Neural Networks for NLP},
  month = nov,
  year = 2020,
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.blackboxnlp-1.14},
  doi = {10.18653/v1/2020.blackboxnlp-1.14},
  pages = {149--155},
  abstract = {There is a recent surge of interest in using attention as
              explanation of model predictions, with mixed evidence on
              whether attention can be used as such. While attention
              conveniently gives us one weight per input token and is easily
              extracted, it is often unclear toward what goal it is used as
              explanation. We find that often that goal, whether explicitly
              stated or not, is to find out what input tokens are the most
              relevant to a prediction, and that the implied user for the
              explanation is a model developer. For this goal and user, we
              argue that input saliency methods are better suited, and that
              there are no compelling reasons to use attention, despite the
              coincidence that it provides a weight for each input. With
              this position paper, we hope to shift some of the recent focus
              on attention to saliency methods, and for authors to clearly
              state the goal and user for their explanations.}
}

@article{Rao2021.02.12.430858,
  author = {Rao, Roshan and Liu, Jason and Verkuil, Robert and Meier,
            Joshua and Canny, John F. and Abbeel, Pieter and Sercu, Tom
            and Rives, Alexander},
  title = {MSA Transformer},
  elocation-id = {2021.02.12.430858},
  year = 2021,
  doi = {10.1101/2021.02.12.430858},
  publisher = {Cold Spring Harbor Laboratory},
  abstract = {Unsupervised protein language models trained across
              millions of diverse sequences learn structure and function of
              proteins. Protein language models studied to date have been
              trained to perform inference from individual sequences. The
              longstanding approach in computational biology has been to
              make inferences from a family of evolutionarily related
              sequences by fitting a model to each family independently. In
              this work we combine the two paradigms. We introduce a protein
              language model which takes as input a set of sequences in the
              form of a multiple sequence alignment. The model interleaves
              row and column attention across the input sequences and is
              trained with a variant of the masked language modeling
              objective across many protein families. The performance of the
              model surpasses current state-of-the-art unsupervised
              structure learning methods by a wide margin, with far greater
              parameter efficiency than prior state-of-the-art protein
              language models.},
  url = {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858},
  eprint = {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858.full.pdf},
  journal = {bioRxiv}
}

@article{Jumper2021,
  author = {Jumper, John and Evans, Richard and Pritzel, Alexander and
            Green, Tim and Figurnov, Michael and Ronneberger, Olaf and
            Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'i}dek,
            Augustin and Potapenko, Anna and Bridgland, Alex and Meyer,
            Clemens and Kohl, Simon A. A. and Ballard, Andrew J. and
            Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov,
            Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor
            and Petersen, Stig and Reiman, David and Clancy, Ellen and
            Zielinski, Michal and Steinegger, Martin and Pacholska,
            Michalina and Berghammer, Tamas and Bodenstein, Sebastian and
            Silver, David and Vinyals, Oriol and Senior, Andrew W. and
            Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis},
  title = {Highly accurate protein structure prediction with AlphaFold},
  journal = {Nature},
  year = 2021,
  month = aug,
  day = 01,
  volume = 596,
  number = 7873,
  pages = {583--589},
  abstract = {Proteins are essential to life, and understanding their
              structure can facilitate a mechanistic understanding of their
              function. Through an enormous experimental effort, the
              structures of around 100,000 unique proteins have been
              determined, but this represents a small fraction of the
              billions of known protein sequences. Structural coverage is
              bottlenecked by the months to years of painstaking effort
              required to determine a single protein structure. Accurate
              computational approaches are needed to address this gap and to
              enable large-scale structural bioinformatics. Predicting the
              three-dimensional structure that a protein will adopt based
              solely on its amino acid sequence---the structure prediction
              component of the `protein folding problem'---has been an
              important open research problem for more than 50 years.
              Despite recent progress, existing methods fall far short of
              atomic accuracy, especially when no homologous structure is
              available. Here we provide the first computational method that
              can regularly predict protein structures with atomic accuracy
              even in cases in which no similar structure is known. We
              validated an entirely redesigned version of our neural
              network-based model, AlphaFold, in the challenging 14th
              Critical Assessment of protein Structure Prediction (CASP14),
              demonstrating accuracy competitive with experimental
              structures in a majority of cases and greatly outperforming
              other methods. Underpinning the latest version of AlphaFold is
              a novel machine learning approach that incorporates physical
              and biological knowledge about protein structure, leveraging
              multi-sequence alignments, into the design of the deep
              learning algorithm.},
  issn = {1476-4687},
  doi = {10.1038/s41586-021-03819-2},
  url = {https://doi.org/10.1038/s41586-021-03819-2}
}