226 lines
13 KiB
BibTeX
226 lines
13 KiB
BibTeX
|
% Vaswani et al. (2017), "Attention Is All You Need"; arXiv preprint 1706.03762.
% The DOI-style citation key is kept unchanged so existing citations still resolve.
% Typed as misc with eprint fields: a preprint has no journal, which the article type requires.
@misc{https://doi.org/10.48550/arxiv.1706.03762,
  author        = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and
                   Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and
                   Kaiser, Lukasz and Polosukhin, Illia},
  title         = {Attention Is All You Need},
  year          = 2017,
  publisher     = {arXiv},
  eprint        = {1706.03762},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  doi           = {10.48550/ARXIV.1706.03762},
  url           = {https://arxiv.org/abs/1706.03762},
  keywords      = {Computation and Language (cs.CL), Machine Learning (cs.LG),
                   FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
|
|||
|
|
|||
|
% Ho et al. (2019), "Axial Attention in Multidimensional Transformers"; arXiv preprint 1912.12180.
% The DOI-style citation key is kept unchanged so existing citations still resolve.
% Typed as misc with eprint fields: a preprint has no journal, which the article type requires.
@misc{https://doi.org/10.48550/arxiv.1912.12180,
  author        = {Ho, Jonathan and Kalchbrenner, Nal and Weissenborn, Dirk
                   and Salimans, Tim},
  title         = {Axial Attention in Multidimensional Transformers},
  year          = 2019,
  publisher     = {arXiv},
  eprint        = {1912.12180},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CV},
  doi           = {10.48550/ARXIV.1912.12180},
  url           = {https://arxiv.org/abs/1912.12180},
  keywords      = {Computer Vision and Pattern Recognition (cs.CV), FOS:
                   Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
|
|||
|
|
|||
|
% Beltagy et al. (2020), "Longformer: The Long-Document Transformer"; arXiv preprint 2004.05150.
% The DOI-style citation key is kept unchanged so existing citations still resolve.
% Typed as misc with eprint fields: a preprint has no journal, which the article type requires.
@misc{https://doi.org/10.48550/arxiv.2004.05150,
  author        = {Beltagy, Iz and Peters, Matthew E. and Cohan, Arman},
  title         = {{Longformer}: The Long-Document Transformer},
  year          = 2020,
  publisher     = {arXiv},
  eprint        = {2004.05150},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  doi           = {10.48550/ARXIV.2004.05150},
  url           = {https://arxiv.org/abs/2004.05150},
  keywords      = {Computation and Language (cs.CL), FOS: Computer and
                   information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
|
|||
|
|
|||
|
% Dai et al. (2019), "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context";
% arXiv preprint 1901.02860.
% The DOI-style citation key is kept unchanged so existing citations still resolve.
% Typed as misc with eprint fields: a preprint has no journal, which the article type requires.
@misc{https://doi.org/10.48550/arxiv.1901.02860,
  author        = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and
                   Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan},
  title         = {{Transformer-XL}: Attentive Language Models Beyond a
                   Fixed-Length Context},
  year          = 2019,
  publisher     = {arXiv},
  eprint        = {1901.02860},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  doi           = {10.48550/ARXIV.1901.02860},
  url           = {https://arxiv.org/abs/1901.02860},
  keywords      = {Machine Learning (cs.LG), Computation and Language (cs.CL),
                   Machine Learning (stat.ML), FOS: Computer and information
                   sciences},
  copyright     = {Creative Commons Attribution Non Commercial Share Alike 4.0
                   International},
}
|
|||
|
|
|||
|
% Devlin et al. (2019), the BERT paper, NAACL-HLT 2019 (ACL Anthology N19-1423).
% The address field holds the conference location, as exported by the ACL Anthology.
@inproceedings{devlin-etal-2019-bert,
  author    = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and
               Toutanova, Kristina},
  title     = {{BERT}: Pre-training of Deep Bidirectional Transformers for
               Language Understanding},
  booktitle = {Proceedings of the 2019 Conference of the North {American}
               Chapter of the Association for Computational Linguistics:
               Human Language Technologies, Volume 1 (Long and Short Papers)},
  month     = jun,
  year      = 2019,
  address   = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  pages     = {4171--4186},
  doi       = {10.18653/v1/N19-1423},
  url       = {https://aclanthology.org/N19-1423},
  abstract  = {We introduce a new language representation model called
               BERT, which stands for Bidirectional Encoder Representations
               from Transformers. Unlike recent language representation
               models (Peters et al., 2018a; Radford et al., 2018), BERT is
               designed to pre-train deep bidirectional representations from
               unlabeled text by jointly conditioning on both left and right
               context in all layers. As a result, the pre-trained BERT model
               can be fine-tuned with just one additional output layer to
               create state-of-the-art models for a wide range of tasks, such
               as question answering and language inference, without
               substantial task-specific architecture modifications. BERT is
               conceptually simple and empirically powerful. It obtains new
               state-of-the-art results on eleven natural language processing
               tasks, including pushing the GLUE score to 80.5 (7.7 point
               absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%}
               absolute improvement), SQuAD v1.1 question answering Test F1
               to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test
               F1 to 83.1 (5.1 point absolute improvement).},
}
|
|||
|
|
|||
|
% Vyas et al. (2020), "Fast Transformers with Clustered Attention"; arXiv preprint 2007.04825.
% The DOI-style citation key is kept unchanged so existing citations still resolve.
% Typed as misc with eprint fields: a preprint has no journal, which the article type requires.
% The cedilla is written as a LaTeX escape so classic 8-bit BibTeX sorts and prints it correctly.
@misc{https://doi.org/10.48550/arxiv.2007.04825,
  author        = {Vyas, Apoorv and Katharopoulos, Angelos and Fleuret,
                   Fran{\c{c}}ois},
  title         = {Fast Transformers with Clustered Attention},
  year          = 2020,
  publisher     = {arXiv},
  eprint        = {2007.04825},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  doi           = {10.48550/ARXIV.2007.04825},
  url           = {https://arxiv.org/abs/2007.04825},
  keywords      = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS:
                   Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}
|
|||
|
|
|||
|
% Bastings and Filippova (2020), position paper on attention versus saliency methods,
% BlackboxNLP workshop 2020 (ACL Anthology 2020.blackboxnlp-1.14).
@inproceedings{bastings-filippova-2020-elephant,
  author    = {Bastings, Jasmijn and Filippova, Katja},
  title     = {The elephant in the interpretability room: Why use
               attention as explanation when we have saliency methods?},
  booktitle = {Proceedings of the Third {BlackboxNLP} Workshop on Analyzing
               and Interpreting Neural Networks for {NLP}},
  month     = nov,
  year      = 2020,
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {149--155},
  doi       = {10.18653/v1/2020.blackboxnlp-1.14},
  url       = {https://aclanthology.org/2020.blackboxnlp-1.14},
  abstract  = {There is a recent surge of interest in using attention as
               explanation of model predictions, with mixed evidence on
               whether attention can be used as such. While attention
               conveniently gives us one weight per input token and is easily
               extracted, it is often unclear toward what goal it is used as
               explanation. We find that often that goal, whether explicitly
               stated or not, is to find out what input tokens are the most
               relevant to a prediction, and that the implied user for the
               explanation is a model developer. For this goal and user, we
               argue that input saliency methods are better suited, and that
               there are no compelling reasons to use attention, despite the
               coincidence that it provides a weight for each input. With
               this position paper, we hope to shift some of the recent focus
               on attention to saliency methods, and for authors to clearly
               state the goal and user for their explanations.},
}
|
|||
|
|
|||
|
% Rao et al. (2021), "MSA Transformer"; bioRxiv preprint 2021.02.12.430858.
% The publisher's competing-interest boilerplate was fused onto the end of the abstract;
% it is kept verbatim in its own field, which standard styles do not print.
@article{Rao2021.02.12.430858,
  author              = {Rao, Roshan and Liu, Jason and Verkuil, Robert and Meier,
                         Joshua and Canny, John F. and Abbeel, Pieter and Sercu, Tom
                         and Rives, Alexander},
  title               = {{MSA} Transformer},
  journal             = {bioRxiv},
  year                = 2021,
  publisher           = {Cold Spring Harbor Laboratory},
  elocation-id        = {2021.02.12.430858},
  doi                 = {10.1101/2021.02.12.430858},
  url                 = {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858},
  eprint              = {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858.full.pdf},
  abstract            = {Unsupervised protein language models trained across
                         millions of diverse sequences learn structure and function of
                         proteins. Protein language models studied to date have been
                         trained to perform inference from individual sequences. The
                         longstanding approach in computational biology has been to
                         make inferences from a family of evolutionarily related
                         sequences by fitting a model to each family independently. In
                         this work we combine the two paradigms. We introduce a protein
                         language model which takes as input a set of sequences in the
                         form of a multiple sequence alignment. The model interleaves
                         row and column attention across the input sequences and is
                         trained with a variant of the masked language modeling
                         objective across many protein families. The performance of the
                         model surpasses current state-of-the-art unsupervised
                         structure learning methods by a wide margin, with far greater
                         parameter efficiency than prior state-of-the-art protein
                         language models.},
  competing-interests = {The authors have declared no competing interest.},
}
|
|||
|
|
|||
|
% Jumper et al. (2021), the AlphaFold paper, Nature 596(7873).
% Reference-number superscripts that the export had fused into the abstract text are removed.
@article{Jumper2021,
  author   = {Jumper, John and Evans, Richard and Pritzel, Alexander and
              Green, Tim and Figurnov, Michael and Ronneberger, Olaf and
              Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'i}dek,
              Augustin and Potapenko, Anna and Bridgland, Alex and Meyer,
              Clemens and Kohl, Simon A. A. and Ballard, Andrew J. and
              Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov,
              Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor
              and Petersen, Stig and Reiman, David and Clancy, Ellen and
              Zielinski, Michal and Steinegger, Martin and Pacholska,
              Michalina and Berghammer, Tamas and Bodenstein, Sebastian and
              Silver, David and Vinyals, Oriol and Senior, Andrew W. and
              Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis},
  title    = {Highly accurate protein structure prediction with
              {AlphaFold}},
  journal  = {Nature},
  year     = 2021,
  month    = aug,
  day      = 01,
  volume   = 596,
  number   = 7873,
  pages    = {583--589},
  issn     = {1476-4687},
  doi      = {10.1038/s41586-021-03819-2},
  url      = {https://doi.org/10.1038/s41586-021-03819-2},
  abstract = {Proteins are essential to life, and understanding their
              structure can facilitate a mechanistic understanding of their
              function. Through an enormous experimental effort, the
              structures of around 100,000 unique proteins have been
              determined, but this represents a small fraction of the
              billions of known protein sequences. Structural coverage is
              bottlenecked by the months to years of painstaking effort
              required to determine a single protein structure. Accurate
              computational approaches are needed to address this gap and to
              enable large-scale structural bioinformatics. Predicting the
              three-dimensional structure that a protein will adopt based
              solely on its amino acid sequence---the structure prediction
              component of the `protein folding problem'---has been an
              important open research problem for more than 50 years.
              Despite recent progress, existing methods fall far short
              of atomic accuracy, especially when no homologous structure is
              available. Here we provide the first computational method that
              can regularly predict protein structures with atomic accuracy
              even in cases in which no similar structure is known. We
              validated an entirely redesigned version of our neural
              network-based model, AlphaFold, in the challenging 14th
              Critical Assessment of protein Structure Prediction
              (CASP14), demonstrating accuracy competitive with
              experimental structures in a majority of cases and greatly
              outperforming other methods. Underpinning the latest version
              of AlphaFold is a novel machine learning approach that
              incorporates physical and biological knowledge about protein
              structure, leveraging multi-sequence alignments, into the
              design of the deep learning algorithm.},
}
|