% arXiv-only record: typed as misc because no journal is given (an article
% entry without a journal triggers a required-field warning). The arXiv id is
% carried by eprint / archiveprefix / primaryclass. Key kept as exported so
% existing citations keep resolving.
@misc{https://doi.org/10.48550/arxiv.1706.03762,
  author          = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and
                  Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and
                  Kaiser, Lukasz and Polosukhin, Illia},
  title           = {Attention Is All You Need},
  year            = 2017,
  eprint          = {1706.03762},
  archiveprefix   = {arXiv},
  primaryclass    = {cs.CL},
  doi             = {10.48550/ARXIV.1706.03762},
  url             = {https://arxiv.org/abs/1706.03762},
  publisher       = {arXiv},
  keywords        = {Computation and Language (cs.CL), Machine Learning (cs.LG),
                  FOS: Computer and information sciences},
  copyright       = {arXiv.org perpetual, non-exclusive license}
}

% arXiv-only record: typed as misc because no journal is given; the arXiv id
% is carried by eprint / archiveprefix / primaryclass. Key kept as exported.
@misc{https://doi.org/10.48550/arxiv.1912.12180,
  author          = {Ho, Jonathan and Kalchbrenner, Nal and Weissenborn, Dirk
                  and Salimans, Tim},
  title           = {Axial Attention in Multidimensional Transformers},
  year            = 2019,
  eprint          = {1912.12180},
  archiveprefix   = {arXiv},
  primaryclass    = {cs.CV},
  doi             = {10.48550/ARXIV.1912.12180},
  url             = {https://arxiv.org/abs/1912.12180},
  publisher       = {arXiv},
  keywords        = {Computer Vision and Pattern Recognition (cs.CV), FOS:
                  Computer and information sciences},
  copyright       = {arXiv.org perpetual, non-exclusive license}
}

% arXiv-only record: typed as misc because no journal is given; the arXiv id
% is carried by eprint / archiveprefix / primaryclass. Key kept as exported.
@misc{https://doi.org/10.48550/arxiv.2004.05150,
  author          = {Beltagy, Iz and Peters, Matthew E. and Cohan, Arman},
  title           = {Longformer: The Long-Document Transformer},
  year            = 2020,
  eprint          = {2004.05150},
  archiveprefix   = {arXiv},
  primaryclass    = {cs.CL},
  doi             = {10.48550/ARXIV.2004.05150},
  url             = {https://arxiv.org/abs/2004.05150},
  publisher       = {arXiv},
  keywords        = {Computation and Language (cs.CL), FOS: Computer and
                  information sciences},
  copyright       = {arXiv.org perpetual, non-exclusive license}
}

% arXiv-only record: typed as misc because no journal is given; the arXiv id
% is carried by eprint / archiveprefix / primaryclass. Key kept as exported.
% The model name is brace-protected so sentence-casing styles keep "XL".
@misc{https://doi.org/10.48550/arxiv.1901.02860,
  author          = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and
                  Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan},
  title           = {{Transformer-XL}: Attentive Language Models Beyond a
                  Fixed-Length Context},
  year            = 2019,
  eprint          = {1901.02860},
  archiveprefix   = {arXiv},
  primaryclass    = {cs.LG},
  doi             = {10.48550/ARXIV.1901.02860},
  url             = {https://arxiv.org/abs/1901.02860},
  publisher       = {arXiv},
  keywords        = {Machine Learning (cs.LG), Computation and Language (cs.CL),
                  Machine Learning (stat.ML), FOS: Computer and information
                  sciences},
  copyright       = {Creative Commons Attribution Non Commercial Share Alike 4.0
                  International}
}

% Conference paper record. Field values use brace delimiters like the other
% brace-delimited entries in this file; the proper noun in booktitle is
% protected as a whole word rather than by a single-letter brace.
@inproceedings{devlin-etal-2019-bert,
  author          = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and
                  Toutanova, Kristina},
  title           = {{BERT}: Pre-training of Deep Bidirectional Transformers for
                  Language Understanding},
  booktitle       = {Proceedings of the 2019 Conference of the North {American}
                  Chapter of the Association for Computational Linguistics:
                  Human Language Technologies, Volume 1 (Long and Short Papers)},
  month           = jun,
  year            = 2019,
  address         = {Minneapolis, Minnesota},
  publisher       = {Association for Computational Linguistics},
  pages           = {4171--4186},
  doi             = {10.18653/v1/N19-1423},
  url             = {https://aclanthology.org/N19-1423},
  abstract        = {We introduce a new language representation model called
                  BERT, which stands for Bidirectional Encoder Representations
                  from Transformers. Unlike recent language representation
                  models (Peters et al., 2018a; Radford et al., 2018), BERT is
                  designed to pre-train deep bidirectional representations from
                  unlabeled text by jointly conditioning on both left and right
                  context in all layers. As a result, the pre-trained BERT model
                  can be fine-tuned with just one additional output layer to
                  create state-of-the-art models for a wide range of tasks, such
                  as question answering and language inference, without
                  substantial task-specific architecture modifications. BERT is
                  conceptually simple and empirically powerful. It obtains new
                  state-of-the-art results on eleven natural language processing
                  tasks, including pushing the GLUE score to 80.5 (7.7 point
                  absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%}
                  absolute improvement), SQuAD v1.1 question answering Test F1
                  to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test
                  F1 to 83.1 (5.1 point absolute improvement).},
}

% arXiv-only record: typed as misc because no journal is given; the arXiv id
% is carried by eprint / archiveprefix / primaryclass. Key kept as exported.
% The cedilla is written as a LaTeX escape so classic 8-bit BibTeX sorts and
% labels the name correctly.
@misc{https://doi.org/10.48550/arxiv.2007.04825,
  author          = {Vyas, Apoorv and Katharopoulos, Angelos and Fleuret,
                  Fran{\c{c}}ois},
  title           = {Fast Transformers with Clustered Attention},
  year            = 2020,
  eprint          = {2007.04825},
  archiveprefix   = {arXiv},
  primaryclass    = {cs.LG},
  doi             = {10.48550/ARXIV.2007.04825},
  url             = {https://arxiv.org/abs/2007.04825},
  publisher       = {arXiv},
  keywords        = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS:
                  Computer and information sciences},
  copyright       = {arXiv.org perpetual, non-exclusive license}
}

% Workshop paper record; field values are unchanged, written with brace
% delimiters and ordered author, title, venue, date, identifiers, abstract.
@inproceedings{bastings-filippova-2020-elephant,
  author          = {Bastings, Jasmijn and Filippova, Katja},
  title           = {The elephant in the interpretability room: Why use
                  attention as explanation when we have saliency methods?},
  booktitle       = {Proceedings of the Third BlackboxNLP Workshop on Analyzing
                  and Interpreting Neural Networks for NLP},
  publisher       = {Association for Computational Linguistics},
  address         = {Online},
  year            = 2020,
  month           = nov,
  pages           = {149--155},
  doi             = {10.18653/v1/2020.blackboxnlp-1.14},
  url             = {https://aclanthology.org/2020.blackboxnlp-1.14},
  abstract        = {There is a recent surge of interest in using attention as
                  explanation of model predictions, with mixed evidence on
                  whether attention can be used as such. While attention
                  conveniently gives us one weight per input token and is easily
                  extracted, it is often unclear toward what goal it is used as
                  explanation. We find that often that goal, whether explicitly
                  stated or not, is to find out what input tokens are the most
                  relevant to a prediction, and that the implied user for the
                  explanation is a model developer. For this goal and user, we
                  argue that input saliency methods are better suited, and that
                  there are no compelling reasons to use attention, despite the
                  coincidence that it provides a weight for each input. With
                  this position paper, we hope to shift some of the recent focus
                  on attention to saliency methods, and for authors to clearly
                  state the goal and user for their explanations.}
}

% arXiv-only record: typed as misc because no journal is given; the arXiv id
% is carried by eprint / archiveprefix / primaryclass. Key kept as exported.
% The model name is brace-protected so sentence-casing styles keep its caps.
@misc{https://doi.org/10.48550/arxiv.2204.01678,
  author          = {Bachmann, Roman and Mizrahi, David and Atanov, Andrei and
                  Zamir, Amir},
  title           = {{MultiMAE}: Multi-modal Multi-task Masked Autoencoders},
  year            = 2022,
  eprint          = {2204.01678},
  archiveprefix   = {arXiv},
  primaryclass    = {cs.CV},
  doi             = {10.48550/ARXIV.2204.01678},
  url             = {https://arxiv.org/abs/2204.01678},
  publisher       = {arXiv},
  keywords        = {Computer Vision and Pattern Recognition (cs.CV), Machine
                  Learning (cs.LG), FOS: Computer and information sciences},
  copyright       = {arXiv.org perpetual, non-exclusive license}
}

% Journal article record. Only the method name in the title is brace-protected
% (the exporter had wrapped the whole title, which disables style casing).
% The publisher PDF link lives in the ignored pdf field, not in eprint, which
% styles interpret as a preprint identifier.
@article{10.1093/bioinformatics/btab746,
  author          = {De Waele, Gaetan and Clauwaert, Jim and Menschaert, Gerben
                  and Waegeman, Willem},
  title           = {{CpG Transformer} for imputation of single-cell methylomes},
  journal         = {Bioinformatics},
  volume          = 38,
  number          = 3,
  pages           = {597--603},
  year            = 2021,
  month           = oct,
  abstract        = {The adoption of current single-cell DNA methylation
                  sequencing protocols is hindered by incomplete coverage,
                  outlining the need for effective imputation techniques. The
                  task of imputing single-cell (methylation) data requires
                  models to build an understanding of underlying biological
                  processes. We adapt the transformer neural network
                  architecture to operate on methylation matrices through
                  combining axial attention with sliding window self-attention.
                  The obtained CpG Transformer displays state-of-the-art
                  performances on a wide range of scBS-seq and scRRBS-seq
                  datasets. Furthermore, we demonstrate the interpretability of
                  CpG Transformer and illustrate its rapid transfer learning
                  properties, allowing practitioners to train models on new
                  datasets with a limited computational and time budget. CpG
                  Transformer is freely available at
                  https://github.com/gdewael/cpg-transformer. Supplementary
                  data are available at Bioinformatics online.},
  issn            = {1367-4803},
  doi             = {10.1093/bioinformatics/btab746},
  url             = {https://doi.org/10.1093/bioinformatics/btab746},
  pdf             =
                  {https://academic.oup.com/bioinformatics/article-pdf/38/3/597/42167564/btab746.pdf},
}

% bioRxiv preprint record. The acronym in the title is brace-protected so
% sentence-casing styles do not lowercase it. The full-text PDF link lives in
% the ignored pdf field, not in eprint. The trailing competing-interest
% sentence came with the export and is kept, with its spacing repaired.
@article{Rao2021.02.12.430858,
  author          = {Rao, Roshan and Liu, Jason and Verkuil, Robert and Meier,
                  Joshua and Canny, John F. and Abbeel, Pieter and Sercu, Tom
                  and Rives, Alexander},
  title           = {{MSA} Transformer},
  elocation-id    = {2021.02.12.430858},
  year            = 2021,
  doi             = {10.1101/2021.02.12.430858},
  publisher       = {Cold Spring Harbor Laboratory},
  abstract        = {Unsupervised protein language models trained across
                  millions of diverse sequences learn structure and function of
                  proteins. Protein language models studied to date have been
                  trained to perform inference from individual sequences. The
                  longstanding approach in computational biology has been to
                  make inferences from a family of evolutionarily related
                  sequences by fitting a model to each family independently. In
                  this work we combine the two paradigms. We introduce a protein
                  language model which takes as input a set of sequences in the
                  form of a multiple sequence alignment. The model interleaves
                  row and column attention across the input sequences and is
                  trained with a variant of the masked language modeling
                  objective across many protein families. The performance of the
                  model surpasses current state-of-the-art unsupervised
                  structure learning methods by a wide margin, with far greater
                  parameter efficiency than prior state-of-the-art protein
                  language models. Competing Interest Statement: The authors
                  have declared no competing interest.},
  url             =
                  {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858},
  pdf             =
                  {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858.full.pdf},
  journal         = {bioRxiv}
}

% Journal article record. month uses the standard BibTeX macro, pages uses an
% en-dash range, and the system name in the title is brace-protected. The
% abstract had reference markers fused onto words by the export (for example
% a number run directly after "effort"); those markers are removed.
@article{Jumper2021,
  author          = {Jumper, John and Evans, Richard and Pritzel, Alexander and
                  Green, Tim and Figurnov, Michael and Ronneberger, Olaf and
                  Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'i}dek,
                  Augustin and Potapenko, Anna and Bridgland, Alex and Meyer,
                  Clemens and Kohl, Simon A. A. and Ballard, Andrew J. and
                  Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov,
                  Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor
                  and Petersen, Stig and Reiman, David and Clancy, Ellen and
                  Zielinski, Michal and Steinegger, Martin and Pacholska,
                  Michalina and Berghammer, Tamas and Bodenstein, Sebastian and
                  Silver, David and Vinyals, Oriol and Senior, Andrew W. and
                  Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis},
  title           = {Highly accurate protein structure prediction with
                  {AlphaFold}},
  journal         = {Nature},
  year            = 2021,
  month           = aug,
  day             = 01,
  volume          = 596,
  number          = 7873,
  pages           = {583--589},
  abstract        = {Proteins are essential to life, and understanding their
                  structure can facilitate a mechanistic understanding of their
                  function. Through an enormous experimental effort, the
                  structures of around 100,000 unique proteins have been
                  determined, but this represents a small fraction of the
                  billions of known protein sequences. Structural coverage is
                  bottlenecked by the months to years of painstaking effort
                  required to determine a single protein structure. Accurate
                  computational approaches are needed to address this gap and to
                  enable large-scale structural bioinformatics. Predicting the
                  three-dimensional structure that a protein will adopt based
                  solely on its amino acid sequence---the structure prediction
                  component of the `protein folding problem'---has been an
                  important open research problem for more than 50 years.
                  Despite recent progress, existing methods fall far short
                  of atomic accuracy, especially when no homologous structure is
                  available. Here we provide the first computational method that
                  can regularly predict protein structures with atomic accuracy
                  even in cases in which no similar structure is known. We
                  validated an entirely redesigned version of our neural
                  network-based model, AlphaFold, in the challenging 14th
                  Critical Assessment of protein Structure Prediction
                  (CASP14), demonstrating accuracy competitive with
                  experimental structures in a majority of cases and greatly
                  outperforming other methods. Underpinning the latest version
                  of AlphaFold is a novel machine learning approach that
                  incorporates physical and biological knowledge about protein
                  structure, leveraging multi-sequence alignments, into the
                  design of the deep learning algorithm.},
  issn            = {1476-4687},
  doi             = {10.1038/s41586-021-03819-2},
  url             = {https://doi.org/10.1038/s41586-021-03819-2}
}

% bioRxiv preprint record. The tool name in the title is brace-protected so
% sentence-casing styles keep its capitals. The full-text PDF link lives in
% the ignored pdf field, not in eprint. The trailing competing-interest
% sentence came with the export and is kept, with its spacing repaired.
@article{Ashuach2021.08.20.457057,
  author          = {Ashuach, Tal and Gabitto, Mariano I. and Jordan, Michael I.
                  and Yosef, Nir},
  title           = {{MultiVI}: deep generative model for the integration of
                  multi-modal data},
  elocation-id    = {2021.08.20.457057},
  year            = 2021,
  doi             = {10.1101/2021.08.20.457057},
  publisher       = {Cold Spring Harbor Laboratory},
  abstract        = {Jointly profiling the transcriptional and chromatin
                  accessibility landscapes of single-cells is a powerful
                  technique to characterize cellular populations. Here we
                  present MultiVI, a probabilistic model to analyze such
                  multiomic data and integrate it with single modality datasets.
                  MultiVI creates a joint representation that accurately
                  reflects both chromatin and transcriptional properties of the
                  cells even when one modality is missing. It also imputes
                  missing data, corrects for batch effects and is available in
                  the scvi-tools framework:
                  https://docs.scvi-tools.org/. Competing Interest Statement:
                  The authors have declared no competing interest.},
  url             =
                  {https://www.biorxiv.org/content/early/2021/09/07/2021.08.20.457057},
  pdf             =
                  {https://www.biorxiv.org/content/early/2021/09/07/2021.08.20.457057.full.pdf},
  journal         = {bioRxiv}
}

% Journal article record. month uses the standard BibTeX macro so styles can
% format and localise it. pages holds a single number (presumably the article
% number rather than a page range; confirm against the publisher record).
@article{Gong2021,
  author          = {Gong, Boying and Zhou, Yun and Purdom, Elizabeth},
  title           = {Cobolt: integrative analysis of multimodal single-cell
                  sequencing data},
  journal         = {Genome Biology},
  year            = 2021,
  month           = dec,
  day             = 28,
  volume          = 22,
  number          = 1,
  pages           = 351,
  abstract        = {A growing number of single-cell sequencing platforms enable
                  joint profiling of multiple omics from the same cells. We
                  present Cobolt, a novel method that not only allows for
                  analyzing the data from joint-modality platforms, but provides
                  a coherent framework for the integration of multiple datasets
                  measured on different modalities. We demonstrate its
                  performance on multi-modality data of gene expression and
                  chromatin accessibility and illustrate the integration
                  abilities of Cobolt by jointly analyzing this multi-modality
                  data with single-cell RNA-seq and ATAC-seq datasets.},
  issn            = {1474-760X},
  doi             = {10.1186/s13059-021-02556-z},
  url             = {https://doi.org/10.1186/s13059-021-02556-z}
}

% Journal article record. month uses the standard BibTeX macro, and the
% all-caps framework name in the title is brace-protected so sentence-casing
% styles do not turn it into "Muon". pages holds a single number (presumably
% the article number; confirm against the publisher record).
@article{Bredikhin2022,
  author          = {Bredikhin, Danila and Kats, Ilia and Stegle, Oliver},
  title           = {{MUON}: multimodal omics analysis framework},
  journal         = {Genome Biology},
  year            = 2022,
  month           = feb,
  day             = 01,
  volume          = 23,
  number          = 1,
  pages           = 42,
  abstract        = {Advances in multi-omics have led to an explosion of
                  multimodal datasets to address questions from basic biology to
                  translation. While these data provide novel opportunities for
                  discovery, they also pose management and analysis challenges,
                  thus motivating the development of tailored computational
                  solutions. Here, we present a data standard and an analysis
                  framework for multi-omics, MUON, designed to organise,
                  analyse, visualise, and exchange multimodal data. MUON stores
                  multimodal data in an efficient yet flexible and interoperable
                  data structure. MUON enables a versatile range of analyses,
                  from data preprocessing to flexible multi-omics alignment.},
  issn            = {1474-760X},
  doi             = {10.1186/s13059-021-02577-8},
  url             = {https://doi.org/10.1186/s13059-021-02577-8}
}

% Proceedings paper record. The group author is wrapped in its own braces so
% BibTeX treats it as one indivisible name instead of splitting it at the
% comma into a bogus last/first pair. month uses the standard macro; the day
% range from the export is kept in eventdate (the 2021 year there is taken
% from the booktitle; confirm against the proceedings page).
@inproceedings{pmlr-v176-lance22a,
  title           = {Multimodal single cell data integration challenge: Results
                  and lessons learned},
  author          = {Lance, Christopher and Luecken, Malte D. and Burkhardt,
                  Daniel B. and Cannoodt, Robrecht and Rautenstrauch, Pia and
                  Laddach, Anna and Ubingazhibov, Aidyn and Cao, Zhi-Jie and
                  Deng, Kaiwen and Khan, Sumeer and Liu, Qiao and Russkikh,
                  Nikolay and Ryazantsev, Gleb and Ohler, Uwe and
                  {NeurIPS 2021 Multimodal data integration competition
                  participants} and Pisco, Angela Oliveira and Bloom, Jonathan
                  and Krishnaswamy, Smita and Theis, Fabian J.},
  booktitle       = {Proceedings of the NeurIPS 2021 Competitions and
                  Demonstrations Track},
  pages           = {162--176},
  year            = 2022,
  editor          = {Kiela, Douwe and Ciccone, Marco and Caputo, Barbara},
  volume          = 176,
  series          = {Proceedings of Machine Learning Research},
  month           = dec,
  eventdate       = {2021-12-06/2021-12-14},
  publisher       = {PMLR},
  pdf             = {https://proceedings.mlr.press/v176/lance22a/lance22a.pdf},
  url             = {https://proceedings.mlr.press/v176/lance22a.html},
  abstract        = {Biology has become a data-intensive science. Recent
                  technological advances in single-cell genomics have enabled
                  the measurement of multiple facets of cellular state,
                  producing datasets with millions of single-cell observations.
                  While these data hold great promise for understanding
                  molecular mechanisms in health and disease, analysis
                  challenges arising from sparsity, technical and biological
                  variability, and high dimensionality of the data hinder the
                  derivation of such mechanistic insights. To promote the
                  innovation of algorithms for analysis of multimodal
                  single-cell data, we organized a competition at NeurIPS 2021
                  applying the Common Task Framework to multimodal single-cell
                  data integration. For this competition we generated the first
                  multimodal benchmarking dataset for single-cell biology and
                  defined three tasks in this domain: prediction of missing
                  modalities, aligning modalities, and learning a joint
                  representation across modalities. We further specified
                  evaluation metrics and developed a cloud-based algorithm
                  evaluation pipeline. Using this setup, 280 competitors
                  submitted over 2600 proposed solutions within a 3 month
                  period, showcasing substantial innovation especially in the
                  modality alignment task. Here, we present the results,
                  describe trends of well performing approaches, and discuss
                  challenges associated with running the competition.}
}