287 lines
17 KiB
BibTeX
287 lines
17 KiB
BibTeX
% Flagel, Brandvain and Schrider: CNNs for population genetic inference
% (Molecular Biology and Evolution, vol. 36, no. 2).
@article{10.1093/molbev/msy224,
  author   = {Flagel, Lex and Brandvain, Yaniv and Schrider, Daniel R},
  title    = {The Unreasonable Effectiveness of Convolutional Neural
    Networks in Population Genetic Inference},
  journal  = {Molecular Biology and Evolution},
  volume   = 36,
  number   = 2,
  pages    = {220--238},
  year     = 2018,
  month    = dec,
  abstract = {Population-scale genomic data sets have given researchers
    incredible amounts of information from which to infer
    evolutionary histories. Concomitant with this flood of data,
    theoretical and methodological advances have sought to extract
    information from genomic sequences to infer demographic events
    such as population size changes and gene flow among closely
    related populations/species, construct recombination maps, and
    uncover loci underlying recent adaptation. To date, most
    methods make use of only one or a few summaries of the input
    sequences and therefore ignore potentially useful information
    encoded in the data. The most sophisticated of these
    approaches involve likelihood calculations, which require
    theoretical advances for each new problem, and often focus on
    a single aspect of the data (e.g., only allele frequency
    information) in the interest of mathematical and computational
    tractability. Directly interrogating the entirety of the input
    sequence data in a likelihood-free manner would thus offer a
    fruitful alternative. Here, we accomplish this by representing
    DNA sequence alignments as images and using a class of deep
    learning methods called convolutional neural networks (CNNs)
    to make population genetic inferences from these images. We
    apply CNNs to a number of evolutionary questions and find that
    they frequently match or exceed the accuracy of current
    methods. Importantly, we show that CNNs perform accurate
    evolutionary model selection and parameter estimation, even on
    problems that have not received detailed theoretical
    treatments. Thus, when applied to population genetic
    alignments, CNNs are capable of outperforming expert-derived
    statistical methods and offer a new path forward in cases
    where no likelihood approach exists.},
  issn     = {0737-4038},
  doi      = {10.1093/molbev/msy224},
  url      = {https://doi.org/10.1093/molbev/msy224},
  eprint   = {https://academic.oup.com/mbe/article-pdf/36/2/220/27736968/msy224.pdf},
}
|
||
|
||
% Robins et al.: T-cell receptor beta-chain diversity (Blood, vol. 114, no. 19).
% NOTE(review): "alphabeta" in the title looks like a transliteration of the
% Greek letters alpha/beta; confirm against the publisher record before typesetting.
@article{pmid19706884,
  author  = {Robins, H. S. and Campregher, P. V. and Srivastava, S. K.
    and Wacher, A. and Turtle, C. J. and Kahsai, O. and Riddell,
    S. R. and Warren, E. H. and Carlson, C. S.},
  title   = {Comprehensive assessment of {T}-cell receptor beta-chain
    diversity in alphabeta {T} cells},
  journal = {Blood},
  year    = 2009,
  volume  = 114,
  number  = 19,
  pages   = {4099--4107},
  month   = nov,
}
|
||
|
||
% Nurk et al.: "The complete sequence of a human genome" (bioRxiv preprint).
% The abstract field also carries the competing-interest statement that the
% export appended to it; it is kept, but separated from the abstract text.
@article{Nurk2021.05.26.445798,
  author       = {Nurk, Sergey and Koren, Sergey and Rhie, Arang and
    Rautiainen, Mikko and Bzikadze, Andrey V. and Mikheenko, Alla
    and Vollger, Mitchell R. and Altemose, Nicolas and Uralsky,
    Lev and Gershman, Ariel and Aganezov, Sergey and Hoyt,
    Savannah J. and Diekhans, Mark and Logsdon, Glennis A. and
    Alonge, Michael and Antonarakis, Stylianos E. and Borchers,
    Matthew and Bouffard, Gerard G. and Brooks, Shelise Y. and
    Caldas, Gina V. and Cheng, Haoyu and Chin, Chen-Shan and Chow,
    William and de Lima, Leonardo G. and Dishuck, Philip C. and
    Durbin, Richard and Dvorkina, Tatiana and Fiddes, Ian T. and
    Formenti, Giulio and Fulton, Robert S. and Fungtammasan,
    Arkarachai and Garrison, Erik and Grady, Patrick G. S. and
    Graves-Lindsay, Tina A. and Hall, Ira M. and Hansen, Nancy F.
    and Hartley, Gabrielle A. and Haukness, Marina and Howe,
    Kerstin and Hunkapiller, Michael W. and Jain, Chirag and Jain,
    Miten and Jarvis, Erich D. and Kerpedjiev, Peter and Kirsche,
    Melanie and Kolmogorov, Mikhail and Korlach, Jonas and
    Kremitzki, Milinn and Li, Heng and Maduro, Valerie V. and
    Marschall, Tobias and McCartney, Ann M. and McDaniel, Jennifer
    and Miller, Danny E. and Mullikin, James C. and Myers, Eugene
    W. and Olson, Nathan D. and Paten, Benedict and Peluso, Paul
    and Pevzner, Pavel A. and Porubsky, David and Potapova, Tamara
    and Rogaev, Evgeny I. and Rosenfeld, Jeffrey A. and Salzberg,
    Steven L. and Schneider, Valerie A. and Sedlazeck, Fritz J.
    and Shafin, Kishwar and Shew, Colin J. and Shumate, Alaina and
    Sims, Yumi and Smit, Arian F. A. and Soto, Daniela C. and
    Sovi{\'c}, Ivan and Storer, Jessica M. and Streets, Aaron and
    Sullivan, Beth A. and Thibaud-Nissen, Fran{\c{c}}oise and
    Torrance, James and Wagner, Justin and Walenz, Brian P. and
    Wenger, Aaron and Wood, Jonathan M. D. and Xiao, Chunlin and
    Yan, Stephanie M. and Young, Alice C. and Zarate, Samantha and
    Surti, Urvashi and McCoy, Rajiv C. and Dennis, Megan Y. and
    Alexandrov, Ivan A. and Gerton, Jennifer L. and
    O'Neill, Rachel J. and Timp, Winston and Zook,
    Justin M. and Schatz, Michael C. and Eichler, Evan E. and
    Miga, Karen H. and Phillippy, Adam M.},
  title        = {The complete sequence of a human genome},
  elocation-id = {2021.05.26.445798},
  year         = 2021,
  doi          = {10.1101/2021.05.26.445798},
  publisher    = {Cold Spring Harbor Laboratory},
  abstract     = {In 2001, Celera Genomics and the International Human Genome
    Sequencing Consortium published their initial drafts of the
    human genome, which revolutionized the field of genomics.
    While these drafts and the updates that followed effectively
    covered the euchromatic fraction of the genome, the
    heterochromatin and many other complex regions were left
    unfinished or erroneous. Addressing this remaining 8\% of the
    genome, the Telomere-to-Telomere (T2T) Consortium has finished
    the first truly complete 3.055 billion base pair (bp) sequence
    of a human genome, representing the largest improvement to the
    human reference genome since its initial release. The new
    T2T-CHM13 reference includes gapless assemblies for all 22
    autosomes plus Chromosome X, corrects numerous errors, and
    introduces nearly 200 million bp of novel sequence containing
    2,226 paralogous gene copies, 115 of which are predicted to be
    protein coding. The newly completed regions include all
    centromeric satellite arrays and the short arms of all five
    acrocentric chromosomes, unlocking these complex regions of
    the genome to variational and functional studies for the first
    time. Competing Interest Statement: AF and CSC are employees of
    DNAnexus; IS, JK, MWH, PP, and AW are employees of Pacific
    Biosciences; FJS has received travel funds to speak at events
    hosted by Pacific Biosciences; SK and FJS have received travel
    funds to speak at events hosted by Oxford Nanopore
    Technologies. WT has licensed two patents to Oxford Nanopore
    Technologies (US 8748091 and 8394584).},
  url          = {https://www.biorxiv.org/content/early/2021/05/27/2021.05.26.445798},
  eprint       = {https://www.biorxiv.org/content/early/2021/05/27/2021.05.26.445798.full.pdf},
  journal      = {bioRxiv},
}
|
||
|
||
% Wang et al.: NanoReviser (Frontiers in Genetics, vol. 11).
% NOTE(review): `pages` holds a single number (900), presumably the article
% number rather than a page range; confirm against the publisher record.
@article{10.3389/fgene.2020.00900,
  author   = {Wang, Luotong and Qu, Li and Yang, Longshu and Wang, Yiying
    and Zhu, Huaiqiu},
  title    = {{NanoReviser}: An Error-Correction Tool for Nanopore
    Sequencing Based on a Deep Learning Algorithm},
  journal  = {Frontiers in Genetics},
  volume   = 11,
  pages    = 900,
  year     = 2020,
  url      = {https://www.frontiersin.org/article/10.3389/fgene.2020.00900},
  doi      = {10.3389/fgene.2020.00900},
  issn     = {1664-8021},
  abstract = {Nanopore sequencing is regarded as one of the most
    promising third-generation sequencing (TGS) technologies.
    Since 2014, Oxford Nanopore Technologies (ONT) has developed a
    series of devices based on nanopore sequencing to produce very
    long reads, with an expected impact on genomics. However, the
    nanopore sequencing reads are susceptible to a fairly high
    error rate owing to the difficulty in identifying the DNA
    bases from the complex electrical signals. Although several
    basecalling tools have been developed for nanopore sequencing
    over the past years, it is still challenging to correct the
    sequences after applying the basecalling procedure. In this
    study, we developed an open-source DNA basecalling reviser,
    NanoReviser, based on a deep learning algorithm to correct the
    basecalling errors introduced by current basecallers provided
    by default. In our module, we re-segmented the raw electrical
    signals based on the basecalled sequences provided by the
    default basecallers. By employing convolution neural networks
    (CNNs) and bidirectional long short-term memory (Bi-LSTM)
    networks, we took advantage of the information from the raw
    electrical signals and the basecalled sequences from the
    basecallers. Our results showed NanoReviser, as a
    post-basecalling reviser, significantly improving the
    basecalling quality. After being trained on standard ONT
    sequencing reads from public E. coli and human NA12878
    datasets, NanoReviser reduced the sequencing error rate by
    over 5\% for both the E. coli dataset and the human dataset.
    The performance of NanoReviser was found to be better than
    those of all current basecalling tools. Furthermore, we
    analyzed the modified bases of the E. coli dataset and added
    the methylation information to train our module. With the
    methylation annotation, NanoReviser reduced the error rate by
    7\% for the E. coli dataset and specifically reduced the error
    rate by over 10\% for the regions of the sequence rich in
    methylated bases. To the best of our knowledge, NanoReviser is
    the first post-processing tool after basecalling to accurately
    correct the nanopore sequences without the time-consuming
    procedure of building the consensus sequence. The NanoReviser
    package is freely available at
    https://github.com/pkubioinformatics/NanoReviser.},
}
|
||
|
||
|
||
|
||
% Davis et al.: SequencErr (Genome Biology, vol. 22, no. 1).
% NOTE(review): `pages` holds a single number (37), presumably the article
% number rather than a page range; confirm against the publisher record.
@article{Davis2021,
  author   = {Davis, Eric M. and Sun, Yu and Liu, Yanling and Kolekar,
    Pandurang and Shao, Ying and Szlachta, Karol and Mulder,
    Heather L. and Ren, Dongren and Rice, Stephen V. and Wang,
    Zhaoming and Nakitandwe, Joy and Gout, Alexander M. and
    Shaner, Bridget and Hall, Salina and Robison, Leslie L. and
    Pounds, Stanley and Klco, Jeffery M. and Easton, John and Ma,
    Xiaotu},
  title    = {{SequencErr}: measuring and suppressing sequencer errors in
    next-generation sequencing data},
  journal  = {Genome Biology},
  year     = 2021,
  month    = jan,
  day      = 25,
  volume   = 22,
  number   = 1,
  pages    = 37,
  abstract = {There is currently no method to precisely measure the
    errors that occur in the sequencing instrument/sequencer,
    which is critical for next-generation sequencing applications
    aimed at discovering the genetic makeup of heterogeneous
    cellular populations.},
  issn     = {1474-760X},
  doi      = {10.1186/s13059-020-02254-2},
  url      = {https://doi.org/10.1186/s13059-020-02254-2},
}
|
||
|
||
% Heather and Chain: history of DNA sequencing (Genomics, vol. 107, no. 1).
% The doi field stores the bare DOI; the resolver link belongs in url only.
@article{HEATHER20161,
  author   = {Heather, James M. and Chain, Benjamin},
  title    = {The sequence of sequencers: The history of sequencing {DNA}},
  journal  = {Genomics},
  volume   = 107,
  number   = 1,
  pages    = {1--8},
  year     = 2016,
  issn     = {0888-7543},
  doi      = {10.1016/j.ygeno.2015.11.003},
  url      = {https://www.sciencedirect.com/science/article/pii/S0888754315300410},
  keywords = {DNA, RNA, Sequencing, Sequencer, History},
  abstract = {Determining the order of nucleic acid residues in
    biological samples is an integral component of a wide variety
    of research applications. Over the last fifty years large
    numbers of researchers have applied themselves to the
    production of techniques and technologies to facilitate this
    feat, sequencing DNA and RNA molecules. This time-scale has
    witnessed tremendous changes, moving from sequencing short
    oligonucleotides to millions of bases, from struggling towards
    the deduction of the coding sequence of a single gene to rapid
    and widely available whole genome sequencing. This article
    traverses those years, iterating through the different
    generations of sequencing technology, highlighting some of the
    key discoveries, researchers, and sequences along the way.},
}
|
||
|
||
|
||
|
||
% van Dijk et al.: review of next-generation sequencing technology
% (Trends in Genetics, vol. 30, no. 9).
@article{vanDijk2014,
  author    = {van Dijk, Erwin L. and Auger, H{\'e}l{\`e}ne and
    Jaszczyszyn, Yan and Thermes, Claude},
  title     = {Ten years of next-generation sequencing technology},
  journal   = {Trends in Genetics},
  year      = 2014,
  month     = sep,
  day       = 01,
  publisher = {Elsevier},
  volume    = 30,
  number    = 9,
  pages     = {418--426},
  issn      = {0168-9525},
  doi       = {10.1016/j.tig.2014.07.001},
  url       = {https://doi.org/10.1016/j.tig.2014.07.001},
}
|
||
|
||
% Sanger, Nicklen and Coulson: chain-terminating DNA sequencing method
% (Proceedings of the National Academy of Sciences, vol. 74, no. 12).
% The phage name is written with $\phi$ so the entry stays pure ASCII.
@article{Sanger5463,
  author    = {Sanger, F. and Nicklen, S. and Coulson, A. R.},
  title     = {{DNA} sequencing with chain-terminating inhibitors},
  journal   = {Proceedings of the National Academy of Sciences},
  volume    = 74,
  number    = 12,
  pages     = {5463--5467},
  year      = 1977,
  doi       = {10.1073/pnas.74.12.5463},
  publisher = {National Academy of Sciences},
  abstract  = {A new method for determining nucleotide sequences in DNA is
    described. It is similar to the ``plus and
    minus'' method [Sanger, F. \& Coulson,
    A. R. (1975) J. Mol. Biol. 94, 441--448] but makes use of the
    2',3'-dideoxy and arabinonucleoside analogues of the normal
    deoxynucleoside triphosphates, which act as specific
    chain-terminating inhibitors of DNA polymerase. The technique
    has been applied to the DNA of bacteriophage $\phi$X174 and is more
    rapid and more accurate than either the plus or the minus
    method.},
  issn      = {0027-8424},
  url       = {https://www.pnas.org/content/74/12/5463},
  eprint    = {https://www.pnas.org/content/74/12/5463.full.pdf},
}
|