2021-06-26 18:05:40 +02:00
|
|
|
|
@article{10.1093/molbev/msy224,
  author   = {Flagel, Lex and Brandvain, Yaniv and Schrider, Daniel R.},
  title    = {The Unreasonable Effectiveness of Convolutional Neural
              Networks in Population Genetic Inference},
  journal  = {Molecular Biology and Evolution},
  volume   = 36,
  number   = 2,
  pages    = {220--238},
  year     = 2018,
  month    = dec,
  abstract = {Population-scale genomic data sets have given researchers
              incredible amounts of information from which to infer
              evolutionary histories. Concomitant with this flood of data,
              theoretical and methodological advances have sought to extract
              information from genomic sequences to infer demographic events
              such as population size changes and gene flow among closely
              related populations/species, construct recombination maps, and
              uncover loci underlying recent adaptation. To date, most
              methods make use of only one or a few summaries of the input
              sequences and therefore ignore potentially useful information
              encoded in the data. The most sophisticated of these
              approaches involve likelihood calculations, which require
              theoretical advances for each new problem, and often focus on
              a single aspect of the data (e.g., only allele frequency
              information) in the interest of mathematical and computational
              tractability. Directly interrogating the entirety of the input
              sequence data in a likelihood-free manner would thus offer a
              fruitful alternative. Here, we accomplish this by representing
              DNA sequence alignments as images and using a class of deep
              learning methods called convolutional neural networks (CNNs)
              to make population genetic inferences from these images. We
              apply CNNs to a number of evolutionary questions and find that
              they frequently match or exceed the accuracy of current
              methods. Importantly, we show that CNNs perform accurate
              evolutionary model selection and parameter estimation, even on
              problems that have not received detailed theoretical
              treatments. Thus, when applied to population genetic
              alignments, CNNs are capable of outperforming expert-derived
              statistical methods and offer a new path forward in cases
              where no likelihood approach exists.},
  issn     = {0737-4038},
  doi      = {10.1093/molbev/msy224},
  url      = {https://doi.org/10.1093/molbev/msy224},
  eprint   = {https://academic.oup.com/mbe/article-pdf/36/2/220/27736968/msy224.pdf},
}
|
|
|
|
|
|
2021-06-27 18:21:28 +02:00
|
|
|
|
@article{pmid19706884,
  author  = {Robins, H. S. and Campregher, P. V. and Srivastava, S. K.
             and Wacher, A. and Turtle, C. J. and Kahsai, O. and Riddell,
             S. R. and Warren, E. H. and Carlson, C. S.},
  title   = {Comprehensive assessment of {T-cell} receptor {$\beta$}-chain
             diversity in {$\alpha\beta$} {T} cells},
  journal = {Blood},
  year    = 2009,
  volume  = 114,
  number  = 19,
  pages   = {4099--4107},
  month   = nov,
}
|
|
|
|
|
|
|
|
|
|
@article{Nurk2021.05.26.445798,
  author       = {Nurk, Sergey and Koren, Sergey and Rhie, Arang and
                  Rautiainen, Mikko and Bzikadze, Andrey V. and Mikheenko, Alla
                  and Vollger, Mitchell R. and Altemose, Nicolas and Uralsky,
                  Lev and Gershman, Ariel and Aganezov, Sergey and Hoyt,
                  Savannah J. and Diekhans, Mark and Logsdon, Glennis A. and
                  Alonge, Michael and Antonarakis, Stylianos E. and Borchers,
                  Matthew and Bouffard, Gerard G. and Brooks, Shelise Y. and
                  Caldas, Gina V. and Cheng, Haoyu and Chin, Chen-Shan and Chow,
                  William and de Lima, Leonardo G. and Dishuck, Philip C. and
                  Durbin, Richard and Dvorkina, Tatiana and Fiddes, Ian T. and
                  Formenti, Giulio and Fulton, Robert S. and Fungtammasan,
                  Arkarachai and Garrison, Erik and Grady, Patrick G. S. and
                  Graves-Lindsay, Tina A. and Hall, Ira M. and Hansen, Nancy F.
                  and Hartley, Gabrielle A. and Haukness, Marina and Howe,
                  Kerstin and Hunkapiller, Michael W. and Jain, Chirag and Jain,
                  Miten and Jarvis, Erich D. and Kerpedjiev, Peter and Kirsche,
                  Melanie and Kolmogorov, Mikhail and Korlach, Jonas and
                  Kremitzki, Milinn and Li, Heng and Maduro, Valerie V. and
                  Marschall, Tobias and McCartney, Ann M. and McDaniel, Jennifer
                  and Miller, Danny E. and Mullikin, James C. and Myers, Eugene
                  W. and Olson, Nathan D. and Paten, Benedict and Peluso, Paul
                  and Pevzner, Pavel A. and Porubsky, David and Potapova, Tamara
                  and Rogaev, Evgeny I. and Rosenfeld, Jeffrey A. and Salzberg,
                  Steven L. and Schneider, Valerie A. and Sedlazeck, Fritz J.
                  and Shafin, Kishwar and Shew, Colin J. and Shumate, Alaina and
                  Sims, Yumi and Smit, Arian F. A. and Soto, Daniela C. and
                  Sovi{\'c}, Ivan and Storer, Jessica M. and Streets, Aaron and
                  Sullivan, Beth A. and Thibaud-Nissen, Fran{\c{c}}oise and
                  Torrance, James and Wagner, Justin and Walenz, Brian P. and
                  Wenger, Aaron and Wood, Jonathan M. D. and Xiao, Chunlin and
                  Yan, Stephanie M. and Young, Alice C. and Zarate, Samantha and
                  Surti, Urvashi and McCoy, Rajiv C. and Dennis, Megan Y. and
                  Alexandrov, Ivan A. and Gerton, Jennifer L. and
                  O'Neill, Rachel J. and Timp, Winston and Zook,
                  Justin M. and Schatz, Michael C. and Eichler, Evan E. and
                  Miga, Karen H. and Phillippy, Adam M.},
  title        = {The complete sequence of a human genome},
  elocation-id = {2021.05.26.445798},
  year         = 2021,
  doi          = {10.1101/2021.05.26.445798},
  publisher    = {Cold Spring Harbor Laboratory},
  abstract     = {In 2001, Celera Genomics and the International Human Genome
                  Sequencing Consortium published their initial drafts of the
                  human genome, which revolutionized the field of genomics.
                  While these drafts and the updates that followed effectively
                  covered the euchromatic fraction of the genome, the
                  heterochromatin and many other complex regions were left
                  unfinished or erroneous. Addressing this remaining 8\% of the
                  genome, the Telomere-to-Telomere (T2T) Consortium has finished
                  the first truly complete 3.055 billion base pair (bp) sequence
                  of a human genome, representing the largest improvement to the
                  human reference genome since its initial release. The new
                  T2T-CHM13 reference includes gapless assemblies for all 22
                  autosomes plus Chromosome X, corrects numerous errors, and
                  introduces nearly 200 million bp of novel sequence containing
                  2,226 paralogous gene copies, 115 of which are predicted to be
                  protein coding. The newly completed regions include all
                  centromeric satellite arrays and the short arms of all five
                  acrocentric chromosomes, unlocking these complex regions of
                  the genome to variational and functional studies for the first
                  time. Competing Interest Statement: AF and CSC are employees of
                  DNAnexus; IS, JK, MWH, PP, and AW are employees of Pacific
                  Biosciences; FJS has received travel funds to speak at events
                  hosted by Pacific Biosciences; SK and FJS have received travel
                  funds to speak at events hosted by Oxford Nanopore
                  Technologies. WT has licensed two patents to Oxford Nanopore
                  Technologies (US 8748091 and 8394584).},
  url          = {https://www.biorxiv.org/content/early/2021/05/27/2021.05.26.445798},
  eprint       = {https://www.biorxiv.org/content/early/2021/05/27/2021.05.26.445798.full.pdf},
  journal      = {bioRxiv},
}
|
|
|
|
|
|
|
|
|
|
@article{10.3389/fgene.2020.00900,
  author   = {Wang, Luotong and Qu, Li and Yang, Longshu and Wang, Yiying
              and Zhu, Huaiqiu},
  title    = {{NanoReviser}: An Error-Correction Tool for Nanopore
              Sequencing Based on a Deep Learning Algorithm},
  journal  = {Frontiers in Genetics},
  volume   = 11,
  pages    = 900,
  year     = 2020,
  url      = {https://www.frontiersin.org/article/10.3389/fgene.2020.00900},
  doi      = {10.3389/fgene.2020.00900},
  issn     = {1664-8021},
  abstract = {Nanopore sequencing is regarded as one of the most
              promising third-generation sequencing (TGS) technologies.
              Since 2014, Oxford Nanopore Technologies (ONT) has developed a
              series of devices based on nanopore sequencing to produce very
              long reads, with an expected impact on genomics. However, the
              nanopore sequencing reads are susceptible to a fairly high
              error rate owing to the difficulty in identifying the DNA
              bases from the complex electrical signals. Although several
              basecalling tools have been developed for nanopore sequencing
              over the past years, it is still challenging to correct the
              sequences after applying the basecalling procedure. In this
              study, we developed an open-source DNA basecalling reviser,
              NanoReviser, based on a deep learning algorithm to correct the
              basecalling errors introduced by current basecallers provided
              by default. In our module, we re-segmented the raw electrical
              signals based on the basecalled sequences provided by the
              default basecallers. By employing convolution neural networks
              (CNNs) and bidirectional long short-term memory (Bi-LSTM)
              networks, we took advantage of the information from the raw
              electrical signals and the basecalled sequences from the
              basecallers. Our results showed NanoReviser, as a
              post-basecalling reviser, significantly improving the
              basecalling quality. After being trained on standard ONT
              sequencing reads from public E. coli and human NA12878
              datasets, NanoReviser reduced the sequencing error rate by
              over 5\% for both the E. coli dataset and the human dataset.
              The performance of NanoReviser was found to be better than
              those of all current basecalling tools. Furthermore, we
              analyzed the modified bases of the E. coli dataset and added
              the methylation information to train our module. With the
              methylation annotation, NanoReviser reduced the error rate by
              7\% for the E. coli dataset and specifically reduced the error
              rate by over 10\% for the regions of the sequence rich in
              methylated bases. To the best of our knowledge, NanoReviser is
              the first post-processing tool after basecalling to accurately
              correct the nanopore sequences without the time-consuming
              procedure of building the consensus sequence. The NanoReviser
              package is freely available at
              https://github.com/pkubioinformatics/NanoReviser.},
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@article{Davis2021,
  author   = {Davis, Eric M. and Sun, Yu and Liu, Yanling and Kolekar,
              Pandurang and Shao, Ying and Szlachta, Karol and Mulder,
              Heather L. and Ren, Dongren and Rice, Stephen V. and Wang,
              Zhaoming and Nakitandwe, Joy and Gout, Alexander M. and
              Shaner, Bridget and Hall, Salina and Robison, Leslie L. and
              Pounds, Stanley and Klco, Jeffery M. and Easton, John and Ma,
              Xiaotu},
  title    = {{SequencErr}: measuring and suppressing sequencer errors in
              next-generation sequencing data},
  journal  = {Genome Biology},
  year     = 2021,
  month    = jan,
  day      = 25,
  volume   = 22,
  number   = 1,
  pages    = 37,
  abstract = {There is currently no method to precisely measure the
              errors that occur in the sequencing instrument/sequencer,
              which is critical for next-generation sequencing applications
              aimed at discovering the genetic makeup of heterogeneous
              cellular populations.},
  issn     = {1474-760X},
  doi      = {10.1186/s13059-020-02254-2},
  url      = {https://doi.org/10.1186/s13059-020-02254-2},
}
|
|
|
|
|
|
|
|
|
|
@article{HEATHER20161,
  author   = {Heather, James M. and Chain, Benjamin},
  title    = {The sequence of sequencers: The history of sequencing {DNA}},
  journal  = {Genomics},
  volume   = 107,
  number   = 1,
  pages    = {1--8},
  year     = 2016,
  issn     = {0888-7543},
  doi      = {10.1016/j.ygeno.2015.11.003},
  url      = {https://www.sciencedirect.com/science/article/pii/S0888754315300410},
  keywords = {DNA, RNA, Sequencing, Sequencer, History},
  abstract = {Determining the order of nucleic acid residues in
              biological samples is an integral component of a wide variety
              of research applications. Over the last fifty years large
              numbers of researchers have applied themselves to the
              production of techniques and technologies to facilitate this
              feat, sequencing DNA and RNA molecules. This time-scale has
              witnessed tremendous changes, moving from sequencing short
              oligonucleotides to millions of bases, from struggling towards
              the deduction of the coding sequence of a single gene to rapid
              and widely available whole genome sequencing. This article
              traverses those years, iterating through the different
              generations of sequencing technology, highlighting some of the
              key discoveries, researchers, and sequences along the way.},
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@article{vanDijk2014,
  author    = {van Dijk, Erwin L. and Auger, H{\'e}l{\`e}ne and
               Jaszczyszyn, Yan and Thermes, Claude},
  title     = {Ten years of next-generation sequencing technology},
  journal   = {Trends in Genetics},
  year      = 2014,
  month     = sep,
  day       = {01},
  publisher = {Elsevier},
  volume    = 30,
  number    = 9,
  pages     = {418--426},
  issn      = {0168-9525},
  doi       = {10.1016/j.tig.2014.07.001},
  url       = {https://doi.org/10.1016/j.tig.2014.07.001},
}
|
2021-06-28 01:56:27 +02:00
|
|
|
|
|
|
|
|
|
@article{Sanger5463,
  author    = {Sanger, F. and Nicklen, S. and Coulson, A. R.},
  title     = {{DNA} sequencing with chain-terminating inhibitors},
  journal   = {Proceedings of the National Academy of Sciences},
  volume    = 74,
  number    = 12,
  pages     = {5463--5467},
  year      = 1977,
  doi       = {10.1073/pnas.74.12.5463},
  publisher = {National Academy of Sciences},
  abstract  = {A new method for determining nucleotide sequences in DNA is
               described. It is similar to the ``plus and
               minus'' method [Sanger, F. \& Coulson,
               A. R. (1975) J. Mol. Biol. 94, 441--448] but makes use of the
               2',3'-dideoxy and arabinonucleoside analogues of the normal
               deoxynucleoside triphosphates, which act as specific
               chain-terminating inhibitors of DNA polymerase. The technique
               has been applied to the DNA of bacteriophage $\phi$X174 and is more
               rapid and more accurate than either the plus or the minus
               method.},
  issn      = {0027-8424},
  url       = {https://www.pnas.org/content/74/12/5463},
  eprint    = {https://www.pnas.org/content/74/12/5463.full.pdf},
}
|
2021-06-28 19:01:25 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@article{InternationalHumanGenomeSequencingConsortium2004,
  author   = {{International Human Genome Sequencing Consortium}},
  title    = {Finishing the euchromatic sequence of the human genome},
  journal  = {Nature},
  year     = 2004,
  month    = oct,
  day      = {01},
  volume   = 431,
  number   = 7011,
  pages    = {931--945},
  abstract = {The sequence of the human genome encodes the genetic
              instructions for human physiology, as well as rich information
              about human evolution. In 2001, the International Human Genome
              Sequencing Consortium reported a draft sequence of the
              euchromatic portion of the human genome. Since then, the
              international collaboration has worked to convert this draft
              into a genome sequence with high accuracy and nearly complete
              coverage. Here, we report the result of this finishing
              process. The current genome sequence (Build 35) contains 2.85
              billion nucleotides interrupted by only 341 gaps. It covers
              $\sim$99\% of the euchromatic genome and is accurate to an error
              rate of $\sim$1 event per 100,000 bases. Many of the remaining
              euchromatic gaps are associated with segmental duplications
              and will require focused work with new methods. The
              near-complete sequence, the first for a vertebrate, greatly
              improves the precision of biological analyses of the human
              genome including studies of gene number, birth and death.
              Notably, the human genome seems to encode only 20,000--25,000
              protein-coding genes. The genome sequence reported here should
              serve as a firm foundation for biomedical research in the
              decades ahead.},
  issn     = {1476-4687},
  doi      = {10.1038/nature03001},
  url      = {https://doi.org/10.1038/nature03001},
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@article{Schloss2008,
  author   = {Schloss, Jeffery A.},
  title    = {How to get genomes at one ten-thousandth the cost},
  journal  = {Nature Biotechnology},
  year     = 2008,
  month    = oct,
  day      = {01},
  volume   = 26,
  number   = 10,
  pages    = {1113--1115},
  abstract = {The NHGRI's Advanced DNA Sequencing Technology program is
              spearheading the development of platforms that will bring
              routine whole-genome sequencing closer to reality.},
  issn     = {1546-1696},
  doi      = {10.1038/nbt1008-1113},
  url      = {https://doi.org/10.1038/nbt1008-1113},
}
|