bachelor-thesis/assets/bibliography.bib

520 lines
28 KiB
BibTeX
Raw Normal View History

2021-06-26 18:05:40 +02:00
% Flagel et al., CNNs for population-genetic inference (MBE 36(2)).
% Stray commit-timestamp lines that had been pasted inside this entry
% were removed; they made the entry unparseable.
@article{10.1093/molbev/msy224,
  author   = {Flagel, Lex and Brandvain, Yaniv and Schrider, Daniel R.},
  title    = {The Unreasonable Effectiveness of Convolutional Neural
              Networks in Population Genetic Inference},
  journal  = {Molecular Biology and Evolution},
  volume   = 36,
  number   = 2,
  pages    = {220--238},
  year     = 2018,
  month    = dec,
  abstract = {Population-scale genomic data sets have given researchers
              incredible amounts of information from which to infer
              evolutionary histories. Concomitant with this flood of data,
              theoretical and methodological advances have sought to extract
              information from genomic sequences to infer demographic events
              such as population size changes and gene flow among closely
              related populations/species, construct recombination maps, and
              uncover loci underlying recent adaptation. To date, most
              methods make use of only one or a few summaries of the input
              sequences and therefore ignore potentially useful information
              encoded in the data. The most sophisticated of these
              approaches involve likelihood calculations, which require
              theoretical advances for each new problem, and often focus on
              a single aspect of the data (e.g., only allele frequency
              information) in the interest of mathematical and computational
              tractability. Directly interrogating the entirety of the input
              sequence data in a likelihood-free manner would thus offer a
              fruitful alternative. Here, we accomplish this by representing
              DNA sequence alignments as images and using a class of deep
              learning methods called convolutional neural networks (CNNs)
              to make population genetic inferences from these images. We
              apply CNNs to a number of evolutionary questions and find that
              they frequently match or exceed the accuracy of current
              methods. Importantly, we show that CNNs perform accurate
              evolutionary model selection and parameter estimation, even on
              problems that have not received detailed theoretical
              treatments. Thus, when applied to population genetic
              alignments, CNNs are capable of outperforming expert-derived
              statistical methods and offer a new path forward in cases
              where no likelihood approach exists.},
  issn     = {0737-4038},
  doi      = {10.1093/molbev/msy224},
  url      = {https://doi.org/10.1093/molbev/msy224},
  eprint   = {https://academic.oup.com/mbe/article-pdf/36/2/220/27736968/msy224.pdf},
}
2021-06-27 18:21:28 +02:00
% Robins et al., TCR beta-chain diversity (Blood 114(19)).
% A stray commit-timestamp line inside the entry was removed.
@article{pmid19706884,
  author  = {Robins, H. S. and Campregher, P. V. and Srivastava, S. K.
             and Wacher, A. and Turtle, C. J. and Kahsai, O. and Riddell,
             S. R. and Warren, E. H. and Carlson, C. S.},
  title   = {Comprehensive assessment of {T-cell} receptor beta-chain
             diversity in alphabeta {T} cells},
  journal = {Blood},
  year    = 2009,
  volume  = 114,
  number  = 19,
  pages   = {4099--4107},
  month   = nov,
}
% Nurk et al., T2T-CHM13 complete human genome (bioRxiv preprint).
@article{Nurk2021.05.26.445798,
  author       = {Nurk, Sergey and Koren, Sergey and Rhie, Arang and
                  Rautiainen, Mikko and Bzikadze, Andrey V. and Mikheenko, Alla
                  and Vollger, Mitchell R. and Altemose, Nicolas and Uralsky,
                  Lev and Gershman, Ariel and Aganezov, Sergey and Hoyt,
                  Savannah J. and Diekhans, Mark and Logsdon, Glennis A. and
                  Alonge, Michael and Antonarakis, Stylianos E. and Borchers,
                  Matthew and Bouffard, Gerard G. and Brooks, Shelise Y. and
                  Caldas, Gina V. and Cheng, Haoyu and Chin, Chen-Shan and Chow,
                  William and de Lima, Leonardo G. and Dishuck, Philip C. and
                  Durbin, Richard and Dvorkina, Tatiana and Fiddes, Ian T. and
                  Formenti, Giulio and Fulton, Robert S. and Fungtammasan,
                  Arkarachai and Garrison, Erik and Grady, Patrick G. S. and
                  Graves-Lindsay, Tina A. and Hall, Ira M. and Hansen, Nancy F.
                  and Hartley, Gabrielle A. and Haukness, Marina and Howe,
                  Kerstin and Hunkapiller, Michael W. and Jain, Chirag and Jain,
                  Miten and Jarvis, Erich D. and Kerpedjiev, Peter and Kirsche,
                  Melanie and Kolmogorov, Mikhail and Korlach, Jonas and
                  Kremitzki, Milinn and Li, Heng and Maduro, Valerie V. and
                  Marschall, Tobias and McCartney, Ann M. and McDaniel, Jennifer
                  and Miller, Danny E. and Mullikin, James C. and Myers, Eugene
                  W. and Olson, Nathan D. and Paten, Benedict and Peluso, Paul
                  and Pevzner, Pavel A. and Porubsky, David and Potapova, Tamara
                  and Rogaev, Evgeny I. and Rosenfeld, Jeffrey A. and Salzberg,
                  Steven L. and Schneider, Valerie A. and Sedlazeck, Fritz J.
                  and Shafin, Kishwar and Shew, Colin J. and Shumate, Alaina and
                  Sims, Yumi and Smit, Arian F. A. and Soto, Daniela C. and
                  Sovi{\'c}, Ivan and Storer, Jessica M. and Streets, Aaron and
                  Sullivan, Beth A. and Thibaud-Nissen, Fran{\c{c}}oise and
                  Torrance, James and Wagner, Justin and Walenz, Brian P. and
                  Wenger, Aaron and Wood, Jonathan M. D. and Xiao, Chunlin and
                  Yan, Stephanie M. and Young, Alice C. and Zarate, Samantha and
                  Surti, Urvashi and McCoy, Rajiv C. and Dennis, Megan Y. and
                  Alexandrov, Ivan A. and Gerton, Jennifer L. and
                  O'Neill, Rachel J. and Timp, Winston and Zook,
                  Justin M. and Schatz, Michael C. and Eichler, Evan E. and
                  Miga, Karen H. and Phillippy, Adam M.},
  title        = {The complete sequence of a human genome},
  journal      = {bioRxiv},
  publisher    = {Cold Spring Harbor Laboratory},
  year         = 2021,
  elocation-id = {2021.05.26.445798},
  doi          = {10.1101/2021.05.26.445798},
  url          = {https://www.biorxiv.org/content/early/2021/05/27/2021.05.26.445798},
  eprint       = {https://www.biorxiv.org/content/early/2021/05/27/2021.05.26.445798.full.pdf},
  abstract     = {In 2001, Celera Genomics and the International Human Genome
                  Sequencing Consortium published their initial drafts of the
                  human genome, which revolutionized the field of genomics.
                  While these drafts and the updates that followed effectively
                  covered the euchromatic fraction of the genome, the
                  heterochromatin and many other complex regions were left
                  unfinished or erroneous. Addressing this remaining 8\% of the
                  genome, the Telomere-to-Telomere (T2T) Consortium has finished
                  the first truly complete 3.055 billion base pair (bp) sequence
                  of a human genome, representing the largest improvement to the
                  human reference genome since its initial release. The new
                  T2T-CHM13 reference includes gapless assemblies for all 22
                  autosomes plus Chromosome X, corrects numerous errors, and
                  introduces nearly 200 million bp of novel sequence containing
                  2,226 paralogous gene copies, 115 of which are predicted to be
                  protein coding. The newly completed regions include all
                  centromeric satellite arrays and the short arms of all five
                  acrocentric chromosomes, unlocking these complex regions of
                  the genome to variational and functional studies for the first
                  time. Competing Interest Statement: AF and CSC are employees of
                  DNAnexus; IS, JK, MWH, PP, and AW are employees of Pacific
                  Biosciences; FJS has received travel funds to speak at events
                  hosted by Pacific Biosciences; SK and FJS have received travel
                  funds to speak at events hosted by Oxford Nanopore
                  Technologies. WT has licensed two patents to Oxford Nanopore
                  Technologies (US 8748091 and 8394584).},
}
% Wang et al., NanoReviser (Frontiers in Genetics 11:900).
% Percent signs in the abstract are escaped so the text is safe if a style
% prints it; the publisher's XML link markup was reduced to the plain URL.
@article{10.3389/fgene.2020.00900,
  author   = {Wang, Luotong and Qu, Li and Yang, Longshu and Wang, Yiying
              and Zhu, Huaiqiu},
  title    = {{NanoReviser}: An Error-Correction Tool for Nanopore
              Sequencing Based on a Deep Learning Algorithm},
  journal  = {Frontiers in Genetics},
  volume   = 11,
  pages    = 900,
  year     = 2020,
  url      = {https://www.frontiersin.org/article/10.3389/fgene.2020.00900},
  doi      = {10.3389/fgene.2020.00900},
  issn     = {1664-8021},
  abstract = {Nanopore sequencing is regarded as one of the most
              promising third-generation sequencing (TGS) technologies.
              Since 2014, Oxford Nanopore Technologies (ONT) has developed a
              series of devices based on nanopore sequencing to produce very
              long reads, with an expected impact on genomics. However, the
              nanopore sequencing reads are susceptible to a fairly high
              error rate owing to the difficulty in identifying the DNA
              bases from the complex electrical signals. Although several
              basecalling tools have been developed for nanopore sequencing
              over the past years, it is still challenging to correct the
              sequences after applying the basecalling procedure. In this
              study, we developed an open-source DNA basecalling reviser,
              NanoReviser, based on a deep learning algorithm to correct the
              basecalling errors introduced by current basecallers provided
              by default. In our module, we re-segmented the raw electrical
              signals based on the basecalled sequences provided by the
              default basecallers. By employing convolution neural networks
              (CNNs) and bidirectional long short-term memory (Bi-LSTM)
              networks, we took advantage of the information from the raw
              electrical signals and the basecalled sequences from the
              basecallers. Our results showed NanoReviser, as a
              post-basecalling reviser, significantly improving the
              basecalling quality. After being trained on standard ONT
              sequencing reads from public E. coli and human NA12878
              datasets, NanoReviser reduced the sequencing error rate by
              over 5\% for both the E. coli dataset and the human dataset.
              The performance of NanoReviser was found to be better than
              those of all current basecalling tools. Furthermore, we
              analyzed the modified bases of the E. coli dataset and added
              the methylation information to train our module. With the
              methylation annotation, NanoReviser reduced the error rate by
              7\% for the E. coli dataset and specifically reduced the error
              rate by over 10\% for the regions of the sequence rich in
              methylated bases. To the best of our knowledge, NanoReviser is
              the first post-processing tool after basecalling to accurately
              correct the nanopore sequences without the time-consuming
              procedure of building the consensus sequence. The NanoReviser
              package is freely available at
              https://github.com/pkubioinformatics/NanoReviser.},
}
% Heather and Chain, history of DNA sequencing (Genomics 107(1)).
@article{HEATHER20161,
  author   = {Heather, James M. and Chain, Benjamin},
  title    = {The sequence of sequencers: The history of sequencing {DNA}},
  journal  = {Genomics},
  volume   = 107,
  number   = 1,
  pages    = {1--8},
  year     = 2016,
  issn     = {0888-7543},
  doi      = {10.1016/j.ygeno.2015.11.003},
  url      = {https://www.sciencedirect.com/science/article/pii/S0888754315300410},
  keywords = {DNA, RNA, Sequencing, Sequencer, History},
  abstract = {Determining the order of nucleic acid residues in
              biological samples is an integral component of a wide variety
              of research applications. Over the last fifty years large
              numbers of researchers have applied themselves to the
              production of techniques and technologies to facilitate this
              feat, sequencing DNA and RNA molecules. This time-scale has
              witnessed tremendous changes, moving from sequencing short
              oligonucleotides to millions of bases, from struggling towards
              the deduction of the coding sequence of a single gene to rapid
              and widely available whole genome sequencing. This article
              traverses those years, iterating through the different
              generations of sequencing technology, highlighting some of the
              key discoveries, researchers, and sequences along the way.},
}

% van Dijk et al., ten years of NGS (Trends in Genetics 30(9)).
% A stray commit-timestamp line before the closing brace was removed.
@article{vanDijk2014,
  author    = {van Dijk, Erwin L. and Auger, H{\'e}l{\`e}ne and
               Jaszczyszyn, Yan and Thermes, Claude},
  title     = {Ten years of next-generation sequencing technology},
  journal   = {Trends in Genetics},
  year      = 2014,
  month     = sep,
  day       = 01,
  publisher = {Elsevier},
  volume    = 30,
  number    = 9,
  pages     = {418--426},
  issn      = {0168-9525},
  doi       = {10.1016/j.tig.2014.07.001},
  url       = {https://doi.org/10.1016/j.tig.2014.07.001},
}
2021-06-28 01:56:27 +02:00
% Sanger, Nicklen and Coulson, chain-termination sequencing (PNAS 74(12)).
@article{Sanger5463,
  author    = {Sanger, F. and Nicklen, S. and Coulson, A. R.},
  title     = {{DNA} sequencing with chain-terminating inhibitors},
  journal   = {Proceedings of the National Academy of Sciences},
  volume    = 74,
  number    = 12,
  pages     = {5463--5467},
  year      = 1977,
  publisher = {National Academy of Sciences},
  issn      = {0027-8424},
  doi       = {10.1073/pnas.74.12.5463},
  url       = {https://www.pnas.org/content/74/12/5463},
  eprint    = {https://www.pnas.org/content/74/12/5463.full.pdf},
  abstract  = {A new method for determining nucleotide sequences in DNA is
               described. It is similar to the ``plus and minus'' method
               [Sanger, F. \& Coulson, A. R. (1975) J. Mol. Biol. 94,
               441--448] but makes use of the 2',3'-dideoxy and
               arabinonucleoside analogues of the normal deoxynucleoside
               triphosphates, which act as specific chain-terminating
               inhibitors of DNA polymerase. The technique has been applied
               to the DNA of bacteriophage $\phi$X174 and is more rapid and
               more accurate than either the plus or the minus method.},
}
2021-06-28 19:01:25 +02:00

% IHGSC, finished euchromatic human genome sequence (Nature 431).
% The author is an institution: the extra pair of braces keeps the name as
% one indivisible unit instead of being split into first/last parts.
@article{InternationalHumanGenomeSequencingConsortium2004,
  author   = {{International Human Genome Sequencing Consortium}},
  title    = {Finishing the euchromatic sequence of the human genome},
  journal  = {Nature},
  year     = 2004,
  month    = oct,
  day      = 01,
  volume   = 431,
  number   = 7011,
  pages    = {931--945},
  abstract = {The sequence of the human genome encodes the genetic
              instructions for human physiology, as well as rich information
              about human evolution. In 2001, the International Human Genome
              Sequencing Consortium reported a draft sequence of the
              euchromatic portion of the human genome. Since then, the
              international collaboration has worked to convert this draft
              into a genome sequence with high accuracy and nearly complete
              coverage. Here, we report the result of this finishing
              process. The current genome sequence (Build 35) contains 2.85
              billion nucleotides interrupted by only 341 gaps. It covers
              99{\%} of the euchromatic genome and is accurate to an error
              rate of 1 event per 100,000 bases. Many of the remaining
              euchromatic gaps are associated with segmental duplications
              and will require focused work with new methods. The
              near-complete sequence, the first for a vertebrate, greatly
              improves the precision of biological analyses of the human
              genome including studies of gene number, birth and death.
              Notably, the human genome seems to encode only 20,000--25,000
              protein-coding genes. The genome sequence reported here should
              serve as a firm foundation for biomedical research in the
              decades ahead.},
  issn     = {1476-4687},
  doi      = {10.1038/nature03001},
  url      = {https://doi.org/10.1038/nature03001},
}

% Schloss, sequencing-cost commentary (Nature Biotechnology 26(10)).
@article{Schloss2008,
  author   = {Schloss, Jeffery A.},
  title    = {How to get genomes at one ten-thousandth the cost},
  journal  = {Nature Biotechnology},
  year     = 2008,
  month    = oct,
  day      = 01,
  volume   = 26,
  number   = 10,
  pages    = {1113--1115},
  abstract = {The NHGRI's Advanced DNA Sequencing Technology program is
              spearheading the development of platforms that will bring
              routine whole-genome sequencing closer to reality.},
  issn     = {1546-1696},
  doi      = {10.1038/nbt1008-1113},
  url      = {https://doi.org/10.1038/nbt1008-1113},
}
2021-06-29 02:44:36 +02:00
% Shugay et al., error-free immune repertoire profiling (Nature Methods 11(6)).
@article{Shugay2014,
  author   = {Shugay, Mikhail and Britanova, Olga V. and Merzlyak,
              Ekaterina M. and Turchaninova, Maria A. and Mamedov, Ilgar Z.
              and Tuganbaev, Timur R. and Bolotin, Dmitriy A. and
              Staroverov, Dmitry B. and Putintseva, Ekaterina V. and
              Plevova, Karla and Linnemann, Carsten and Shagin, Dmitriy and
              Pospisilova, Sarka and Lukyanov, Sergey and Schumacher, Ton N.
              and Chudakov, Dmitriy M.},
  title    = {Towards error-free profiling of immune repertoires},
  journal  = {Nature Methods},
  year     = 2014,
  month    = jun,
  day      = 01,
  volume   = 11,
  number   = 6,
  pages    = {653--655},
  abstract = {A two-step error correction process for high
              throughput--sequenced T- and B-cell receptors allows the
              elimination of most errors while not diminishing the natural
              complexity of the repertoires.},
  issn     = {1548-7105},
  doi      = {10.1038/nmeth.2960},
  url      = {https://doi.org/10.1038/nmeth.2960},
}
% Ma et al., NGS error profiles (Genome Biology 20(1), article number 50).
@article{Ma2019,
  author   = {Ma, Xiaotu and Shao, Ying and Tian, Liqing and Flasch,
              Diane A. and Mulder, Heather L. and Edmonson, Michael N. and
              Liu, Yu and Chen, Xiang and Newman, Scott and Nakitandwe, Joy
              and Li, Yongjin and Li, Benshang and Shen, Shuhong and Wang,
              Zhaoming and Shurtleff, Sheila and Robison, Leslie L. and
              Levy, Shawn and Easton, John and Zhang, Jinghui},
  title    = {Analysis of error profiles in deep next-generation
              sequencing data},
  journal  = {Genome Biology},
  year     = 2019,
  month    = mar,
  day      = 14,
  volume   = 20,
  number   = 1,
  pages    = 50,
  abstract = {Sequencing errors are key confounding factors for detecting
              low-frequency genetic variants that are important for cancer
              molecular diagnosis, treatment, and surveillance using deep
              next-generation sequencing (NGS). However, there is a lack of
              comprehensive understanding of errors introduced at various
              steps of a conventional NGS workflow, such as sample handling,
              library preparation, PCR enrichment, and sequencing. In this
              study, we use current NGS technology to systematically
              investigate these questions.},
  issn     = {1474-760X},
  doi      = {10.1186/s13059-019-1659-6},
}
% Master's thesis, Universidad de Granada. The citation key is left exactly
% as it was so existing citations keep resolving. The author is given in
% "Last, First" form because the surname has two words (Benitez Cantos).
@mastersthesis{BenítezCantos-Master,
  author = {Ben{\'i}tez Cantos, Mar{\'i}a Soledad},
  title  = {An{\'a}lisis de repertorios de receptores de c{\'e}lulas {T} a
            partir de datos de secuenciaci{\'o}n masiva},
  school = {Universidad de Granada},
  year   = 2019,
  month  = jul,
}
% Abbas, Lichtman and Pillai textbook. No chapter title is recorded, so this
% is stored as a whole-book reference with the book title in the title field
% (same shape as the other textbook entry in this file).
@book{abbas_lichtman_pillai_2017,
  author    = {Abbas, Abul K. and Lichtman, Andrew H. and Pillai, Shiv},
  title     = {Cellular and Molecular Immunology},
  edition   = {9th},
  publisher = {Elsevier},
  address   = {Philadelphia, PA},
  year      = 2017,
  pages     = 204,
}
2021-06-29 20:00:09 +02:00

% Crick, central dogma (Nature 227).
@article{CRICK1970,
  author   = {Crick, Francis},
  title    = {Central Dogma of Molecular Biology},
  journal  = {Nature},
  year     = 1970,
  month    = aug,
  day      = 01,
  volume   = 227,
  number   = 5258,
  pages    = {561--563},
  abstract = {The central dogma of molecular biology deals with the
              detailed residue-by-residue transfer of sequential
              information. It states that such information cannot be
              transferred from protein to either protein or nucleic acid.},
  issn     = {1476-4687},
  doi      = {10.1038/227561a0},
  url      = {https://doi.org/10.1038/227561a0},
}
% Salk, Schmitt and Loeb, high-accuracy NGS review (Nat. Rev. Genet. 19(5)).
% The abstract was exported from a bullet list; missing spaces between the
% fused sentences were restored.
@article{Salk2018,
  author   = {Salk, Jesse J. and Schmitt, Michael W. and Loeb, Lawrence
              A.},
  title    = {Enhancing the accuracy of next-generation sequencing for
              detecting rare and subclonal mutations},
  journal  = {Nature Reviews Genetics},
  year     = 2018,
  month    = may,
  day      = 01,
  volume   = 19,
  number   = 5,
  pages    = {269--285},
  abstract = {The ability to identify low-frequency genetic variants
              among heterogeneous populations of cells or DNA molecules is
              important in many fields of basic science, clinical medicine
              and other applications, yet current high-throughput DNA
              sequencing technologies have an error rate between 1 per 100
              and 1 per 1,000 base pairs sequenced, which obscures their
              presence below this level. As next-generation sequencing
              technologies evolved over the decade, throughput has improved
              markedly, but raw accuracy has remained generally unchanged.
              Researchers with a need for high accuracy developed data
              filtering methods and incremental biochemical improvements
              that modestly improve low-frequency variant detection, but
              background errors remain limiting in many fields. The most
              profoundly impactful means for reducing errors, first
              developed approximately 7 years ago, has been the concept of
              single-molecule consensus sequencing. This entails redundant
              sequencing of multiple copies of a given specific DNA molecule
              and discounting of variants that are not present in all or
              most of the copies as likely errors. Consensus sequencing can
              be achieved by labelling each molecule with a unique molecular
              barcode before generating copies, which allows subsequent
              comparison of these copies or schemes whereby copies are
              physically joined and sequenced together. Because of
              trade-offs in cost, time and accuracy, no single method is
              optimal for every application, and each method should be
              considered on a case-by-case basis. Major applications for
              high-accuracy DNA sequencing include non-invasive cancer
              diagnostics, cancer screening, early detection of cancer
              relapse or impending drug resistance, infectious disease
              applications, prenatal diagnostics, forensics and mutagenesis
              assessment. Future advances in ultra-high-accuracy sequencing
              are likely to be driven by an emerging generation of
              single-molecule sequencers, particularly those that allow
              independent sequence comparison of both strands of native DNA
              duplexes.},
  issn     = {1471-0064},
  doi      = {10.1038/nrg.2017.117},
  url      = {https://doi.org/10.1038/nrg.2017.117},
}
% Lehninger textbook, 5th edition. Authors must be separated by " and ";
% with commas the whole list was parsed as a single malformed name.
@book{book:lehninger,
  author    = {Lehninger, Albert and Nelson, David L. and Cox, Michael M.},
  title     = {{Lehninger} Principles of Biochemistry},
  edition   = {5th},
  publisher = {W. H. Freeman},
  year      = 2008,
  isbn      = {9781429224161},
  pages     = 276,
}
% Crick, "On protein synthesis", Symp. Soc. Exp. Biol. 12:138-163 (1958).
% The export had put the page range into the number field and a bogus value
% into pages; the range now lives in pages.
@inproceedings{crick1958protein,
  author    = {Crick, Francis H. C.},
  title     = {On protein synthesis},
  booktitle = {Symposia of the Society for Experimental Biology},
  volume    = 12,
  pages     = {138--163},
  year      = 1958,
}
% Lee, consensus sequences from partial order alignment graphs
% (Bioinformatics 19(8)). Section labels that the export had fused onto the
% preceding sentences of the abstract were separated again.
@article{10.1093/bioinformatics/btg109,
  author   = {Lee, Christopher},
  title    = {Generating consensus sequences from partial order multiple
              sequence alignment graphs},
  journal  = {Bioinformatics},
  volume   = 19,
  number   = 8,
  pages    = {999--1008},
  year     = 2003,
  month    = may,
  abstract = {Motivation: Consensus sequence generation is important in
              many kinds of sequence analysis ranging from sequence assembly
              to profile-based iterative search methods. However, how can a
              consensus be constructed when its inherent assumption---that the
              aligned sequences form a single linear consensus---is not
              true? Results: Partial Order Alignment (POA) enables
              construction and analysis of multiple sequence alignments as
              directed acyclic graphs containing complex branching
              structure. Here we present a dynamic programming algorithm
              (heaviest\_bundle) for generating multiple consensus sequences
              from such complex alignments. The number and relationships of
              these consensus sequences reveals the degree of structural
              complexity of the source alignment. This is a powerful and
              general approach for analyzing and visualizing complex
              alignment structures, and can be applied to any alignment. We
              illustrate its value for analyzing expressed sequence
              alignments to detect alternative splicing, reconstruct full
              length mRNA isoform sequences from EST fragments, and separate
              paralog mixtures that can cause incorrect SNP
              predictions. Availability: The heaviest\_bundle source code is
              available at http://www.bioinformatics.ucla.edu/poa Contact:
              leec@mbi.ucla.edu *To whom correspondence should be
              addressed.},
  issn     = {1367-4803},
  doi      = {10.1093/bioinformatics/btg109},
  url      = {https://doi.org/10.1093/bioinformatics/btg109},
  eprint   = {https://academic.oup.com/bioinformatics/article-pdf/19/8/999/642375/btg109.pdf},
}