% BibTeX bibliography database (file-viewer header: 565 lines, 31 KiB).
% Flagel, Brandvain and Schrider: convolutional neural networks for population
% genetic inference (Molecular Biology and Evolution 36(2)).
% year/month are kept as exported by the publisher.
@article{10.1093/molbev/msy224,
  author   = {Flagel, Lex and Brandvain, Yaniv and Schrider, Daniel R.},
  title    = {The Unreasonable Effectiveness of Convolutional Neural
              Networks in Population Genetic Inference},
  journal  = {Molecular Biology and Evolution},
  volume   = 36,
  number   = 2,
  pages    = {220--238},
  year     = 2018,
  month    = dec,
  abstract = {Population-scale genomic data sets have given researchers
              incredible amounts of information from which to infer
              evolutionary histories. Concomitant with this flood of data,
              theoretical and methodological advances have sought to extract
              information from genomic sequences to infer demographic events
              such as population size changes and gene flow among closely
              related populations/species, construct recombination maps, and
              uncover loci underlying recent adaptation. To date, most
              methods make use of only one or a few summaries of the input
              sequences and therefore ignore potentially useful information
              encoded in the data. The most sophisticated of these
              approaches involve likelihood calculations, which require
              theoretical advances for each new problem, and often focus on
              a single aspect of the data (e.g., only allele frequency
              information) in the interest of mathematical and computational
              tractability. Directly interrogating the entirety of the input
              sequence data in a likelihood-free manner would thus offer a
              fruitful alternative. Here, we accomplish this by representing
              DNA sequence alignments as images and using a class of deep
              learning methods called convolutional neural networks (CNNs)
              to make population genetic inferences from these images. We
              apply CNNs to a number of evolutionary questions and find that
              they frequently match or exceed the accuracy of current
              methods. Importantly, we show that CNNs perform accurate
              evolutionary model selection and parameter estimation, even on
              problems that have not received detailed theoretical
              treatments. Thus, when applied to population genetic
              alignments, CNNs are capable of outperforming expert-derived
              statistical methods and offer a new path forward in cases
              where no likelihood approach exists.},
  issn     = {0737-4038},
  doi      = {10.1093/molbev/msy224},
  url      = {https://doi.org/10.1093/molbev/msy224},
  eprint   = {https://academic.oup.com/mbe/article-pdf/36/2/220/27736968/msy224.pdf},
}
|
||
|
||
% Robins et al.: T-cell receptor beta-chain diversity (Blood 114(19)).
% Only the words that must keep their capitals are brace-protected.
@article{pmid19706884,
  author  = {Robins, H. S. and Campregher, P. V. and Srivastava, S. K.
             and Wacher, A. and Turtle, C. J. and Kahsai, O. and Riddell,
             S. R. and Warren, E. H. and Carlson, C. S.},
  title   = {Comprehensive assessment of {T-cell} receptor beta-chain
             diversity in alphabeta {T} cells},
  journal = {Blood},
  year    = 2009,
  volume  = 114,
  number  = 19,
  pages   = {4099--4107},
  month   = nov,
}
|
||
|
||
% Nurk et al. (T2T Consortium): the complete sequence of a human genome
% (bioRxiv preprint). The competing-interest statement that the export fused
% onto the abstract is kept, but separated from the abstract's last sentence.
@article{Nurk2021.05.26.445798,
  author       = {Nurk, Sergey and Koren, Sergey and Rhie, Arang and
                  Rautiainen, Mikko and Bzikadze, Andrey V. and Mikheenko, Alla
                  and Vollger, Mitchell R. and Altemose, Nicolas and Uralsky,
                  Lev and Gershman, Ariel and Aganezov, Sergey and Hoyt,
                  Savannah J. and Diekhans, Mark and Logsdon, Glennis A. and
                  Alonge, Michael and Antonarakis, Stylianos E. and Borchers,
                  Matthew and Bouffard, Gerard G. and Brooks, Shelise Y. and
                  Caldas, Gina V. and Cheng, Haoyu and Chin, Chen-Shan and Chow,
                  William and de Lima, Leonardo G. and Dishuck, Philip C. and
                  Durbin, Richard and Dvorkina, Tatiana and Fiddes, Ian T. and
                  Formenti, Giulio and Fulton, Robert S. and Fungtammasan,
                  Arkarachai and Garrison, Erik and Grady, Patrick G. S. and
                  Graves-Lindsay, Tina A. and Hall, Ira M. and Hansen, Nancy F.
                  and Hartley, Gabrielle A. and Haukness, Marina and Howe,
                  Kerstin and Hunkapiller, Michael W. and Jain, Chirag and Jain,
                  Miten and Jarvis, Erich D. and Kerpedjiev, Peter and Kirsche,
                  Melanie and Kolmogorov, Mikhail and Korlach, Jonas and
                  Kremitzki, Milinn and Li, Heng and Maduro, Valerie V. and
                  Marschall, Tobias and McCartney, Ann M. and McDaniel, Jennifer
                  and Miller, Danny E. and Mullikin, James C. and Myers, Eugene
                  W. and Olson, Nathan D. and Paten, Benedict and Peluso, Paul
                  and Pevzner, Pavel A. and Porubsky, David and Potapova, Tamara
                  and Rogaev, Evgeny I. and Rosenfeld, Jeffrey A. and Salzberg,
                  Steven L. and Schneider, Valerie A. and Sedlazeck, Fritz J.
                  and Shafin, Kishwar and Shew, Colin J. and Shumate, Alaina and
                  Sims, Yumi and Smit, Arian F. A. and Soto, Daniela C. and
                  Sovi{\'c}, Ivan and Storer, Jessica M. and Streets, Aaron and
                  Sullivan, Beth A. and Thibaud-Nissen, Fran{\c{c}}oise and
                  Torrance, James and Wagner, Justin and Walenz, Brian P. and
                  Wenger, Aaron and Wood, Jonathan M. D. and Xiao, Chunlin and
                  Yan, Stephanie M. and Young, Alice C. and Zarate, Samantha and
                  Surti, Urvashi and McCoy, Rajiv C. and Dennis, Megan Y. and
                  Alexandrov, Ivan A. and Gerton, Jennifer L. and
                  O'Neill, Rachel J. and Timp, Winston and Zook,
                  Justin M. and Schatz, Michael C. and Eichler, Evan E. and
                  Miga, Karen H. and Phillippy, Adam M.},
  title        = {The complete sequence of a human genome},
  elocation-id = {2021.05.26.445798},
  year         = 2021,
  doi          = {10.1101/2021.05.26.445798},
  publisher    = {Cold Spring Harbor Laboratory},
  abstract     = {In 2001, Celera Genomics and the International Human Genome
                  Sequencing Consortium published their initial drafts of the
                  human genome, which revolutionized the field of genomics.
                  While these drafts and the updates that followed effectively
                  covered the euchromatic fraction of the genome, the
                  heterochromatin and many other complex regions were left
                  unfinished or erroneous. Addressing this remaining 8\% of the
                  genome, the Telomere-to-Telomere (T2T) Consortium has finished
                  the first truly complete 3.055 billion base pair (bp) sequence
                  of a human genome, representing the largest improvement to the
                  human reference genome since its initial release. The new
                  T2T-CHM13 reference includes gapless assemblies for all 22
                  autosomes plus Chromosome X, corrects numerous errors, and
                  introduces nearly 200 million bp of novel sequence containing
                  2,226 paralogous gene copies, 115 of which are predicted to be
                  protein coding. The newly completed regions include all
                  centromeric satellite arrays and the short arms of all five
                  acrocentric chromosomes, unlocking these complex regions of
                  the genome to variational and functional studies for the first
                  time. Competing Interest Statement: AF and CSC are employees of
                  DNAnexus; IS, JK, MWH, PP, and AW are employees of Pacific
                  Biosciences; FJS has received travel funds to speak at events
                  hosted by Pacific Biosciences; SK and FJS have received travel
                  funds to speak at events hosted by Oxford Nanopore
                  Technologies. WT has licensed two patents to Oxford Nanopore
                  Technologies (US 8748091 and 8394584).},
  url          = {https://www.biorxiv.org/content/early/2021/05/27/2021.05.26.445798},
  eprint       = {https://www.biorxiv.org/content/early/2021/05/27/2021.05.26.445798.full.pdf},
  journal      = {bioRxiv},
}
|
||
|
||
% Wang et al.: NanoReviser, a deep-learning error-correction tool for nanopore
% basecalls (Frontiers in Genetics 11, article 900).
% Percent signs in the abstract are escaped so the text is safe if a style
% prints it; the publisher's XML link markup is reduced to the plain URL.
@article{10.3389/fgene.2020.00900,
  author   = {Wang, Luotong and Qu, Li and Yang, Longshu and Wang, Yiying
              and Zhu, Huaiqiu},
  title    = {{NanoReviser}: An Error-Correction Tool for Nanopore
              Sequencing Based on a Deep Learning Algorithm},
  journal  = {Frontiers in Genetics},
  volume   = 11,
  pages    = {900},
  year     = 2020,
  url      = {https://www.frontiersin.org/article/10.3389/fgene.2020.00900},
  doi      = {10.3389/fgene.2020.00900},
  issn     = {1664-8021},
  abstract = {Nanopore sequencing is regarded as one of the most
              promising third-generation sequencing (TGS) technologies.
              Since 2014, Oxford Nanopore Technologies (ONT) has developed a
              series of devices based on nanopore sequencing to produce very
              long reads, with an expected impact on genomics. However, the
              nanopore sequencing reads are susceptible to a fairly high
              error rate owing to the difficulty in identifying the DNA
              bases from the complex electrical signals. Although several
              basecalling tools have been developed for nanopore sequencing
              over the past years, it is still challenging to correct the
              sequences after applying the basecalling procedure. In this
              study, we developed an open-source DNA basecalling reviser,
              NanoReviser, based on a deep learning algorithm to correct the
              basecalling errors introduced by current basecallers provided
              by default. In our module, we re-segmented the raw electrical
              signals based on the basecalled sequences provided by the
              default basecallers. By employing convolution neural networks
              (CNNs) and bidirectional long short-term memory (Bi-LSTM)
              networks, we took advantage of the information from the raw
              electrical signals and the basecalled sequences from the
              basecallers. Our results showed NanoReviser, as a
              post-basecalling reviser, significantly improving the
              basecalling quality. After being trained on standard ONT
              sequencing reads from public E. coli and human NA12878
              datasets, NanoReviser reduced the sequencing error rate by
              over 5\% for both the E. coli dataset and the human dataset.
              The performance of NanoReviser was found to be better than
              those of all current basecalling tools. Furthermore, we
              analyzed the modified bases of the E. coli dataset and added
              the methylation information to train our module. With the
              methylation annotation, NanoReviser reduced the error rate by
              7\% for the E. coli dataset and specifically reduced the error
              rate by over 10\% for the regions of the sequence rich in
              methylated bases. To the best of our knowledge, NanoReviser is
              the first post-processing tool after basecalling to accurately
              correct the nanopore sequences without the time-consuming
              procedure of building the consensus sequence. The NanoReviser
              package is freely available at
              https://github.com/pkubioinformatics/NanoReviser.},
}
|
||
|
||
% Heather and Chain: history of DNA sequencing technology (Genomics 107(1)).
% doi holds the bare identifier; the resolver URL belongs to the style.
@article{HEATHER20161,
  author   = {Heather, James M. and Chain, Benjamin},
  title    = {The sequence of sequencers: The history of sequencing {DNA}},
  journal  = {Genomics},
  volume   = 107,
  number   = 1,
  pages    = {1--8},
  year     = 2016,
  issn     = {0888-7543},
  doi      = {10.1016/j.ygeno.2015.11.003},
  url      = {https://www.sciencedirect.com/science/article/pii/S0888754315300410},
  keywords = {DNA, RNA, Sequencing, Sequencer, History},
  abstract = {Determining the order of nucleic acid residues in
              biological samples is an integral component of a wide variety
              of research applications. Over the last fifty years large
              numbers of researchers have applied themselves to the
              production of techniques and technologies to facilitate this
              feat, sequencing DNA and RNA molecules. This time-scale has
              witnessed tremendous changes, moving from sequencing short
              oligonucleotides to millions of bases, from struggling towards
              the deduction of the coding sequence of a single gene to rapid
              and widely available whole genome sequencing. This article
              traverses those years, iterating through the different
              generations of sequencing technology, highlighting some of the
              key discoveries, researchers, and sequences along the way.},
}
|
||
|
||
|
||
|
||
% van Dijk et al.: ten-year review of next-generation sequencing
% (Trends in Genetics 30(9)).
@article{vanDijk2014,
  author    = {van Dijk, Erwin L. and Auger, H{\'e}l{\`e}ne and
               Jaszczyszyn, Yan and Thermes, Claude},
  title     = {Ten years of next-generation sequencing technology},
  journal   = {Trends in Genetics},
  year      = 2014,
  month     = sep,
  day       = 01,
  publisher = {Elsevier},
  volume    = 30,
  number    = 9,
  pages     = {418--426},
  issn      = {0168-9525},
  doi       = {10.1016/j.tig.2014.07.001},
  url       = {https://doi.org/10.1016/j.tig.2014.07.001},
}
|
||
|
||
% Sanger, Nicklen and Coulson: chain-termination DNA sequencing (PNAS 74(12)).
% The phage name uses math-mode phi so the abstract stays ASCII-only.
@article{Sanger5463,
  author    = {Sanger, F. and Nicklen, S. and Coulson, A. R.},
  title     = {{DNA} sequencing with chain-terminating inhibitors},
  volume    = 74,
  number    = 12,
  pages     = {5463--5467},
  year      = 1977,
  doi       = {10.1073/pnas.74.12.5463},
  publisher = {National Academy of Sciences},
  abstract  = {A new method for determining nucleotide sequences in DNA is
               described. It is similar to the ``plus and minus'' method
               [Sanger, F. \& Coulson, A. R. (1975) J. Mol. Biol. 94,
               441--448] but makes use of the 2',3'-dideoxy and
               arabinonucleoside analogues of the normal deoxynucleoside
               triphosphates, which act as specific chain-terminating
               inhibitors of DNA polymerase. The technique has been applied
               to the DNA of bacteriophage $\phi$X174 and is more rapid and
               more accurate than either the plus or the minus method.},
  issn      = {0027-8424},
  url       = {https://www.pnas.org/content/74/12/5463},
  eprint    = {https://www.pnas.org/content/74/12/5463.full.pdf},
  journal   = {Proceedings of the National Academy of Sciences},
}
|
||
|
||
|
||
|
||
% International Human Genome Sequencing Consortium: finished euchromatic
% human genome sequence (Nature 431(7011)).
% The corporate author is double-braced so it is treated as one indivisible
% name instead of being split into first/last parts.
@article{InternationalHumanGenomeSequencingConsortium2004,
  author   = {{International Human Genome Sequencing Consortium}},
  title    = {Finishing the euchromatic sequence of the human genome},
  journal  = {Nature},
  year     = 2004,
  month    = oct,
  day      = 01,
  volume   = 431,
  number   = 7011,
  pages    = {931--945},
  abstract = {The sequence of the human genome encodes the genetic
              instructions for human physiology, as well as rich information
              about human evolution. In 2001, the International Human Genome
              Sequencing Consortium reported a draft sequence of the
              euchromatic portion of the human genome. Since then, the
              international collaboration has worked to convert this draft
              into a genome sequence with high accuracy and nearly complete
              coverage. Here, we report the result of this finishing
              process. The current genome sequence (Build 35) contains 2.85
              billion nucleotides interrupted by only 341 gaps. It covers
              $\sim$99\% of the euchromatic genome and is accurate to an error
              rate of $\sim$1 event per 100,000 bases. Many of the remaining
              euchromatic gaps are associated with segmental duplications
              and will require focused work with new methods. The
              near-complete sequence, the first for a vertebrate, greatly
              improves the precision of biological analyses of the human
              genome including studies of gene number, birth and death.
              Notably, the human genome seems to encode only 20,000--25,000
              protein-coding genes. The genome sequence reported here should
              serve as a firm foundation for biomedical research in the
              decades ahead.},
  issn     = {1476-4687},
  doi      = {10.1038/nature03001},
  url      = {https://doi.org/10.1038/nature03001},
}
|
||
|
||
|
||
|
||
% Schloss: NHGRI sequencing-technology programme and the cost of genomes
% (Nature Biotechnology 26(10)).
@article{Schloss2008,
  author   = {Schloss, Jeffery A.},
  title    = {How to get genomes at one ten-thousandth the cost},
  journal  = {Nature Biotechnology},
  year     = 2008,
  month    = oct,
  day      = 01,
  volume   = 26,
  number   = 10,
  pages    = {1113--1115},
  abstract = {The NHGRI's Advanced DNA Sequencing Technology program is
              spearheading the development of platforms that will bring
              routine whole-genome sequencing closer to reality.},
  issn     = {1546-1696},
  doi      = {10.1038/nbt1008-1113},
  url      = {https://doi.org/10.1038/nbt1008-1113},
}
|
||
|
||
% Shugay et al.: error correction for immune-repertoire sequencing
% (Nature Methods 11(6)).
@article{Shugay2014,
  author   = {Shugay, Mikhail and Britanova, Olga V. and Merzlyak,
              Ekaterina M. and Turchaninova, Maria A. and Mamedov, Ilgar Z.
              and Tuganbaev, Timur R. and Bolotin, Dmitriy A. and
              Staroverov, Dmitry B. and Putintseva, Ekaterina V. and
              Plevova, Karla and Linnemann, Carsten and Shagin, Dmitriy and
              Pospisilova, Sarka and Lukyanov, Sergey and Schumacher, Ton N.
              and Chudakov, Dmitriy M.},
  title    = {Towards error-free profiling of immune repertoires},
  journal  = {Nature Methods},
  year     = 2014,
  month    = jun,
  day      = 01,
  volume   = 11,
  number   = 6,
  pages    = {653--655},
  abstract = {A two-step error correction process for high
              throughput--sequenced T- and B-cell receptors allows the
              elimination of most errors while not diminishing the natural
              complexity of the repertoires.},
  issn     = {1548-7105},
  doi      = {10.1038/nmeth.2960},
  url      = {https://doi.org/10.1038/nmeth.2960},
}
|
||
|
||
% Ma et al.: error profiles in deep next-generation sequencing data
% (Genome Biology 20(1), article 50).
@article{Ma2019,
  author   = {Ma, Xiaotu and Shao, Ying and Tian, Liqing and Flasch,
              Diane A. and Mulder, Heather L. and Edmonson, Michael N. and
              Liu, Yu and Chen, Xiang and Newman, Scott and Nakitandwe, Joy
              and Li, Yongjin and Li, Benshang and Shen, Shuhong and Wang,
              Zhaoming and Shurtleff, Sheila and Robison, Leslie L. and
              Levy, Shawn and Easton, John and Zhang, Jinghui},
  title    = {Analysis of error profiles in deep next-generation
              sequencing data},
  journal  = {Genome Biology},
  year     = 2019,
  month    = mar,
  day      = 14,
  volume   = 20,
  number   = 1,
  pages    = {50},
  abstract = {Sequencing errors are key confounding factors for detecting
              low-frequency genetic variants that are important for cancer
              molecular diagnosis, treatment, and surveillance using deep
              next-generation sequencing (NGS). However, there is a lack of
              comprehensive understanding of errors introduced at various
              steps of a conventional NGS workflow, such as sample handling,
              library preparation, PCR enrichment, and sequencing. In this
              study, we use current NGS technology to systematically
              investigate these questions.},
  issn     = {1474-760X},
  doi      = {10.1186/s13059-019-1659-6},
}
|
||
|
||
% Master's thesis (Universidad de Granada, 2019) on T-cell receptor repertoire
% analysis from high-throughput sequencing data.
% The author is given in "Last, First" form so the two-word surname
% "Benitez Cantos" is not split.
% NOTE(review): the citation key contains a non-ASCII character; it is left
% unchanged so existing citations still resolve.
@mastersthesis{BenítezCantos-Master,
  author = {Ben{\'i}tez Cantos, Mar{\'i}a Soledad},
  title  = {An{\'a}lisis de repertorios de receptores de c{\'e}lulas {T} a
            partir de datos de secuenciaci{\'o}n masiva},
  school = {Universidad de Granada},
  year   = 2019,
  month  = jul,
}
|
||
|
||
% Abbas, Lichtman and Pillai: Cellular and Molecular Immunology, cited at
% page 204.
% Classic BibTeX styles require title for inbook entries and do not know a
% "place" field, so the book name is given in title and the city in address.
% NOTE(review): booktitle is kept for toolchains that read it; drop one of
% the two if the chosen style prints both.
@inbook{abbas_lichtman_pillai_2017,
  author    = {Abbas, Abul K. and Lichtman, Andrew H. and Pillai, Shiv},
  title     = {Cellular and Molecular Immunology},
  booktitle = {Cellular and Molecular Immunology},
  edition   = {Ninth},
  publisher = {Elsevier},
  address   = {Philadelphia, PA},
  year      = 2017,
  pages     = {204},
}
|
||
|
||
|
||
|
||
% Crick: the central dogma of molecular biology (Nature 227(5258)).
@article{CRICK1970,
  author   = {Crick, Francis},
  title    = {Central Dogma of Molecular Biology},
  journal  = {Nature},
  year     = 1970,
  month    = aug,
  day      = 01,
  volume   = 227,
  number   = 5258,
  pages    = {561--563},
  abstract = {The central dogma of molecular biology deals with the
              detailed residue-by-residue transfer of sequential
              information. It states that such information cannot be
              transferred from protein to either protein or nucleic acid.},
  issn     = {1476-4687},
  doi      = {10.1038/227561a0},
  url      = {https://doi.org/10.1038/227561a0},
}
|
||
|
||
% Salk, Schmitt and Loeb: improving next-generation sequencing accuracy for
% rare-variant detection (Nature Reviews Genetics 19(5)).
% The abstract is the publisher's key-points list; the export fused the
% points together, so the missing spaces between sentences are restored.
@article{Salk2018,
  author   = {Salk, Jesse J. and Schmitt, Michael W. and Loeb, Lawrence
              A.},
  title    = {Enhancing the accuracy of next-generation sequencing for
              detecting rare and subclonal mutations},
  journal  = {Nature Reviews Genetics},
  year     = 2018,
  month    = may,
  day      = 01,
  volume   = 19,
  number   = 5,
  pages    = {269--285},
  abstract = {The ability to identify low-frequency genetic variants
              among heterogeneous populations of cells or DNA molecules is
              important in many fields of basic science, clinical medicine
              and other applications, yet current high-throughput DNA
              sequencing technologies have an error rate between 1 per 100
              and 1 per 1,000 base pairs sequenced, which obscures their
              presence below this level. As next-generation sequencing
              technologies evolved over the decade, throughput has improved
              markedly, but raw accuracy has remained generally unchanged.
              Researchers with a need for high accuracy developed data
              filtering methods and incremental biochemical improvements
              that modestly improve low-frequency variant detection, but
              background errors remain limiting in many fields. The most
              profoundly impactful means for reducing errors, first
              developed approximately 7 years ago, has been the concept of
              single-molecule consensus sequencing. This entails redundant
              sequencing of multiple copies of a given specific DNA molecule
              and discounting of variants that are not present in all or
              most of the copies as likely errors. Consensus sequencing can
              be achieved by labelling each molecule with a unique molecular
              barcode before generating copies, which allows subsequent
              comparison of these copies or schemes whereby copies are
              physically joined and sequenced together. Because of
              trade-offs in cost, time and accuracy, no single method is
              optimal for every application, and each method should be
              considered on a case-by-case basis. Major applications for
              high-accuracy DNA sequencing include non-invasive cancer
              diagnostics, cancer screening, early detection of cancer
              relapse or impending drug resistance, infectious disease
              applications, prenatal diagnostics, forensics and mutagenesis
              assessment. Future advances in ultra-high-accuracy sequencing
              are likely to be driven by an emerging generation of
              single-molecule sequencers, particularly those that allow
              independent sequence comparison of both strands of native DNA
              duplexes.},
  issn     = {1471-0064},
  doi      = {10.1038/nrg.2017.117},
  url      = {https://doi.org/10.1038/nrg.2017.117},
}
|
||
|
||
% Lehninger, Nelson and Cox: biochemistry textbook, cited at page 276.
% Authors are separated with "and"; a comma-separated list is parsed as a
% single, malformed name. edition holds only the ordinal because styles
% append the word "edition" themselves.
@book{book:lehninger,
  author    = {Lehninger, Albert and Nelson, David L. and Cox, Michael M.},
  title     = {{Lehninger} Principles of Biochemistry},
  publisher = {W. H. Freeman},
  isbn      = {9781429224161,1429224169},
  year      = 2008,
  edition   = {Fifth},
  pages     = {276},
}
|
||
|
||
% Crick: "On protein synthesis", Symposia of the Society for Experimental
% Biology, volume 12.
% The export had number = 138-63 and pages = 8, which is a garbled form of the
% page range 138--163. Standard styles also warn when volume and number are
% both set on this entry type, so number is dropped.
% NOTE(review): confirm the page range against the printed volume.
@inproceedings{crick1958protein,
  author    = {Crick, Francis H. C.},
  title     = {On protein synthesis},
  booktitle = {Symposia of the Society for Experimental Biology},
  volume    = 12,
  pages     = {138--163},
  year      = 1958,
}
|
||
|
||
% Lee: consensus sequences from partial order alignment graphs
% (Bioinformatics 19(8)).
% The structured abstract's section headings were fused onto the preceding
% sentences by the export; spaces are restored and dashes made ASCII.
@article{10.1093/bioinformatics/btg109,
  author   = {Lee, Christopher},
  title    = {Generating consensus sequences from partial order multiple
              sequence alignment graphs},
  journal  = {Bioinformatics},
  volume   = 19,
  number   = 8,
  pages    = {999--1008},
  year     = 2003,
  month    = may,
  abstract = {Motivation: Consensus sequence generation is important in
              many kinds of sequence analysis ranging from sequence assembly
              to profile-based iterative search methods. However, how can a
              consensus be constructed when its inherent assumption---that the
              aligned sequences form a single linear consensus---is not
              true? Results: Partial Order Alignment (POA) enables
              construction and analysis of multiple sequence alignments as
              directed acyclic graphs containing complex branching
              structure. Here we present a dynamic programming algorithm
              (heaviest\_bundle) for generating multiple consensus sequences
              from such complex alignments. The number and relationships of
              these consensus sequences reveals the degree of structural
              complexity of the source alignment. This is a powerful and
              general approach for analyzing and visualizing complex
              alignment structures, and can be applied to any alignment. We
              illustrate its value for analyzing expressed sequence
              alignments to detect alternative splicing, reconstruct full
              length mRNA isoform sequences from EST fragments, and separate
              paralog mixtures that can cause incorrect SNP
              predictions. Availability: The heaviest\_bundle source code is
              available at http://www.bioinformatics.ucla.edu/poa Contact:
              leec@mbi.ucla.edu *To whom correspondence should be
              addressed.},
  issn     = {1367-4803},
  doi      = {10.1093/bioinformatics/btg109},
  url      = {https://doi.org/10.1093/bioinformatics/btg109},
  eprint   = {https://academic.oup.com/bioinformatics/article-pdf/19/8/999/642375/btg109.pdf},
}
|
||
|
||
% Nagar and Hahsler: quasi-alignment for finding conserved regions
% (BMC Bioinformatics 14, number 11, article S2).
@article{Nagar2013,
  author   = {Nagar, Anurag and Hahsler, Michael},
  title    = {Fast discovery and visualization of conserved regions in
              {DNA} sequences using quasi-alignment},
  journal  = {BMC Bioinformatics},
  year     = 2013,
  month    = sep,
  day      = 13,
  volume   = 14,
  number   = 11,
  pages    = {S2},
  abstract = {Next Generation Sequencing techniques are producing
              enormous amounts of biological sequence data and analysis
              becomes a major computational problem. Currently, most
              analysis, especially the identification of conserved regions,
              relies heavily on Multiple Sequence Alignment and its various
              heuristics such as progressive alignment, whose run time grows
              with the square of the number and the length of the aligned
              sequences and requires significant computational resources. In
              this work, we present a method to efficiently discover regions
              of high similarity across multiple sequences without
              performing expensive sequence alignment. The method is based
              on approximating edit distance between segments of sequences
              using p-mer frequency counts. Then, efficient high-throughput
              data stream clustering is used to group highly similar
              segments into so called quasi-alignments. Quasi-alignments
              have numerous applications such as identifying species and
              their taxonomic class from sequences, comparing sequences for
              similarities, and, as in this paper, discovering conserved
              regions across related sequences.},
  issn     = {1471-2105},
  doi      = {10.1186/1471-2105-14-S11-S2},
  url      = {https://doi.org/10.1186/1471-2105-14-S11-S2},
}
|
||
|
||
% Russell and Norvig: Artificial Intelligence: A Modern Approach.
% Authors are separated with "and"; a comma-separated list is parsed as a
% single, malformed name.
@book{book:771224,
  author    = {Russell, Stuart and Norvig, Peter},
  title     = {Artificial Intelligence: A Modern Approach},
  publisher = {Prentice Hall},
  isbn      = {0136042597, 9780136042594},
  year      = 2010,
  series    = {Prentice Hall Series in Artificial Intelligence},
  edition   = {Third},
}
|