commit c1fe9158cec4a692c6cd4314c07601ea831147ae Author: coolneng Date: Thu Oct 13 13:35:20 2022 +0200 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d8dfacf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +bibliography diff --git a/code/repository-template/.gitignore b/code/repository-template/.gitignore new file mode 100644 index 0000000..b6e4761 --- /dev/null +++ b/code/repository-template/.gitignore @@ -0,0 +1,129 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/code/repository-template/LICENSE b/code/repository-template/LICENSE new file mode 100644 index 0000000..09d78a2 --- /dev/null +++ b/code/repository-template/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Gaetan De Waele + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/code/repository-template/README.md b/code/repository-template/README.md new file mode 100644 index 0000000..febd92e --- /dev/null +++ b/code/repository-template/README.md @@ -0,0 +1,19 @@ +# Template + + +Contains +- `setup.py` +- `README.md` +- `LICENSE` +- `.gitignore` for python, optionally add `./data/` to gitignore if you want +- a folder with the actual python package structure + + +## Install steps: + +``` +conda create --name env +conda activate env +conda install pip +pip install -e . +``` diff --git a/code/repository-template/project/__init__.py b/code/repository-template/project/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/code/repository-template/project/version.py b/code/repository-template/project/version.py new file mode 100644 index 0000000..3dc1f76 --- /dev/null +++ b/code/repository-template/project/version.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/code/repository-template/setup.py b/code/repository-template/setup.py new file mode 100644 index 0000000..e4137f1 --- /dev/null +++ b/code/repository-template/setup.py @@ -0,0 +1,20 @@ +import sys +from setuptools import setup, find_packages + +sys.path[0:0] = ["project"] +from version import __version__ + +setup( + name="project", + python_requires=">3.9.0", + packages=find_packages(), + version=__version__, + license="MIT", + description="project template", + author="Gaetan De Waele", + author_email="gaetan.dewaele@ugent.be", + url="https://github.com/gdewael/project", + install_requires=[ + "numpy", + ], +) diff --git a/docs/Bibliography.org b/docs/Bibliography.org new file mode 100644 index 0000000..5340741 --- /dev/null +++ b/docs/Bibliography.org @@ -0,0 +1,258 @@ +#+PROPERTY: header-args :exports none :tangle "./bibliography.bib" +#+LATEX_CLASS_OPTIONS: [12pt] +#+LATEX_HEADER: \usepackage[natbib=true]{biblatex} \DeclareFieldFormat{apacase}{#1} \addbibresource{./bibliography.bib} +#+LATEX_HEADER: \usepackage{parskip} +#+OPTIONS: <:nil c:nil todo:nil 
H:5 +#+auto_tangle: t +* Deep Learning +** Transformers +*** Attention is All You Need +#+begin_src bibtex +@article{https://doi.org/10.48550/arxiv.1706.03762, + doi = {10.48550/ARXIV.1706.03762}, + url = {https://arxiv.org/abs/1706.03762}, + author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and + Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and + Kaiser, Lukasz and Polosukhin, Illia}, + keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), + FOS: Computer and information sciences, FOS: Computer and + information sciences}, + title = {Attention Is All You Need}, + publisher = {arXiv}, + year = 2017, + copyright = {arXiv.org perpetual, non-exclusive license} +} +#+end_src + +#+LaTeX: \printbibliography[heading=none] +*** Axial Attention in Multidimensional Transformers +#+begin_src bibtex +@article{https://doi.org/10.48550/arxiv.1912.12180, + doi = {10.48550/ARXIV.1912.12180}, + url = {https://arxiv.org/abs/1912.12180}, + author = {Ho, Jonathan and Kalchbrenner, Nal and Weissenborn, Dirk + and Salimans, Tim}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: + Computer and information sciences, FOS: Computer and + information sciences}, + title = {Axial Attention in Multidimensional Transformers}, + publisher = {arXiv}, + year = 2019, + copyright = {arXiv.org perpetual, non-exclusive license} +} +#+end_src +*** Longformer: The Long-Document Transformer +#+begin_src bibtex +@article{https://doi.org/10.48550/arxiv.2004.05150, + doi = {10.48550/ARXIV.2004.05150}, + url = {https://arxiv.org/abs/2004.05150}, + author = {Beltagy, Iz and Peters, Matthew E. 
and Cohan, Arman}, + keywords = {Computation and Language (cs.CL), FOS: Computer and + information sciences, FOS: Computer and information sciences}, + title = {Longformer: The Long-Document Transformer}, + publisher = {arXiv}, + year = 2020, + copyright = {arXiv.org perpetual, non-exclusive license} +} +#+end_src +*** Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context +#+begin_src bibtex +@article{https://doi.org/10.48550/arxiv.1901.02860, + doi = {10.48550/ARXIV.1901.02860}, + url = {https://arxiv.org/abs/1901.02860}, + author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and + Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan}, + keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), + Machine Learning (stat.ML), FOS: Computer and information + sciences, FOS: Computer and information sciences}, + title = {Transformer-XL: Attentive Language Models Beyond a + Fixed-Length Context}, + publisher = {arXiv}, + year = 2019, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 + International} +} +#+end_src +*** BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding +#+begin_src bibtex +@inproceedings{devlin-etal-2019-bert, + title = "{BERT}: Pre-training of Deep Bidirectional Transformers for + Language Understanding", + author = "Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and + Toutanova, Kristina", + booktitle = "Proceedings of the 2019 Conference of the North {A}merican + Chapter of the Association for Computational Linguistics: + Human Language Technologies, Volume 1 (Long and Short Papers)", + month = jun, + year = 2019, + address = "Minneapolis, Minnesota", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/N19-1423", + doi = "10.18653/v1/N19-1423", + pages = "4171--4186", + abstract = "We introduce a new language representation model called + BERT, which stands for Bidirectional Encoder Representations + from 
Transformers. Unlike recent language representation + models (Peters et al., 2018a; Radford et al., 2018), BERT is + designed to pre-train deep bidirectional representations from + unlabeled text by jointly conditioning on both left and right + context in all layers. As a result, the pre-trained BERT model + can be fine-tuned with just one additional output layer to + create state-of-the-art models for a wide range of tasks, such + as question answering and language inference, without + substantial task-specific architecture modifications. BERT is + conceptually simple and empirically powerful. It obtains new + state-of-the-art results on eleven natural language processing + tasks, including pushing the GLUE score to 80.5 (7.7 point + absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%} + absolute improvement), SQuAD v1.1 question answering Test F1 + to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test + F1 to 83.1 (5.1 point absolute improvement).", +} +#+end_src + +A masked language model (MLM) randomly masks some of the tokens from the input, and the objective is to predict the original input based only on its context. +*** Fast Transformers with Clustered Attention +#+begin_src bibtex +@article{https://doi.org/10.48550/arxiv.2007.04825, + doi = {10.48550/ARXIV.2007.04825}, + url = {https://arxiv.org/abs/2007.04825}, + author = {Vyas, Apoorv and Katharopoulos, Angelos and Fleuret, + François}, + keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: + Computer and information sciences, FOS: Computer and + information sciences}, + title = {Fast Transformers with Clustered Attention}, + publisher = {arXiv}, + year = 2020, + copyright = {arXiv.org perpetual, non-exclusive license} +} +#+end_src +*** The elephant in the interpretability room: Why use attention as explanation when we have saliency methods? 
+#+begin_src bibtex +@inproceedings{bastings-filippova-2020-elephant, + title = "The elephant in the interpretability room: Why use + attention as explanation when we have saliency methods?", + author = "Bastings, Jasmijn and Filippova, Katja", + booktitle = "Proceedings of the Third BlackboxNLP Workshop on Analyzing + and Interpreting Neural Networks for NLP", + month = nov, + year = 2020, + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2020.blackboxnlp-1.14", + doi = "10.18653/v1/2020.blackboxnlp-1.14", + pages = "149--155", + abstract = "There is a recent surge of interest in using attention as + explanation of model predictions, with mixed evidence on + whether attention can be used as such. While attention + conveniently gives us one weight per input token and is easily + extracted, it is often unclear toward what goal it is used as + explanation. We find that often that goal, whether explicitly + stated or not, is to find out what input tokens are the most + relevant to a prediction, and that the implied user for the + explanation is a model developer. For this goal and user, we + argue that input saliency methods are better suited, and that + there are no compelling reasons to use attention, despite the + coincidence that it provides a weight for each input. With + this position paper, we hope to shift some of the recent focus + on attention to saliency methods, and for authors to clearly + state the goal and user for their explanations.", +} +#+end_src +* Deep Learning + Biology +** MSA Transformer +#+begin_src bibtex +@article {Rao2021.02.12.430858, + author = {Rao, Roshan and Liu, Jason and Verkuil, Robert and Meier, + Joshua and Canny, John F. 
and Abbeel, Pieter and Sercu, Tom + and Rives, Alexander}, + title = {MSA Transformer}, + elocation-id = {2021.02.12.430858}, + year = 2021, + doi = {10.1101/2021.02.12.430858}, + publisher = {Cold Spring Harbor Laboratory}, + abstract = {Unsupervised protein language models trained across + millions of diverse sequences learn structure and function of + proteins. Protein language models studied to date have been + trained to perform inference from individual sequences. The + longstanding approach in computational biology has been to + make inferences from a family of evo lutionarily related + sequences by fitting a model to each family independently. In + this work we combine the two paradigms. We introduce a protein + language model which takes as input a set of sequences in the + form of a multiple sequence alignment. The model interleaves + row and column attention across the input sequences and is + trained with a variant of the masked language modeling + objective across many protein families. The performance of the + model surpasses current state-of-the-art unsupervised + structure learning methods by a wide margin, with far greater + parameter efficiency than prior state-of-the-art protein + language models.Competing Interest StatementThe authors have + declared no competing interest.}, + URL = + {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858}, + eprint = + {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858.full.pdf}, + journal = {bioRxiv} +} +#+end_src +** Highly accurate protein structure prediction with AlphaFold +#+begin_src bibtex +@article{Jumper2021, + author = {Jumper, John and Evans, Richard and Pritzel, Alexander and + Green, Tim and Figurnov, Michael and Ronneberger, Olaf and + Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'i}dek, + Augustin and Potapenko, Anna and Bridgland, Alex and Meyer, + Clemens and Kohl, Simon A. A. and Ballard, Andrew J. 
and + Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov, + Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor + and Petersen, Stig and Reiman, David and Clancy, Ellen and + Zielinski, Michal and Steinegger, Martin and Pacholska, + Michalina and Berghammer, Tamas and Bodenstein, Sebastian and + Silver, David and Vinyals, Oriol and Senior, Andrew W. and + Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis}, + title = {Highly accurate protein structure prediction with + AlphaFold}, + journal = {Nature}, + year = 2021, + month = {Aug}, + day = 01, + volume = 596, + number = 7873, + pages = {583-589}, + abstract = {Proteins are essential to life, and understanding their + structure can facilitate a mechanistic understanding of their + function. Through an enormous experimental effort1--4, the + structures of around 100,000 unique proteins have been + determined5, but this represents a small fraction of the + billions of known protein sequences6,7. Structural coverage is + bottlenecked by the months to years of painstaking effort + required to determine a single protein structure. Accurate + computational approaches are needed to address this gap and to + enable large-scale structural bioinformatics. Predicting the + three-dimensional structure that a protein will adopt based + solely on its amino acid sequence---the structure prediction + component of the `protein folding problem'8---has been an + important open research problem for more than 50 years9. + Despite recent progress10--14, existing methods fall far short + of atomic accuracy, especially when no homologous structure is + available. Here we provide the first computational method that + can regularly predict protein structures with atomic accuracy + even in cases in which no similar structure is known. 
We + validated an entirely redesigned version of our neural + network-based model, AlphaFold, in the challenging 14th + Critical Assessment of protein Structure Prediction + (CASP14)15, demonstrating accuracy competitive with + experimental structures in a majority of cases and greatly + outperforming other methods. Underpinning the latest version + of AlphaFold is a novel machine learning approach that + incorporates physical and biological knowledge about protein + structure, leveraging multi-sequence alignments, into the + design of the deep learning algorithm.}, + issn = {1476-4687}, + doi = {10.1038/s41586-021-03819-2}, + url = {https://doi.org/10.1038/s41586-021-03819-2} +} +#+end_src +* Biology diff --git a/docs/Notes.org b/docs/Notes.org new file mode 100644 index 0000000..71acbbb --- /dev/null +++ b/docs/Notes.org @@ -0,0 +1,258 @@ +#+TITLE: Thesis brain dump +#+AUTHOR: Amin Kasrou Aouam +#+DATE: 10-2021 +#+PANDOC_OPTIONS: template:~/.pandoc/templates/eisvogel.latex +#+PANDOC_OPTIONS: listings:t +#+PANDOC_OPTIONS: toc:t +#+PANDOC_OPTIONS: bibliography:bibliography.bib +#+PANDOC_OPTIONS: citeproc:t +#+PANDOC_METADATA: titlepage:t +#+PANDOC_METADATA: listings-no-page-break:t +#+PANDOC_METADATA: toc-own-page:t +#+PANDOC_METADATA: table-use-row-colors:t +#+PANDOC_METADATA: colorlinks:t +#+PANDOC_METADATA: logo:/home/coolneng/Photos/Logos/UGent.png +#+PANDOC_METADATA: link-citations:t +#+CITE_EXPORT: biblatex +* Deep Learning +** Activation functions +*** Sigmoid +:PROPERTIES: +:ID: 66e07b8e-6267-4743-89ac-ac176753d4ae +:END: +The sigmoid function has a range of [0,1] and can be used both as a *squashing function*, in order to any real number to a value between 0 and 1, or as an *activation* function that guarantees that the output of that unit is between 0 and 1. Furthermore, it is a *non-linear* function and thus ensures that a neural network can learn a non-linearly separable problem. 
+ +#+CAPTION: Sigmoid function +#+LABEL: sigmoid-function +#+ATTR_HTML: :width 50% +[[attachment:_20221011_094602screenshot.png]] + +A general problem with this function, as an activation function, is that it saturates. This means that large values correspond to 1 and low values to 0, and they are only really sensitive to values around their mid-point. When it is saturated, the learning algorithm has trouble adjusting the weights. + +*** ReLu +:PROPERTIES: +:ID: 4e0e3218-fb3c-463b-9b90-6aebcce7237b +:END: +The rectified linear unit function or ReLu is a non-linear function that acts like a linear one. Its range is [0, $\infty$) as it returns 0 for any negative value and the original value if it is positive. In other words, it is linear for positive values and non-linear for negative values. + +#+CAPTION: Rectified linear unit function +#+LABEL: relu-function +#+ATTR_HTML: :width 50% +[[attachment:_20221011_095153screenshot.png]] + +It is the /de facto/ activation function used for Deep Learning networks, due to its many advantages. It is *computationally simple*, allows *sparse representations* (it outputs zero values), has a *linear behavior* (easier to optimize and avoids vanishing gradients). + +* Transformers +Deep learning models that are designed to process a connected set of units (e.g. tokens in a sequence or pixels in an image) only using the mechanism of *self-attention*. +They are simple models that still haven't reached their performance limit, as it is currently only being limited by computational resources. They are also *extremely generic*, they have been mostly used for NLP but they can be exploited for more tasks, which is very useful for *multi-modal learning*. +** Inputs +*** Word representation +A word embedding is a featurized representation of a set of words. These high dimensional vectors give a good representation to learn semantic properties of words. 
+ +A common technique to visualize them is a t-SNE plot, as it plots these high dimensional embeddings into a 2D space. The distance between the points indicates the similarity of the words, which allows us to perform some kind of clustering. + +The steps to use word embeddings are the following: + +1. Learn them from very large corpuses of unlabeled text/use pretrained word embeddings +2. Transfer embedding to a new task with a smaller training set +3. Finetune the embeddings with new data (optional, useful when the training set is big) + +This is a *transfer learning* process. +**** TODO Analogy +Similarity measures (e.g. cosine similarity) +*** Positional encoding +Transformers see sentences as sets of words, which means that the order of the words is not relevant. This can be circumvented by using positional encoding, which forces them to evaluate a sentence as a *sequence*. +The most common way of performing it is by using *sine and cosine* functions of different frequencies: + +$PE_{(pos,2i)} = sin(pos/10000^{2i/d_{model}})$ +$PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})$ + +Each dimension corresponds to a sinusoid, which forms a geometric progression from $2\pi$ to $10000 \cdot 2\pi$ [cite:@https://doi.org/10.48550/arxiv.1706.03762]. +** Self attention +:PROPERTIES: +:ID: 9cb809a3-f9f7-4578-a903-1bb9a7ad91ff +:END: +It is a sequence-to-sequence operation: + +Input vector => Model => output vector + +To produce the output vector, the self attention operation performs a weighted average over the input vectors: + +$y_{i} = \Sigma_{j} w_{ij}x_{j}$ + +The weight $w_{ij}$ is not a parameter, but rather it's derived from a function over $x_i$ and $x_{j}$. The simplest function is the dot product: + +$w'_{ij} = x_{i}^T x_{j}$ + +The softmax function is applied to the dot product in order to map the values to [0,1]. 
+ +#+CAPTION: Operation of self attention +#+LABEL: self-attention +#+ATTR_HTML: :width 50% +[[attachment:_20220929_154442screenshot.png]] + +The self attention operation is the only one that *propagates information between vectors*. + +It is called self attention because there are mechanisms that decide which elements of the input are relevant for a particular output. The general mechanism is as follows: the inputs are *values*, a mechanism assigns a *key* to each value and to each output the mechanism assigns a *query*. This is similar to how a key-value store works, in our case for each query we will obtain a sum of all the keys weighted by the extent of the match. +*** Basic mechanism +By using feature selection and performing the dot product, we can apply self attention to NLP. By creating an *embedding vector*, which is a numeric representation of a sequence of words we apply the previously formulated $y_{i}$ function in order to obtain an output vector. +The output vector will represent how *related* are two vectors in the input set, in this case related is determined by which learning task we are performing. Self attention sees the *input as set*, the order of the elements is not taken into account. +*** Additional mechanisms +**** Queries, keys and values +Each input vector $x_i$ is used in 3 different ways: + +- Query: Compared to every other vector to establish the weights for its own output $y_i$ +- Key: Compared to every other vector to establish the weights for its own output $y_j$ +- Value: Used as part of the weighted sum to compute each output vector + +In this case we use a new vector for each role, which means that we add 3 weight matrices $W_q$, $W_k$, $W_v$ and compute 3 linear transformations. 
+ +#+CAPTION: Self attention with query, key and value +#+LABEL: query-key-value +#+ATTR_HTML: :width 50% +[[attachment:_20220929_154554screenshot.png]] + +**** Scaling the dot product +The softmax function is sensitive to large values, which produce low gradients. We solve this by scaling it down: + +$w'_{ij} = \frac{q_{i}^Tk_{j}}{\sqrt{k}}$ + +**** Multi-head attention +A word can have different meanings depending on its neighbours, in order to work around this problem we combine multiple self attention mechanisms. We assign each attention head a different matrix $W_q^r$, $W_k^r$, $W_v^r$. In order to perform a multi-head self attention efficiently, we divide the input vector by the number of heads ($|x_i| = 256$, $R = 8$ => 8 chunks of 32 dimensions) and generate queries, keys and values for each chunk. +** Building transformers +:PROPERTIES: +:ID: 8df6ff92-d916-4d95-b33e-5490aa8a18e0 +:END: +The standard architecture revolves around 3 types of layers: + +- Self attention +- Layer normalization: normalizes the activation of the previous layer *for each sample* in a batch (instead of the whole batch) +- Feed forward layer (MLP) + +Residual connections, which allow the neural network to skip them, are added between each layer normalization. + +#+CAPTION: Transformer architecture +#+LABEL: transformer-architecture +#+ATTR_HTML: :width 50% +[[attachment:_20220929_164050screenshot.png]] + +The input of the transformer is the embedding vector (word embedding), but in order to take into account the position of the words we need an additional data structure. There are 2 approaches: + +- Position embeddings: create an embedding vector containing the position. It's easy to implement but we need to use sequences of every length during the training. +- Position encodings: use a function $f: \mathbb{N} \rightarrow \mathbb{R}^k$ to map the positions to vectors of real numbers that the network can interpret. 
For a well chosen function the network also works on longer sequences, but it is a complicated hyperparameter. + +#+CAPTION: Higher level view of the architecture +#+LABEL: input-transformers +#+ATTR_HTML: :width 50% +[[attachment:_20221003_142245screenshot.png]] +** Example - Text generation transformer +:PROPERTIES: +:ID: bee6719a-7f5c-442b-b3d7-f862015240ab +:END: +Transformers can be used as autoregressive models (i.e. they use data from the past to predict the future), one example is a model that predicts the next character in a sequence. + +In order to use self-attention for this use case, we need to mask the values after the chosen position i. This is implemented by applying a mask to the matrix of dot products, before the softmax function. The mask sets all the elements above the diagonal to $-\infty$. + +#+CAPTION: Application of the mask to the dot product +#+LABEL: mask +#+ATTR_HTML: :width 50% +[[attachment:_20221003_143444screenshot.png]] +** Design considerations +Transformers were created to overcome the shortcomings of RNNs, as the recurrent connection imposes a dependency of the previous timestep to compute the current one. + +They can model dependencies over the whole range of the input sequence (unlike CNNs) and they can be computed in a very efficient way. Furthermore, they were designed to allow for deep models, as almost all the model (except softmax and ReLU) are linear transformations which *preserve the gradient*. +** Modeling of graph data +Transformers are able to interpret graph data, as they see a sentence as a *fully connected* graph of words. In the case of NLP, it is possible to use full attention as the number of nodes (and subsequently of edges) is small enough which makes it computationally tractable. +However, this approach is not possible to interpret most types of graph data such as biological networks. In that case, we need to apply sparse attention (e.g. evaluate the local neighbours). 
+ +* Literature review +** CpG Transformer for imputation of single-cell methylomes +*** DNA methylation methodologies +DNA methylation is a mechanism that is associated with multiple cellular processes, such as *gene expression*. +In the last decade, multiple new single-cell protocols have been developed and although they provide an unprecedented look into cellular processes, they come with some caveats. The smaller amount of reads results in *noisier* data. +*** CpG site imputation +Prediction of methylation states is a well known problem that has been tackled by leveraging dependencies between sites, using multiple techniques: + +- Dimensionality reduction +- Imputation of single CpG sites +- Use of information from multiple tissues +- Use of intra and extracellular correlations +- Differences in local CpG profiles between cells: methylation states at a target site and its neighbouring ones +*** Approach +A transformer model is used to attempt to fill the gaps in a known sequence of methylation states. This is achieved using a mask and then asking the model to predict the masked value. This approach is common in NLP, but has not been explored to *impute gaps in matrices*. +**** Inputs +- CpG matrix +- CpG positions in the genome +- DNA surrounding these sites +- Cell index embedding -> cell identity + +The CpG matrix is *corrupted* by randomly masking some tokens and 20% of the tokens are also assigned a random binary state. + +Five different datasets were used: + +| Dataset | Organism | Medium | Platform | +| 20 embryonic stem cells | Mouse | Serum | scBS-seq | +| 12 embryonic stem cells | Mouse | 2i | scBS-seq | +| 25 hepatocellular carcinoma cells | Human | | scRRBS-seq | +| 30 monoclonal B lymphocytes | Human | | scRRBS-seq | +| 122 hematopoietic stem cells | Human | | scBS-seq | + +Methylation states are assigned when $\frac{\#(reads_{positive})}{\#(reads_{total})} \geq 0.5$ and holdout validation is used (fixed splits). 
+**** Mechanism +The model learns a representation for every site and combines them in a graph-like way. It uses *axial* and *sliding* window attention. + +***** Axial attention +:PROPERTIES: +:ID: c0c067e2-2bb2-4a35-8c63-43a34e0a39a0 +:END: +Self-attention is a powerful method but it comes at a high computational cost, as its memory and computation scale quadratically $O(n^2m^2)$, which makes it prohibitively expensive to apply it to long sequences [cite:@https://doi.org/10.48550/arxiv.1912.12180]. + +Axial attention applies attention along *one axis* of the tensor (e.g. height/width of an image) which is faster than applying it on all the elements. It allows for the majority of the context to be embedded, with a high degree of parallelism. This reduces the complexity to $O(mn(n+m))$. + +#+CAPTION: Types of axial attention layers +#+LABEL: axial-attention +#+ATTR_HTML: :width 50% +[[attachment:_20221010_160722screenshot.png]] + +***** Sliding window attention +:PROPERTIES: +:ID: 09caa7a4-2d3c-4b17-8282-22c049952917 +:END: +Sliding window attention employs a fixed-size window of size $w$ around each token, each token then attends to $\frac{w}{2}$ tokens to each side. The complexity of this pattern is $O(n \times w)$, to make this pattern efficient $w$ needs to be smaller than $n$. + +As CpG sites in close proximity are often correlated, we can apply this mechanism in order to limit row-wise attention, which reduces the complexity to $O(mn(n+w))$. 
+ +#+CAPTION: Sliding-window attention +#+LABEL: sliding-window-attention +#+ATTR_HTML: :width 50% :height 20% +#+ATTR_HTML: :width 50% :height 50% +[[attachment:_20221010_165903screenshot.png]] +***** Architecture +:PROPERTIES: +:ID: 9476d3d2-e4d1-44bd-aa98-557c264fffc7 +:END: +The CpG transformer is composed of a stack of *four* identical layers, each layer is composed of *three* different sublayers arranged in the following order: + +- Sliding-window attention +- Layer normalization +- Axial attention (column wise) +- Layer normalization +- MLP (ReLu activation) +- Layer normalization + +#+CAPTION: Single layer of the CpG transformer +#+LABEL: cpg-transformer-layer +#+ATTR_HTML: :width 50% :height 30% +[[attachment:_20221011_093433screenshot.png]] + +All sublayers have a *residual connection* and the outputs of the last layer are reduced to *one hidden dimension* and subjected to a sigmoid operation. + +**** Objective +The objective is to impute and denoise the DNA methylation data, and it is based on the masked language model (MLM) which is a type of denoising autoencoding in which the loss function only acts on the subset of corrupted inputs [cite:@devlin-etal-2019-bert]. +**** Results +It provides a general-purpose way of learning interactions between CpG sites within and between cells. Furthermore, it is also *interpretable* and *enables transfer learning*. It also is evaluated against another DL method [[https://genomebiology.biomedcentral.com/articles/10.1186/s13059-017-1189-z][DeepCpG]] and a traditional ML one, [[https://academic.oup.com/bioinformatics/article/37/13/1814/6103564?login=false#278705269][CaMelia]] and it outperforms both of them. Furthermore, the performance gain seems to be more pronounced in contexts with higher cell-to-cell variability, which demonstrates an *ability to encode cell heterogeneity*. +Unfortunately, it cannot be scaled to large numbers of cells. 
Data subsetting techniques would have to be used in order to apply the model, or alternative attention mechanisms (e.g. clustered attention). +Furthermore, its performance is hindered in areas of high sparsity as it is not able to properly estimate local methylation profiles. The local neighbourdhood is important, as in areas with low coverage but with a populated neighbourdhood the results are more accurate. +**** Limitations +* Glossary +- Temperature: hyperparameter of neural networks used to control the randomness of predictions, by scaling the logits prior to applying softmax: $\frac{logits}{temperature}$. The higher the temperature, the network is more easily excited and thus results in more diversity and mistakes. +- Ablation: removal of components of the input to evaluate their significance +* References diff --git a/docs/Notes.pdf b/docs/Notes.pdf new file mode 100644 index 0000000..dd37882 Binary files /dev/null and b/docs/Notes.pdf differ diff --git a/docs/assets/ieee.csl b/docs/assets/ieee.csl new file mode 100644 index 0000000..44ec764 --- /dev/null +++ b/docs/assets/ieee.csl @@ -0,0 +1,457 @@ + + diff --git a/docs/bibliography.bib b/docs/bibliography.bib new file mode 100644 index 0000000..3f25de6 --- /dev/null +++ b/docs/bibliography.bib @@ -0,0 +1,225 @@ +@article{https://doi.org/10.48550/arxiv.1706.03762, + doi = {10.48550/ARXIV.1706.03762}, + url = {https://arxiv.org/abs/1706.03762}, + author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and + Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. 
and + Kaiser, Lukasz and Polosukhin, Illia}, + keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), + FOS: Computer and information sciences, FOS: Computer and + information sciences}, + title = {Attention Is All You Need}, + publisher = {arXiv}, + year = 2017, + copyright = {arXiv.org perpetual, non-exclusive license} +} + +@article{https://doi.org/10.48550/arxiv.1912.12180, + doi = {10.48550/ARXIV.1912.12180}, + url = {https://arxiv.org/abs/1912.12180}, + author = {Ho, Jonathan and Kalchbrenner, Nal and Weissenborn, Dirk + and Salimans, Tim}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: + Computer and information sciences, FOS: Computer and + information sciences}, + title = {Axial Attention in Multidimensional Transformers}, + publisher = {arXiv}, + year = 2019, + copyright = {arXiv.org perpetual, non-exclusive license} +} + +@article{https://doi.org/10.48550/arxiv.2004.05150, + doi = {10.48550/ARXIV.2004.05150}, + url = {https://arxiv.org/abs/2004.05150}, + author = {Beltagy, Iz and Peters, Matthew E. and Cohan, Arman}, + keywords = {Computation and Language (cs.CL), FOS: Computer and + information sciences, FOS: Computer and information sciences}, + title = {Longformer: The Long-Document Transformer}, + publisher = {arXiv}, + year = 2020, + copyright = {arXiv.org perpetual, non-exclusive license} +} + +@article{https://doi.org/10.48550/arxiv.1901.02860, + doi = {10.48550/ARXIV.1901.02860}, + url = {https://arxiv.org/abs/1901.02860}, + author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and + Carbonell, Jaime and Le, Quoc V. 
and Salakhutdinov, Ruslan}, + keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), + Machine Learning (stat.ML), FOS: Computer and information + sciences, FOS: Computer and information sciences}, + title = {Transformer-XL: Attentive Language Models Beyond a + Fixed-Length Context}, + publisher = {arXiv}, + year = 2019, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 + International} +} + +@inproceedings{devlin-etal-2019-bert, + title = "{BERT}: Pre-training of Deep Bidirectional Transformers for + Language Understanding", + author = "Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and + Toutanova, Kristina", + booktitle = "Proceedings of the 2019 Conference of the North {A}merican + Chapter of the Association for Computational Linguistics: + Human Language Technologies, Volume 1 (Long and Short Papers)", + month = jun, + year = 2019, + address = "Minneapolis, Minnesota", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/N19-1423", + doi = "10.18653/v1/N19-1423", + pages = "4171--4186", + abstract = "We introduce a new language representation model called + BERT, which stands for Bidirectional Encoder Representations + from Transformers. Unlike recent language representation + models (Peters et al., 2018a; Radford et al., 2018), BERT is + designed to pre-train deep bidirectional representations from + unlabeled text by jointly conditioning on both left and right + context in all layers. As a result, the pre-trained BERT model + can be fine-tuned with just one additional output layer to + create state-of-the-art models for a wide range of tasks, such + as question answering and language inference, without + substantial task-specific architecture modifications. BERT is + conceptually simple and empirically powerful. 
It obtains new + state-of-the-art results on eleven natural language processing + tasks, including pushing the GLUE score to 80.5 (7.7 point + absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%} + absolute improvement), SQuAD v1.1 question answering Test F1 + to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test + F1 to 83.1 (5.1 point absolute improvement).", +} + +@article{https://doi.org/10.48550/arxiv.2007.04825, + doi = {10.48550/ARXIV.2007.04825}, + url = {https://arxiv.org/abs/2007.04825}, + author = {Vyas, Apoorv and Katharopoulos, Angelos and Fleuret, + François}, + keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: + Computer and information sciences, FOS: Computer and + information sciences}, + title = {Fast Transformers with Clustered Attention}, + publisher = {arXiv}, + year = 2020, + copyright = {arXiv.org perpetual, non-exclusive license} +} + +@inproceedings{bastings-filippova-2020-elephant, + title = "The elephant in the interpretability room: Why use + attention as explanation when we have saliency methods?", + author = "Bastings, Jasmijn and Filippova, Katja", + booktitle = "Proceedings of the Third BlackboxNLP Workshop on Analyzing + and Interpreting Neural Networks for NLP", + month = nov, + year = 2020, + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2020.blackboxnlp-1.14", + doi = "10.18653/v1/2020.blackboxnlp-1.14", + pages = "149--155", + abstract = "There is a recent surge of interest in using attention as + explanation of model predictions, with mixed evidence on + whether attention can be used as such. While attention + conveniently gives us one weight per input token and is easily + extracted, it is often unclear toward what goal it is used as + explanation. 
We find that often that goal, whether explicitly + stated or not, is to find out what input tokens are the most + relevant to a prediction, and that the implied user for the + explanation is a model developer. For this goal and user, we + argue that input saliency methods are better suited, and that + there are no compelling reasons to use attention, despite the + coincidence that it provides a weight for each input. With + this position paper, we hope to shift some of the recent focus + on attention to saliency methods, and for authors to clearly + state the goal and user for their explanations.", +} + +@article {Rao2021.02.12.430858, + author = {Rao, Roshan and Liu, Jason and Verkuil, Robert and Meier, + Joshua and Canny, John F. and Abbeel, Pieter and Sercu, Tom + and Rives, Alexander}, + title = {MSA Transformer}, + elocation-id = {2021.02.12.430858}, + year = 2021, + doi = {10.1101/2021.02.12.430858}, + publisher = {Cold Spring Harbor Laboratory}, + abstract = {Unsupervised protein language models trained across + millions of diverse sequences learn structure and function of + proteins. Protein language models studied to date have been + trained to perform inference from individual sequences. The + longstanding approach in computational biology has been to + make inferences from a family of evo lutionarily related + sequences by fitting a model to each family independently. In + this work we combine the two paradigms. We introduce a protein + language model which takes as input a set of sequences in the + form of a multiple sequence alignment. The model interleaves + row and column attention across the input sequences and is + trained with a variant of the masked language modeling + objective across many protein families. 
The performance of the + model surpasses current state-of-the-art unsupervised + structure learning methods by a wide margin, with far greater + parameter efficiency than prior state-of-the-art protein + language models.Competing Interest StatementThe authors have + declared no competing interest.}, + URL = + {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858}, + eprint = + {https://www.biorxiv.org/content/early/2021/08/27/2021.02.12.430858.full.pdf}, + journal = {bioRxiv} +} + +@article{Jumper2021, + author = {Jumper, John and Evans, Richard and Pritzel, Alexander and + Green, Tim and Figurnov, Michael and Ronneberger, Olaf and + Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'i}dek, + Augustin and Potapenko, Anna and Bridgland, Alex and Meyer, + Clemens and Kohl, Simon A. A. and Ballard, Andrew J. and + Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov, + Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor + and Petersen, Stig and Reiman, David and Clancy, Ellen and + Zielinski, Michal and Steinegger, Martin and Pacholska, + Michalina and Berghammer, Tamas and Bodenstein, Sebastian and + Silver, David and Vinyals, Oriol and Senior, Andrew W. and + Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis}, + title = {Highly accurate protein structure prediction with + AlphaFold}, + journal = {Nature}, + year = 2021, + month = {Aug}, + day = 01, + volume = 596, + number = 7873, + pages = {583-589}, + abstract = {Proteins are essential to life, and understanding their + structure can facilitate a mechanistic understanding of their + function. Through an enormous experimental effort1--4, the + structures of around 100,000 unique proteins have been + determined5, but this represents a small fraction of the + billions of known protein sequences6,7. Structural coverage is + bottlenecked by the months to years of painstaking effort + required to determine a single protein structure. 
Accurate + computational approaches are needed to address this gap and to + enable large-scale structural bioinformatics. Predicting the + three-dimensional structure that a protein will adopt based + solely on its amino acid sequence---the structure prediction + component of the `protein folding problem'8---has been an + important open research problem for more than 50 years9. + Despite recent progress10--14, existing methods fall far short + of atomic accuracy, especially when no homologous structure is + available. Here we provide the first computational method that + can regularly predict protein structures with atomic accuracy + even in cases in which no similar structure is known. We + validated an entirely redesigned version of our neural + network-based model, AlphaFold, in the challenging 14th + Critical Assessment of protein Structure Prediction + (CASP14)15, demonstrating accuracy competitive with + experimental structures in a majority of cases and greatly + outperforming other methods. Underpinning the latest version + of AlphaFold is a novel machine learning approach that + incorporates physical and biological knowledge about protein + structure, leveraging multi-sequence alignments, into the + design of the deep learning algorithm.}, + issn = {1476-4687}, + doi = {10.1038/s41586-021-03819-2}, + url = {https://doi.org/10.1038/s41586-021-03819-2} +}