From 34fefed3ed8ac2ef4382934518ea448963aeaad2 Mon Sep 17 00:00:00 2001 From: coolneng Date: Thu, 6 May 2021 20:35:31 +0200 Subject: [PATCH] Add literate programming notebook --- docs/experiments.org | 72 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 docs/experiments.org diff --git a/docs/experiments.org b/docs/experiments.org new file mode 100644 index 0000000..edad193 --- /dev/null +++ b/docs/experiments.org @@ -0,0 +1,72 @@ +#+TITLE: Tensorflow experiments +#+AUTHOR: Amin Kasrou Aouam +#+PROPERTY: header-args :session poetry-session +* Experiments + +#+begin_src elisp :results silent +(pyvenv-activate "~/.cache/pypoetry/virtualenvs/locimend-hM_4JND0-py3.8/") +#+end_src + +In this notebook we'll extract knowledge from our generated dataset. First, let's import our dependencies: + +#+begin_src python +from tensorflow_io import genome +#+end_src + +#+RESULTS: +: 2021-05-06 20:41:53.592058: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /nix/store/9ilyrqidrjbqvmnn8ykjc7lygdd86g7q-gcc-10.2.0-lib/lib: +: 2021-05-06 20:41:53.592101: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. 
+ + +TensorFlow I/O is an extension that contains a module for genome parsing; we'll use it to import the sequences contained in our FASTQ files: + +#+begin_src python :results silent +def parse_data(filepath): + HVR = genome.read_fastq(filename=filepath) + return HVR.sequences, HVR.raw_quality +#+end_src + +Let's import both the immuneSIM-generated HVR dataset and the CuReSim-processed one, which contains sequencing errors (mostly indels): + +#+begin_src python +original_HVR, _ = parse_data("../data/HVR.fastq") +processed_HVR, _ = parse_data("../data/CuReSim-HVR.fastq") +print(original_HVR) +print(processed_HVR) +#+end_src + +#+RESULTS: +#+begin_example +tf.Tensor( +[b'TGTGCCAGCAGCTTAACCATCGGACGCAGTACTTCGGGCCAGGCACGCGGCTCCTGG' + b'TGTGCCAGCAGCTTAACCATCGGACGCAGTACTTCGGGCCAGGCACGCGGCTCCTGG' + b'TGTGCCAGCAGCTTAACCATCGGACGCAGTACTTCGGGCCAGGCACGCGGCTCCTGG' + b'TGTGCCAGCAGCTTAACCATCGGACGCAGTACTTCGGGCCAGGCACGCGGCTCCTGG' + b'TGTGCCAGCAGCTTAACCATCGGACGCAGTACTTCGGGCCAGGCACGCGGCTCCTGG' + b'TGTGCCAGCAGCTTAACCATCGGACGCAGTACTTCGGGCCAGGCACGCGGCTCCTGG' + b'TGTGCCAGCAGCTTAACCATCGGACGCAGTACTTCGGGCCAGGCACGCGGCTCCTGG' + b'TGTGCCAGCAGCTTAACCATCGGACGCAGTACTTCGGGCCAGGCACGCGGCTCCTGG' + b'TGTGCCAGCAGCTTAACCATCGGACGCAGTACTTCGGGCCAGGCACGCGGCTCCTGG' + b'TGTGCCAGCAGCTTAACCATCGGACGCAGTACTTCGGGCCAGGCACGCGGCTCCTGG'], shape=(10,), dtype=string) +tf.Tensor( +[b'GCGCCAGCAGCTATTGGATATGGACTAGCTACTC' + b'TGTGCCAGCAGTGATGTGGTGACATGGGTGCGTAGCAATCAGCCAGCATG' + b'GCGCCAGCAGCTTGGATAGGACTAGCTACTT' + b'TGTGCCAGCAGTGAATGGGTGACAGGGTGCGTAGCATCAGCCCCAGCATTT' + b'TTGCGCAGCAGCTTGGATAGGACTAGCTACTT' + b'TGTGCCAGCAGTGAATGGGGACAGGGGCGTAGCAATCAGCCCCAGCATTT' + b'TTGCGCCAGCAGCTTGGATAGGACTAGCTACTT' + b'TGTGCAGCAGTGAATGGGGACAGGGGCGTAGCAATCAGCCCCAGCATTT' + b'TGCGCCAGCAGCTTGGATAGGACTAGCTACTT' + b'TGTGCCAGCAGTGAATGGGGACAGGGGCGTAGCAATCAGCCCAGCATTT' + b'TTGCGCCAGCAGCTTGGATAGGACTAGCTACTT' + b'TGTGCCAGCAGTGAATGGGGACAGGGGCGTAGCAATCAGCCCCAGCATTT' + b'TGCGCCAGCAGCTTGGATAGGACTAGCTACTT' + 
b'TGTGCCAGCAGTGAATGGGGACAGGGGCGTAGCAATCAGCCCCAGCATTT' + b'TGCGCCAGCAGCTTGGATAGGACTAGCTACTT' + b'TGTGCCAGCAGTGAATGGGGACAGGGGCGTAGCAATCAGCCCCAGCATTT' + b'TGCGCCAGCAGCTTGGATAGGACTAGCTACTT' + b'TGTGCCAGCAGTGAATGGGGACAGGGGCGTAGCAATCAGCCCCAGCATTT' + b'TGCGCCAGCAGCTTGGATAGGACTAGCTACTT' + b'TGTGCCAGCAGTGAATGGGGACAGGGGCGTAGCAATCAGCCCCAGCATTT'], shape=(20,), dtype=string) +#+end_example