From ca6273788440c974c367c4613f866c9d653c1621 Mon Sep 17 00:00:00 2001 From: coolneng Date: Thu, 22 Apr 2021 13:36:43 +0200 Subject: [PATCH] Generate FASTQ files from the simulated repertoire --- generation.sh | 7 ++----- src/alignment.r | 1 - src/repertoire.r | 26 ++++++++++++++++---------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/generation.sh b/generation.sh index dfb62e1..07f7114 100755 --- a/generation.sh +++ b/generation.sh @@ -11,14 +11,11 @@ fi sequences=$1 number_of_reads=$2 -read_mean_size=350 -read_variance_size=0.0 data_directory="data/" -fasta=".fasta" fastq=".fastq" filename="sequence" prefix="curesim_" -Rscript src/repertoire.r "$sequences" && - java -jar tools/CuReSim.jar -n $((number_of_reads * sequences)) -m "$read_mean_size" -sd "$read_variance_size" -f "$data_directory$filename$fasta" -o "$data_directory$prefix$filename$fastq" +Rscript src/repertoire.r "$sequences" "$number_of_reads" && + java -jar tools/CuReSim.jar -f "$data_directory$filename$fastq" -o "$data_directory$prefix$filename$fastq" rm "$data_directory/log.txt" diff --git a/src/alignment.r b/src/alignment.r index 2067f92..0920b0d 100644 --- a/src/alignment.r +++ b/src/alignment.r @@ -55,7 +55,6 @@ get_hvr_sequences <- function(sequences, vdj_segments) { df <- fetch_vj_sequences(sequences, vdj_segments) v_alignment <- parallel::mcmapply(sequences, df$v_seq, FUN = align_sequence) j_alignment <- parallel::mcmapply(sequences, df$j_seq, FUN = align_sequence) - print(v_alignment) } data <- parse_data(file = "data/curesim_sequence.fastq") diff --git a/src/repertoire.r b/src/repertoire.r index a3669ff..8edb480 100644 --- a/src/repertoire.r +++ b/src/repertoire.r @@ -11,26 +11,32 @@ generate_repertoire <- function(number_of_sequences) { } save_data <- function(data) { - Biostrings::writeXStringSet(data$sequence, "data/sequence.fasta") - Biostrings::writeXStringSet(data$junction, "data/HVR.fasta") + Biostrings::writeXStringSet(data$sequence, + "data/sequence.fastq", + format = "fastq" + ) + Biostrings::writeXStringSet(data$junction, "data/HVR.fastq", format = "fastq") } -process_data <- function(data) { +process_data <- function(data, reads) { dna_sequence <- Biostrings::DNAStringSet(data$sequence) data$sequence <- Biostrings::reverseComplement(dna_sequence) names(data$sequence) <- paste(rownames(data), data$v_call, data$j_call, " ") - return(data) + data$junction <- Biostrings::DNAStringSet(data$junction) + names(data$junction) <- rownames(data) + amplified_data <- data[rep(seq_len(nrow(data)), reads), ] + return(amplified_data) } parse_cli_arguments <- function() { args <- commandArgs(trailingOnly = TRUE) - if (length(args) != 1) { - stop("usage: repertoire.r ") + if (length(args) != 2) { + stop("usage: repertoire.r ") } - return(args[1]) + return(args) } -argument <- parse_cli_arguments() -repertoire <- generate_repertoire(number_of_sequences = as.integer(argument)) -data <- process_data(data = repertoire) +args <- parse_cli_arguments() +repertoire <- generate_repertoire(number_of_sequences = as.integer(args[1])) +data <- process_data(data = repertoire, reads = args[2]) save_data(data) \ No newline at end of file