Generate FASTQ files from the simulated repertoire

This commit is contained in:
coolneng 2021-04-22 13:36:43 +02:00
parent 4adb92e901
commit 4dec2061fc
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
4 changed files with 18 additions and 17 deletions

1
.gitignore vendored
View File

@ -1,2 +1 @@
*.fasta
*.fastq

View File

@ -11,14 +11,11 @@ fi
sequences=$1
number_of_reads=$2
read_mean_size=350
read_variance_size=0.0
data_directory="data/"
fasta=".fasta"
fastq=".fastq"
filename="sequence"
prefix="curesim_"
Rscript src/repertoire.r "$sequences" &&
java -jar tools/CuReSim.jar -n $((number_of_reads * sequences)) -m "$read_mean_size" -sd "$read_variance_size" -f "$data_directory$filename$fasta" -o "$data_directory$prefix$filename$fastq"
Rscript src/repertoire.r "$sequences" "$number_of_reads" &&
java -jar tools/CuReSim.jar -f "$data_directory$filename$fastq" -o "$data_directory$prefix$filename$fastq"
rm "$data_directory/log.txt"

View File

@ -55,7 +55,6 @@ get_hvr_sequences <- function(sequences, vdj_segments) {
df <- fetch_vj_sequences(sequences, vdj_segments)
v_alignment <- parallel::mcmapply(sequences, df$v_seq, FUN = align_sequence)
j_alignment <- parallel::mcmapply(sequences, df$j_seq, FUN = align_sequence)
print(v_alignment)
}
data <- parse_data(file = "data/curesim_sequence.fastq")

View File

@ -11,26 +11,32 @@ generate_repertoire <- function(number_of_sequences) {
}
save_data <- function(data) {
Biostrings::writeXStringSet(data$sequence, "data/sequence.fasta")
Biostrings::writeXStringSet(data$junction, "data/HVR.fasta")
Biostrings::writeXStringSet(data$sequence,
"data/sequence.fastq",
format = "fastq"
)
Biostrings::writeXStringSet(data$junction, "data/HVR.fastq", format = "fastq")
}
process_data <- function(data) {
process_data <- function(data, reads) {
dna_sequence <- Biostrings::DNAStringSet(data$sequence)
data$sequence <- Biostrings::reverseComplement(dna_sequence)
names(data$sequence) <- paste(rownames(data), data$v_call, data$j_call, " ")
return(data)
data$junction <- Biostrings::DNAStringSet(data$junction)
names(data$junction) <- rownames(data)
amplified_data <- data[rep(seq_len(nrow(data)), reads), ]
return(amplified_data)
}
parse_cli_arguments <- function() {
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 1) {
stop("usage: repertoire.r <number of sequences>")
if (length(args) != 2) {
stop("usage: repertoire.r <number of sequences> <number of reads>")
}
return(args[1])
return(args)
}
argument <- parse_cli_arguments()
repertoire <- generate_repertoire(number_of_sequences = as.integer(argument))
data <- process_data(data = repertoire)
args <- parse_cli_arguments()
repertoire <- generate_repertoire(number_of_sequences = as.integer(args[1]))
data <- process_data(data = repertoire, reads = args[2])
save_data(data)