# locigenesis/src/repertoire.r
#
# 55 lines
# 1.5 KiB
# R
# Raw Normal View History

library(immuneSIM)
library(Biostrings)
# 2021-02-25 20:00:35 +01:00
#' Simulate one repertoire per chain with immuneSIM.
#'
#' Calls `immuneSIM()` twice, first with `chain = "a"` and then with
#' `chain = "b"`. Both calls use `species = "hs"`, `receptor = "tr"` and
#' `verbose = TRUE` (presumably human T-cell receptor alpha/beta chains;
#' confirm against the immuneSIM documentation).
#'
#' @param number_of_sequences Forwarded unchanged as `number_of_seqs` to each
#'   `immuneSIM()` call.
#' @return A list with elements `a_chain` and `b_chain`, each holding the value
#'   returned by the corresponding `immuneSIM()` call.
generate_repertoires <- function(number_of_sequences) {
  # Shared simulation settings live here; only the chain id varies per call.
  simulate_chain <- function(chain_id) {
    immuneSIM(
      number_of_seqs = number_of_sequences,
      species = "hs",
      receptor = "tr",
      chain = chain_id,
      verbose = TRUE
    )
  }

  # lapply() walks the vector in order ("a" first, then "b") and carries the
  # element names over to the resulting list.
  chain_ids <- c(a_chain = "a", b_chain = "b")
  lapply(chain_ids, simulate_chain)
}
#' Expand a repertoire into individual reads and reverse-complement them.
#'
#' @param repertoire Object exposing `sequence` and `counts` via `$`
#'   (presumably the data frame returned by `immuneSIM()`; confirm).
#' @return The value of `Biostrings::reverseComplement()` applied to a
#'   `DNAStringSet` that holds each sequence repeated `counts` times, with the
#'   reads named 1..n.
process_chain <- function(repertoire) {
  # One read per copy: element i of `sequence` is repeated `counts[i]` times.
  expanded <- rep(
    as.character(repertoire$sequence),
    as.integer(repertoire$counts)
  )

  read_set <- Biostrings::DNAStringSet(expanded)
  # Give every read a running index as its name.
  names(read_set) <- seq_len(length(read_set))

  Biostrings::reverseComplement(read_set)
}
#' Apply `process_chain()` to every repertoire in a list.
#'
#' @param repertoires List of repertoires; each element is passed on its own
#'   to `process_chain()`.
#' @return A list of the `process_chain()` results, carrying the same names as
#'   `repertoires`.
preprocess_data <- function(repertoires) {
  stats::setNames(
    lapply(repertoires, process_chain),
    names(repertoires)
  )
}
# 2021-02-26 02:20:11 +01:00
#' Write every repertoire to `<output_dir>/<name>.fastq`.
#'
#' The output directory is created when it does not exist yet, so the script
#' also works on a fresh checkout without a pre-made `data/` folder.
#'
#' @param repertoires Named list; each element is handed to
#'   `Biostrings::writeXStringSet()` and its name becomes the file stem.
#' @param output_dir Directory that receives the files. Defaults to `"data"`,
#'   the location that was previously hard-coded.
#' @return `NULL`, invisibly. Called for its side effect of writing files.
save_data <- function(repertoires, output_dir = "data") {
  if (!dir.exists(output_dir) &&
    !dir.create(output_dir, recursive = TRUE)) {
    stop("could not create output directory: ", output_dir, call. = FALSE)
  }

  for (chain in names(repertoires)) {
    file_name <- file.path(output_dir, paste0(chain, ".fastq"))
    Biostrings::writeXStringSet(
      repertoires[[chain]],
      file_name,
      format = "fastq"
    )
  }

  invisible(NULL)
}
#' Parse the command-line arguments of the script.
#'
#' @param args Character vector of trailing command-line arguments; exactly
#'   one element is expected.
#' @return The single argument converted to a positive integer.
#' @details Stops with the usage message when the argument count is not one,
#'   and with a descriptive error when the argument does not convert to an
#'   integer >= 1.
parse_cli_arguments <- function(args) {
  usage <- "usage: repertoire.r <number of sequences>"
  if (length(args) != 1) {
    stop(usage, call. = FALSE)
  }

  # as.integer() returns NA (plus a warning) for text it cannot convert; mute
  # the warning and report that case as a proper error instead of letting NA
  # flow into the simulation.
  number_of_sequences <- suppressWarnings(as.integer(args[1]))
  if (is.na(number_of_sequences) || number_of_sequences < 1L) {
    stop(
      "<number of sequences> must be a positive integer, got '", args[1],
      "'\n", usage,
      call. = FALSE
    )
  }

  number_of_sequences
}
# Script entry point: read the requested number of sequences from the command
# line, simulate both chains, turn them into reads and write them to disk.
args <- commandArgs(trailingOnly = TRUE)
number_of_sequences <- parse_cli_arguments(args)
sim_repertoire <- generate_repertoires(number_of_sequences)
processed_data <- preprocess_data(sim_repertoire)
# 2021-02-26 02:20:11 +01:00
save_data(processed_data)