57 lines
2.0 KiB
R
57 lines
2.0 KiB
R
library(immuneSIM)
|
|
library(Biostrings)
|
|
|
|
#' Generate the beta chain of a human T-cell receptor (TCR)
|
|
#'
|
|
#' @param number_of_sequences Number of different sequences to generate
|
|
#' @return A \code{data.frame} with the sequences, V and J genes and CDR3
|
|
generate_repertoire <- function(number_of_sequences) {
|
|
return(immuneSIM(
|
|
number_of_seqs = number_of_sequences,
|
|
species = "hs",
|
|
receptor = "tr",
|
|
chain = "b"
|
|
))
|
|
}
|
|
|
|
#' Saves the sequences and CDR3 to FASTQ files
|
|
#'
|
|
#' @param data A \code{data.frame} with the preprocessed TCR sequences and CDR3
|
|
save_data <- function(data) {
|
|
Biostrings::writeXStringSet(data$sequence,
|
|
"data/sequence.fastq",
|
|
format = "fastq"
|
|
)
|
|
Biostrings::writeXStringSet(data$junction, "data/HVR.fastq", format = "fastq")
|
|
}
|
|
|
|
#' Applies the reverse complement and amplifies the number of sequences
|
|
#'
|
|
#' @param data A \code{data.frame} containing the TCR sequences and CDR3
|
|
#' @param reads Number of times to amplify each sequence
|
|
#' @return A \code{data.frame} with reverse complement sequences and VJ metadata
|
|
process_data <- function(data, reads) {
|
|
dna_sequence <- Biostrings::DNAStringSet(data$sequence)
|
|
data$sequence <- Biostrings::reverseComplement(dna_sequence)
|
|
names(data$sequence) <- paste(rownames(data), data$v_call, data$j_call, " ")
|
|
data$junction <- Biostrings::DNAStringSet(data$junction)
|
|
names(data$junction) <- rownames(data)
|
|
amplified_data <- data[rep(seq_len(nrow(data)), reads), ]
|
|
return(amplified_data)
|
|
}
|
|
|
|
#' Checks the number of command line arguments and captures them
|
|
#'
|
|
#' @return A \code{vector} containing the command line arguments
|
|
parse_cli_arguments <- function() {
|
|
args <- commandArgs(trailingOnly = TRUE)
|
|
if (length(args) != 2) {
|
|
stop("usage: repertoire.r <number of sequences> <number of reads>")
|
|
}
|
|
return(args)
|
|
}
|
|
|
|
args <- parse_cli_arguments()
|
|
repertoire <- generate_repertoire(number_of_sequences = as.integer(args[1]))
|
|
data <- process_data(data = repertoire, reads = args[2])
|
|
save_data(data) |