library(immuneSIM) library(Biostrings) #' Generate the beta chain of a human T-cell receptor (TCR) #' #' @param number_of_sequences Number of different sequences to generate #' @return A \code{data.frame} with the sequences, V and J genes and CDR3 generate_repertoire <- function(number_of_sequences) { return(immuneSIM( number_of_seqs = number_of_sequences, species = "hs", receptor = "tr", chain = "b" )) } #' Saves the sequences and CDR3 to FASTQ files #' #' @param data A \code{data.frame} with the preprocessed TCR sequences and CDR3 save_data <- function(data) { Biostrings::writeXStringSet(data$sequence, "data/sequence.fastq", format = "fastq" ) Biostrings::writeXStringSet(data$junction, "data/HVR.fastq", format = "fastq") } #' Applies the reverse complement and amplifies the number of sequences #' #' @param data A \code{data.frame} containing the TCR sequences and CDR3 #' @param reads Number of times to amplify each sequence #' @return A \code{data.frame} with reverse complement sequences and VJ metadata process_data <- function(data, reads) { dna_sequence <- Biostrings::DNAStringSet(data$sequence) data$sequence <- Biostrings::reverseComplement(dna_sequence) names(data$sequence) <- paste(rownames(data), data$v_call, data$j_call, " ") data$junction <- Biostrings::DNAStringSet(data$junction) names(data$junction) <- rownames(data) amplified_data <- data[rep(seq_len(nrow(data)), reads), ] return(amplified_data) } #' Checks the number of command line arguments and captures them #' #' @return A \code{vector} containing the command line arguments parse_cli_arguments <- function() { args <- commandArgs(trailingOnly = TRUE) if (length(args) != 2) { stop("usage: repertoire.r ") } return(args) } args <- parse_cli_arguments() repertoire <- generate_repertoire(number_of_sequences = as.integer(args[1])) data <- process_data(data = repertoire, reads = args[2]) save_data(data)