diff --git a/src/alignment.r b/src/alignment.r index 5d9adcc..d28e028 100644 --- a/src/alignment.r +++ b/src/alignment.r @@ -1,6 +1,10 @@ library(Biostrings) library(parallel) +#' Import and process the TCR and VJ sequences +#' +#' @param file A file path with the sequences after applying a read simulator +#' @return A \code{list} with the TCR sequences and VJ sequences parse_data <- function(file) { reversed_sequences <- Biostrings::readQualityScaledDNAStringSet(file) sequences <- Biostrings::reverseComplement(reversed_sequences) @@ -11,6 +15,10 @@ parse_data <- function(file) { return(list(sequences, vj_segments)) } +#' Extracts the VJ metadata from the sequences read identifier +#' +#' @param metadata The read identifier of a sequence +#' @return A \code{list} with the V and J gene identifier parse_metadata <- function(metadata) { id_elements <- unlist(strsplit(metadata, split = " ")) v_identifier <- id_elements[2] @@ -18,12 +26,24 @@ parse_metadata <- function(metadata) { return(list(v_id = v_identifier, j_id = j_identifier)) } +#' Fetches the sequence that matches the VJ gene identifier +#' +#' @param names The names of the VJ sequences +#' @param vdj_segments A \code{DNAStringSet} containing the VJ sequences +#' @param id The read identifier of a sequence +#' @return A \code{character} containing the gene sequence match_id_sequence <- function(names, vdj_segments, id) { matches <- grep(names, pattern = id) row <- matches[1] return(as.character(vdj_segments[row])) } +#' Gets the V and J sequences for a particular read identifier +#' +#' @param metadata The read identifier of a sequence +#' @param names The names of the VJ sequences +#' @param vdj_segments A \code{DNAStringSet} containing the VJ sequences +#' @return A \code{list} with the V and J sequences get_vj_sequence <- function(metadata, names, vdj_segments) { identifiers <- parse_metadata(metadata) v_sequence <- match_id_sequence(names, vdj_segments, id = identifiers["v_id"]) @@ -31,6 +51,11 @@ get_vj_sequence <- function(metadata, names, vdj_segments) { return(list(v_seq = v_sequence, j_seq = j_sequence)) } +#' Obtains the VJ sequences for all the TCR sequences +#' +#' @param sequences A \code{QualityScaledDNAStringSet} with the TCR sequences +#' @param vdj_segments A \code{DNAStringSet} containing the VJ sequences +#' @return A \code{data.frame} with the V and J sequences fetch_vj_sequences <- function(sequences, vdj_segments) { vj_sequences <- sapply(names(sequences), names(vdj_segments), @@ -41,6 +66,11 @@ fetch_vj_sequences <- function(sequences, vdj_segments) { return(results) } +#' Perform a pairwise alignment of a sequence with the canonical V or J sequence +#' +#' @param sequence A \code{DNAString} containing the TCR sequences +#' @param vdj_segment A \code{DNAString} containing the V or J sequence +#' @return A \code{PairwiseAlignments} align_sequence <- function(sequence, vdj_segment) { return(Biostrings::pairwiseAlignment( subject = sequence, @@ -50,6 +80,13 @@ align_sequence <- function(sequence, vdj_segment) { )) } +#' Computes the coordinate shift of the Cysteine due to indels +#' +#' @param insertion An \code{IRanges} containing the insertions +#' @param deletion An \code{IRanges} containing the deletions +#' @param cys A \code{list} with the Cysteine coordinates +#' @param alignment A \code{PairwiseAlignments} +#' @return A \code{list} with the delta of the Cysteine coordinates handle_indels <- function(insertion, deletion, cys, alignment) { ins_start <- sum(Biostrings::width(deletion[start(deletion) <= cys$start])) ins_end <- sum(Biostrings::width(deletion[end(deletion) <= cys$end])) @@ -60,6 +97,10 @@ handle_indels <- function(insertion, deletion, cys, alignment) { return(list("start" = ins_start - gaps, "end" = ins_end - gaps)) } +#' Find the coordinates of the first Cysteine of the HVR +#' +#' @param alignment A \code{PairwiseAlignments} +#' @return A \code{list} with the Cysteine coordinates get_cys_coordinates <- function(alignment) { cys <- list("start" = 310, "end" = 312) insertion <- unlist(Biostrings::insertion(alignment)) @@ -70,6 +111,12 @@ get_cys_coordinates <- function(alignment) { return(list("start" = cys_start, "end" = cys_end)) } +#' Delimit the hypervariable region (HVR) for each TCR sequence +#' +#' @param sequences A \code{QualityScaledDNAStringSet} with the TCR sequences +#' @param vdj_segments A \code{DNAStringSet} containing the VJ sequences +#' @param cores Number of cores to apply multiprocessing +#' @return A \code{QualityScaledDNAStringSet} containing the HVR get_hvr_sequences <- function(sequences, vdj_segments, cores = detectCores()) { df <- fetch_vj_sequences(sequences, vdj_segments) v_alignment <- parallel::mcmapply(sequences,