Document the alignment script

2021-05-04 19:25:11 +02:00 · 2021-05-04 19:25:11 +02:00 · e8f03189c2
parent f4b7a41599
commit e8f03189c2
1 changed files with 47 additions and 0 deletions
--- a/src/alignment.r
+++ b/src/alignment.r
@ -1,6 +1,10 @@
 library(Biostrings)
 library(parallel)
 #' Import and process the TCR and VJ sequences
 #'
 #' @param file A file path with the sequences after applying a read simulator
 #' @return A \code{list} with the TCR sequences and VJ sequences
 parse_data <- function(file) {
  reversed_sequences <- Biostrings::readQualityScaledDNAStringSet(file)
  sequences <- Biostrings::reverseComplement(reversed_sequences)
@ -11,6 +15,10 @@ parse_data <- function(file) {
  return(list(sequences, vj_segments))
 }
 #' Extracts the VJ metadata from the sequences read identifier
 #'
 #' @param metadata The read identifier of a sequence
 #' @return A \code{list} with the V and J gene identifier
 parse_metadata <- function(metadata) {
  id_elements <- unlist(strsplit(metadata, split = " "))
  v_identifier <- id_elements[2]
@ -18,12 +26,24 @@ parse_metadata <- function(metadata) {
  return(list(v_id = v_identifier, j_id = j_identifier))
 }
 #' Fetches the sequence that matches the VJ gene identifier
 #'
 #' @param names The names of the VJ sequences
 #' @param vdj_segments A \code{DNAStringSet} containing the VJ sequences
 #' @param id The read identifier of a sequence
 #' @return A \code{character} containing the gene sequence
 match_id_sequence <- function(names, vdj_segments, id) {
  matches <- grep(names, pattern = id)
  row <- matches[1]
  return(as.character(vdj_segments[row]))
 }
 #' Gets the V and J sequences for a particular read identifier
 #'
 #' @param metadata The read identifier of a sequence
 #' @param names The names of the VJ sequences
 #' @param vdj_segments A \code{DNAStringSet} containing the VJ sequences
 #' @return A \code{list} with the V and J sequences
 get_vj_sequence <- function(metadata, names, vdj_segments) {
  identifiers <- parse_metadata(metadata)
  v_sequence <- match_id_sequence(names, vdj_segments, id = identifiers["v_id"])
@ -31,6 +51,11 @@ get_vj_sequence <- function(metadata, names, vdj_segments) {
  return(list(v_seq = v_sequence, j_seq = j_sequence))
 }
 #' Obtains the VJ sequences for all the TCR sequences
 #'
 #' @param sequences A \code{QualityScaledDNAStringSet} with the TCR sequences
 #' @param vdj_segments A \code{DNAStringSet} containing the VJ sequences
 #' @return A \code{data.frame} with the V and J sequences
 fetch_vj_sequences <- function(sequences, vdj_segments) {
  vj_sequences <- sapply(names(sequences),
    names(vdj_segments),
@ -41,6 +66,11 @@ fetch_vj_sequences <- function(sequences, vdj_segments) {
  return(results)
 }
 #' Perform a pairwise alignment of a sequence with the canonical V or J sequence
 #'
 #' @param sequence A \code{DNAString} containing the TCR sequences
 #' @param vdj_segment A \code{DNAString} containing the V or J sequence
 #' @return A \code{PairwiseAlignments}
 align_sequence <- function(sequence, vdj_segment) {
  return(Biostrings::pairwiseAlignment(
    subject = sequence,
@ -50,6 +80,13 @@ align_sequence <- function(sequence, vdj_segment) {
  ))
 }
 #' Computes the coordinate shift of the Cysteine due to indels
 #'
 #' @param insertion An \code{IRanges} containing the insertions
 #' @param deletion An \code{IRanges} containing the deletions
 #' @param cys A \code{list} with the Cysteine coordinates
 #' @param alignment A \code{PairwiseAlignments}
 #' @return A \code{list} with the delta of the Cysteine coordinates
 handle_indels <- function(insertion, deletion, cys, alignment) {
  ins_start <- sum(Biostrings::width(deletion[start(deletion) <= cys$start]))
  ins_end <- sum(Biostrings::width(deletion[end(deletion) <= cys$end]))
@ -60,6 +97,10 @@ handle_indels <- function(insertion, deletion, cys, alignment) {
  return(list("start" = ins_start - gaps, "end" = ins_end - gaps))
 }
 #' Find the coordinates of the first Cysteine of the HVR
 #'
 #' @param alignment A \code{PairwiseAlignments}
 #' @return A \code{list} with the Cysteine coordinates
 get_cys_coordinates <- function(alignment) {
  cys <- list("start" = 310, "end" = 312)
  insertion <- unlist(Biostrings::insertion(alignment))
@ -70,6 +111,12 @@ get_cys_coordinates <- function(alignment) {
  return(list("start" = cys_start, "end" = cys_end))
 }
 #' Delimit the hypervariable region (HVR) for each TCR sequence
 #'
 #' @param sequences A \code{QualityScaledDNAStringSet} with the TCR sequences
 #' @param vdj_segments A \code{DNAStringSet} containing the VJ sequences
 #' @param cores Number of cores to apply multiprocessing
 #' @return A \code{QualityScaledDNAStringSet} containing the HVR
 get_hvr_sequences <- function(sequences, vdj_segments, cores = detectCores()) {
  df <- fetch_vj_sequences(sequences, vdj_segments)
  v_alignment <- parallel::mcmapply(sequences,