From e8f03189c246f8993ecfdb8941e106cc3310e53f Mon Sep 17 00:00:00 2001
From: coolneng <akasroua@gmail.com>
Date: Tue, 4 May 2021 19:25:11 +0200
Subject: [PATCH] Document the alignment script

---
 src/alignment.r | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/src/alignment.r b/src/alignment.r
index 5d9adcc..d28e028 100644
--- a/src/alignment.r
+++ b/src/alignment.r
@@ -1,6 +1,10 @@
 library(Biostrings)
 library(parallel)
 
+#' Import and process the TCR and VJ sequences
+#'
+#' @param file A file path with the sequences after applying a read simulator
+#' @return A \code{list} with the TCR sequences and VJ sequences
 parse_data <- function(file) {
   reversed_sequences <- Biostrings::readQualityScaledDNAStringSet(file)
   sequences <- Biostrings::reverseComplement(reversed_sequences)
@@ -11,6 +15,10 @@ parse_data <- function(file) {
   return(list(sequences, vj_segments))
 }
 
+#' Extracts the VJ metadata from the sequences read identifier
+#'
+#' @param metadata The read identifier of a sequence
+#' @return A \code{list} with the V and J gene identifier
 parse_metadata <- function(metadata) {
   id_elements <- unlist(strsplit(metadata, split = " "))
   v_identifier <- id_elements[2]
@@ -18,12 +26,24 @@ parse_metadata <- function(metadata) {
   return(list(v_id = v_identifier, j_id = j_identifier))
 }
 
+#' Fetches the sequence that matches the VJ gene identifier
+#'
+#' @param names The names of the VJ sequences
+#' @param vdj_segments A \code{DNAStringSet} containing the VJ sequences
+#' @param id The read identifier of a sequence
+#' @return A \code{character} containing the gene sequence
 match_id_sequence <- function(names, vdj_segments, id) {
   matches <- grep(names, pattern = id)
   row <- matches[1]
   return(as.character(vdj_segments[row]))
 }
 
+#' Gets the V and J sequences for a particular read identifier
+#'
+#' @param metadata The read identifier of a sequence
+#' @param names The names of the VJ sequences
+#' @param vdj_segments A \code{DNAStringSet} containing the VJ sequences
+#' @return A \code{list} with the V and J sequences
 get_vj_sequence <- function(metadata, names, vdj_segments) {
   identifiers <- parse_metadata(metadata)
   v_sequence <- match_id_sequence(names, vdj_segments, id = identifiers["v_id"])
@@ -31,6 +51,11 @@ get_vj_sequence <- function(metadata, names, vdj_segments) {
   return(list(v_seq = v_sequence, j_seq = j_sequence))
 }
 
+#' Obtains the VJ sequences for all the TCR sequences
+#'
+#' @param sequences A \code{QualityScaledDNAStringSet} with the TCR sequences
+#' @param vdj_segments A \code{DNAStringSet} containing the VJ sequences
+#' @return A \code{data.frame} with the V and J sequences
 fetch_vj_sequences <- function(sequences, vdj_segments) {
   vj_sequences <- sapply(names(sequences),
     names(vdj_segments),
@@ -41,6 +66,11 @@ fetch_vj_sequences <- function(sequences, vdj_segments) {
   return(results)
 }
 
+#' Perform a pairwise alignment of a sequence with the canonical V or J sequence
+#'
+#' @param sequence A \code{DNAString} containing the TCR sequences
+#' @param vdj_segment A \code{DNAString} containing the V or J sequence
+#' @return A \code{PairwiseAlignments}
 align_sequence <- function(sequence, vdj_segment) {
   return(Biostrings::pairwiseAlignment(
     subject = sequence,
@@ -50,6 +80,13 @@ align_sequence <- function(sequence, vdj_segment) {
   ))
 }
 
+#' Computes the coordinate shift of the Cysteine due to indels
+#'
+#' @param insertion An \code{IRanges} containing the insertions
+#' @param deletion An \code{IRanges} containing the deletions
+#' @param cys A \code{list} with the Cysteine coordinates
+#' @param alignment A \code{PairwiseAlignments}
+#' @return A \code{list} with the delta of the Cysteine coordinates
 handle_indels <- function(insertion, deletion, cys, alignment) {
   ins_start <- sum(Biostrings::width(deletion[start(deletion) <= cys$start]))
   ins_end <- sum(Biostrings::width(deletion[end(deletion) <= cys$end]))
@@ -60,6 +97,10 @@ handle_indels <- function(insertion, deletion, cys, alignment) {
   return(list("start" = ins_start - gaps, "end" = ins_end - gaps))
 }
 
+#' Find the coordinates of the first Cysteine of the HVR
+#'
+#' @param alignment A \code{PairwiseAlignments}
+#' @return A \code{list} with the Cysteine coordinates
 get_cys_coordinates <- function(alignment) {
   cys <- list("start" = 310, "end" = 312)
   insertion <- unlist(Biostrings::insertion(alignment))
@@ -70,6 +111,12 @@ get_cys_coordinates <- function(alignment) {
   return(list("start" = cys_start, "end" = cys_end))
 }
 
+#' Delimit the hypervariable region (HVR) for each TCR sequence
+#'
+#' @param sequences A \code{QualityScaledDNAStringSet} with the TCR sequences
+#' @param vdj_segments A \code{DNAStringSet} containing the VJ sequences
+#' @param cores Number of cores to apply multiprocessing
+#' @return A \code{QualityScaledDNAStringSet} containing the HVR
 get_hvr_sequences <- function(sequences, vdj_segments, cores = detectCores()) {
   df <- fetch_vj_sequences(sequences, vdj_segments)
   v_alignment <- parallel::mcmapply(sequences,