Load libraries:

########## LOAD LIBRARIES ###########
library(Biostrings)
library(msaR)
library(msa)
library(stringr)
library(dplyr)

1 V and J alignment

1.1 `vj_alignment` function

The vj_alignment function performs the V and J alignments in the TCRβ reads to delimit their HVR.

vj_alignment <- function(reads, Vsegments, Jsegments, numCores = detectCores() - 4, gapOpening = 1){
  #' Align V and J segments in a set of reads to delimit the hypervariable region (HVR)
  #' 
  #' @param reads A \code{DNAStringSet} or a \code{QualityScaledDNAStringSet} containing the reads.
  #' @param Vsegments A \code{DNAStringSet} with the V segments.
  #' @param Jsegments A \code{DNAStringSet} with the J segments trimmed from the Phe codon.
  #' @param numCores Number of cores to execute the function in parallel.
  #' @param gapOpening Penalty for opening a gap in the alignment.
  #' @return A \code{data.frame} with V and J alignment information and HVR sequence, quality and reading frame.
  
  results <- dplyr::bind_rows(mclapply(mc.cores = numCores, X = 1:length(reads), FUN = function(i){
    
  read <- reads[i]  
  
  # Get the read ID
  read_id <- names(read)
  

  #########################################
  ############## V ALIGNMENT ##############
  #########################################
  
  ### Remove IMGT gaps before alignment to avoid them affecting the score ###
  Vsegments_nogaps <- DNAStringSet(gsub("\\.", "", Vsegments))
  names(Vsegments_nogaps) <- names(Vsegments)
  
  ### Perform 147 V alignments and get the score only (NO IMGT GAPS) ###
  v_alignment1 <- pairwiseAlignment(pattern = Vsegments_nogaps, 
                                 subject = read,
                                 type = "global-local",
                                 scoreOnly = TRUE,
                                 gapOpening = gapOpening)
  
  ### Select the segment with the best score ###
  best_v <- Vsegments[which.max(v_alignment1)]
  v_match <- names(best_v)
  
  ### Perform a complete pairwise alignment with the best match (W/ IMGT GAPS, for Cys location) ###
  v_alignment2 <- pairwiseAlignment(pattern = best_v,
                                 subject = read,
                                 type = "global-local",
                                 scoreOnly = FALSE,
                                 gapOpening = gapOpening)
  
  ### Perform a complete pairwise alignment with the best match (NO IMGT GAPS) ###
  best_v_nogaps <- DNAString(gsub("\\.", "", best_v))
  
  v_alignment3 <- pairwiseAlignment(pattern = best_v_nogaps,
                                    subject = read,
                                    type = "global-local",
                                    scoreOnly = FALSE,
                                    gapOpening = gapOpening)
  
  ### Get V alignment information ###
  v_score <- score(v_alignment3) # Score of alignment with no gaps
  v_identity <- pid(v_alignment3, type = "PID3")
  v_start <- start(Views(v_alignment3))
  v_end <- end(Views(v_alignment3))
  

  if (v_score < 0) { # If V score is negative, Cys check nor J alignment are performed 
    
    cys_check <- NA
    cys_status <- NA

    j_match <- NA
    j_score <- NA
    j_identity <- NA
    j_start <- NA
    j_end <- NA
    phe_check <- NA
    
    read_cys_start <- NA
    HVR_seq <- NA
    HVR_qual <- NA
    HVR_frame <- NA

    
  } else { # Only find Cys and align J if V score is positive
  
    #########################################
    ####### FIND CYS, HANDLE INDELS #########
    #########################################
    
    # Cys coordinates in the alignment, ideally
    cys_start <- 310
    cys_end <- 312
    
    # If there is any insertion in the read, Cys coordinates will be shifted in the alignment
    v_ins <- unlist(deletion(v_alignment2)) # IRanges with V insertions
    alg_cys_start <- cys_start + sum(width(v_ins[start(v_ins) <= cys_start])) # Cys start in alignment
    alg_cys_end <- cys_end + sum(width(v_ins[start(v_ins) <= cys_end])) # Cys end in alignment
    
    message(alg_cys_start, " ", alg_cys_end)
    
    # Find Cys coordinates in the read
    v_del <- unlist(insertion(v_alignment2)) # IRanges with deletions in subject (mostly IMGT gaps)
    print(v_del)
    num_v_del <- length(v_del) # Number of deletions (of any length)
    shift <- c(0, cumsum(width(v_del))[-num_v_del]) 
    v_del_sft <- shift(v_del, shift) # Fix coordinates
    subject_gaps <- sum(width(v_del_sft[end(v_del_sft) < alg_cys_start])) + nchar(str_extract(alignedSubject(v_alignment2), "^-*")) # Number of gaps in subject before Cys in the alignment. The "nchar" term takes into account the gaps at the beginning of the subject, which are not considered by the "insertion" function.
    
    read_cys_start <- v_start + alg_cys_start - subject_gaps -1 # Cys start in the read
    message(read_cys_start)
    
    # Get aligned Cys sequences in pattern and subject
    cys_subject <- subseq(alignedSubject(v_alignment2), start = alg_cys_start, end = alg_cys_end) # Cys sequence in subject
    message(cys_subject)
    gap_subject <- unlist(gregexpr(pattern = "-", text = cys_subject)) # Gap position in subject (deletion), if any
    
    cys_pattern <- subseq(alignedPattern(v_alignment2), start = alg_cys_start, end = alg_cys_end) # Cys sequence in pattern
    gap_pattern <- unlist(gregexpr(pattern = "-", text = cys_pattern)) # Gap position in pattern (insertion), if any
    
    # Check Cys sequence in subject
    if (cys_subject %in% c("TGT", "TGC")) { # Cys codon found
    
      cys_check <- TRUE
    
      if (cys_subject == cys_pattern) { # Same Cys codon in pattern and subject
        cys_status <- "Match"
    
      } else { # Different Cys codon in pattern and subject
        cys_status <- paste0("Pseudomatch (", cys_pattern," > ", cys_subject, ")")
      }
    
    } else if (all(gap_pattern > 0)) { # Insertion in subject, fix it by removing it
      cys_status <- paste0("Insertion (", cys_pattern," > ", cys_subject, "), removed")
    
      read_ins_pos <- read_cys_start + as.numeric(gap_pattern) -1 # Insertion position in read
      
      # Delete insertion in read
      fix_read <- as.character(read)
      fix_qual <- quality(read)
      stringr::str_sub(fix_read, read_ins_pos[1], read_ins_pos[length(read_ins_pos)]) <- "" # Remove Cys insertion in the read
      stringr::str_sub(fix_qual, read_ins_pos[1], read_ins_pos[length(read_ins_pos)]) <- "" # Remove Cys insertion in the quality string
      fix_qual <- as(fix_qual, new.quality.class) # Convert to PhredQuality class
      read_id <- names(read)
      read <- QualityScaledDNAStringSet(fix_read, fix_qual) # Rebuild QualityScaledDNAStringSet without Cys insertion
      names(read) <- read_id
      
      cys_codon <- subseq(read, start = read_cys_start, end = read_cys_start + 2)
      cys_check <- ifelse(cys_codon %in% c("TGT", "TGC"), TRUE, FALSE)
      
    } else if (all(gap_subject > 0)) { # Deletion in subject
      cys_status <- paste0("Deletion (", cys_pattern," > ", cys_subject, ")")
      cys_check <- FALSE
    
    } else { # Mismatch
      cys_status <- paste0("Mismatch (", cys_pattern," > ", cys_subject, ")")
      cys_check <- FALSE
    }
    
    
    
    
    
    #########################################
    ############## J ALIGNMENT ##############
    #########################################
    
    ### Remove V match from the read
    remaining <- subseq(read, start = read_cys_start + 3) # Align J from Cys codon (not included)
    
    ### Perform 16 J alignments and get the score only
    j_alignment1 <- pairwiseAlignment(pattern = Jsegments, 
                                   subject = remaining,
                                   type = "global-local",
                                   scoreOnly = TRUE,
                                   gapOpening = gapOpening)
    
    ### Select the segment with the best score
    best_j <- Jsegments[which.max(j_alignment1)]
    j_match <- names(best_j)
    
    ### Perform a complete pairwise alignment with the best match
    j_alignment2 <- pairwiseAlignment(pattern = best_j,
                                   subject = remaining,
                                   type = "global-local",
                                   scoreOnly = FALSE,
                                   gapOpening = gapOpening)
    
    ### Get J alignment information
    j_score <- score(j_alignment2)
    j_identity <- pid(j_alignment2, type = "PID3")
    j_start <- start(Views(j_alignment2)) + read_cys_start + 2 # Get the coordinates **in the read**, not in the sequence without the V match
    j_end <- end(Views(j_alignment2)) + read_cys_start + 2
    
    ### Check if the first codon is a Phe
    phe_codon <- subseq(read, start = j_start, end = j_start + 2)
    phe_check <- phe_codon %in% c("TTT", "TTC")
    
    

    
    #########################################
    ############## HVR LOCATION #############
    #########################################
    
    ### Get HVR sequence and sequencing quality
    HVR <- subseq(read, start = read_cys_start, end = j_start + 2)
    HVR_seq <- as.character(HVR)
    HVR_qual <- as.character(quality(HVR))
    
    # Find out HVR frame (0: Cys and Phe in frame; 1 OR 2: out of frame)
    HVR_frame <- nchar(HVR_seq) %% 3
    
    if (HVR_frame != 0) { # If the remainder is 1, frame = 2 and viceversa
      HVR_frame <- ifelse(HVR_frame == 1, 2, 1)
    }
  }
  
  
    
  
  #########################################
  ################# OUTPUT ################
  #########################################
  
  df <- data.frame(
    read_ID = read_id,
    read_length = width(read),
    
    V_match = v_match,
    V_score = v_score,
    V_identity = v_identity,
    V_start = v_start,
    Cys_check = cys_check,
    Cys_status = cys_status,
    
    J_match = j_match,
    J_score = j_score,
    J_identity = j_identity,
    J_end = j_end,
    Phe_check = phe_check,
    
    HVR_start = read_cys_start,
    HVR_end = j_start + 2,
    HVR_sequence = HVR_seq,
    HVR_quality = HVR_qual,
    HVR_frame = HVR_frame
    
  ) %>% mutate(read_ID = as.character(read_ID),
               V_match = as.character(V_match),
               J_match = as.character(J_match),
               Cys_status = as.character(Cys_status),
               HVR_sequence = as.character(HVR_sequence),
               HVR_quality = as.character(HVR_quality))

  
  return(df)
  }))
}

1.2 Run alignment

Load input:

v_segments and j_segments: both are DNAStringset that contains the reference sequences for V (147 sequences: all 64 segments + alleles) and J segments (16 sequences).
examples: QualityScaledDNAStringSet that contains three example reads and their sequencing qualities. The reads have been previously reverse-complemented (!!!).

###############################################################################################
####################### CONVERT .fastq TO QualityScaledDNAStringSet ###########################
###############################################################################################

# curesim_fq <- ShortRead::readFastq("software/CuReSim/output/00_first_test.fastq") # Read .fq file and store in a ShortReadQ class object
# reads_sim <- ShortRead::sread(curesim_fq) # DNAStringSet with the reads
# quality_sim <- quality(curesim_fq) # BStringSet with the sequencing qualities
# 
# new.quality.class <- switch(class(quality_sim), # Convert from BStringSet to XStringQuality
#                                       SFastqQuality="SolexaQuality",
#                                       FastqQuality="PhredQuality",
#                                       "XStringQuality")
# quality_sim <- as(quality_sim, new.quality.class)
# 
# sample_sim <- QualityScaledDNAStringSet(reads_sim, quality_sim) # Get QualityScaledDNAStringSet to use as a pairwiseAlignment input
# names(sample_sim) <- as.character(curesim_fq@id) # Character vector with the reads IDs
# 
# sample_sim_rc <- reverseComplement(sample_sim) # V-CDR3-J

(!!!) ImmuneSIM-generated reads have the following structure: V-CDR3-J. To mimic the sequencing process of our collaborators, before CuReSim, reads need to be reverse-complemented (J-CDR3-V), so that CDR3 lies at the beginning of the read, where sequencing quality is better. However, to perform V and J alignment, the reads must be reverse-complemented again (V-CDR3-J) for the alignment to perform correctly. Reverse complement can be performed with Biostrings::reverseComplement() function.

# Load reference sequences (Class: DNAStringSet)
v_segments <- readRDS("v_segments.rds")
j_segments <- readRDS("j_segments_phe.rds")
  
# Load sample (Class: QualityScaledDNAStringSet)
examples <- readRDS("example_reads.rds")

# Run alignment
start_time <- Sys.time()

vj_alg <- vj_alignment(reads = examples, 
                       Vsegments = v_segments, 
                       Jsegments = j_segments,
                       numCores = detectCores() - 4) ### Alignment parallelization (alignment of 1 read vs all reference sequences per thread)

end_time <- Sys.time()

# Show results
DT::datatable(vj_alg,
              class = 'nowrap compact order-column',
              extensions = c('Buttons',  'FixedColumns'),
              options = list(dom = 'Bfrtip',
                             buttons = c('csv', 'excel'),
                             scrollX = TRUE,
                             fixedColumns = TRUE))

end_time - start_time

## Time difference of 2.013929 secs

2 Alignment visualization

2.1 `vj_align_vis` function

vj_align_vis <- function(alignmentOutput, readsDSS, vseg = v_segments, jseg = j_segments, whichSegment, gapOpening = 1, seqLogo = FALSE, widgHeight = 100){
  row <- alignmentOutput
  segment <- paste0(whichSegment, "_match")
  
  read <- readsDSS[row[1, "read_ID"]]
  match <- row[1, segment]
  HVR_start <- row[1, "HVR_start"]
  
  if (whichSegment == "V") {
    best <- vseg[match]
  } else {
    best <- jseg[match]
    read <- subseq(read, start = HVR_start + 3)
  }
  
  alg <- pairwiseAlignment(pattern = best,
                                 subject = read,
                                 type = "global-local",
                                 scoreOnly = FALSE,
                                 gapOpening = gapOpening)
  
  seq <- DNAStringSet(c(alignedSubject(alg), alignedPattern(alg)))
  
  widget <- msaR(msa = seq,
              overviewbox = FALSE,
              seqlogo = seqLogo,
              alignmentHeight = 100,
              height = widgHeight)
  
  return(widget)
}

2.2 Visualize alignments

To easily see sequence conservation enable sequence logo (Vis.elements > Show sequence logo).

rownum <- 1 # Row of the vj_alg dataframe to visualize

V alignment

vj_align_vis(alignmentOutput = vj_alg[rownum,],  
             readsDSS = examples,
             whichSegment = "V")

J alignment

vj_align_vis(alignmentOutput = vj_alg[rownum,],  
             readsDSS = examples,
             whichSegment = "J")

3 Session info

sessionInfo()

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
## [1] dplyr_1.0.2         stringr_1.4.0       msa_1.16.0         
## [4] msaR_0.3.0          Biostrings_2.52.0   XVector_0.24.0     
## [7] IRanges_2.18.3      S4Vectors_0.22.1    BiocGenerics_0.30.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.5        compiler_3.6.1    pillar_1.4.6      tools_3.6.1      
##  [5] zlibbioc_1.30.0   digest_0.6.27     jsonlite_1.7.1    evaluate_0.14    
##  [9] lifecycle_0.2.0   tibble_3.0.4      nlme_3.1-140      lattice_0.20-38  
## [13] pkgconfig_2.0.3   rlang_0.4.8       crosstalk_1.1.0.1 yaml_2.2.1       
## [17] xfun_0.19         knitr_1.30        generics_0.1.0    vctrs_0.3.4      
## [21] htmlwidgets_1.5.2 grid_3.6.1        DT_0.16           tidyselect_1.1.0 
## [25] glue_1.4.2        R6_2.5.0          rmarkdown_2.5     purrr_0.3.4      
## [29] magrittr_1.5      htmltools_0.5.0   ellipsis_0.3.1    ape_5.4-1        
## [33] stringi_1.5.3     crayon_1.3.4

V and J segments alignment visualization

M. S. Benítez-Cantos

01/03/2021

1 V and J alignment

1.1 `vj_alignment` function

1.2 Run alignment

2 Alignment visualization

2.1 `vj_align_vis` function

2.2 Visualize alignments

3 Session info

V and J segments alignment visualization

M. S. Benítez-Cantos

01/03/2021

1 V and J alignment

1.1 vj_alignment function

1.2 Run alignment

2 Alignment visualization

2.1 vj_align_vis function

2.2 Visualize alignments

3 Session info

1.1 `vj_alignment` function

2.1 `vj_align_vis` function