Load libraries:
########## LOAD LIBRARIES ###########
library(Biostrings)
library(msaR)
library(msa)
library(stringr)
library(dplyr)
vj_alignment
functionThe vj_alignment
function performs the V and J alignments in the TCRβ reads to delimit their HVR.
vj_alignment <- function(reads, Vsegments, Jsegments, numCores = detectCores() - 4, gapOpening = 1){
#' Align V and J segments in a set of reads to delimit the hypervariable region (HVR)
#'
#' @param reads A \code{DNAStringSet} or a \code{QualityScaledDNAStringSet} containing the reads.
#' @param Vsegments A \code{DNAStringSet} with the V segments.
#' @param Jsegments A \code{DNAStringSet} with the J segments trimmed from the Phe codon.
#' @param numCores Number of cores to execute the function in parallel.
#' @param gapOpening Penalty for opening a gap in the alignment.
#' @return A \code{data.frame} with V and J alignment information and HVR sequence, quality and reading frame.
results <- dplyr::bind_rows(mclapply(mc.cores = numCores, X = 1:length(reads), FUN = function(i){
read <- reads[i]
# Get the read ID
read_id <- names(read)
#########################################
############## V ALIGNMENT ##############
#########################################
### Remove IMGT gaps before alignment to avoid them affecting the score ###
Vsegments_nogaps <- DNAStringSet(gsub("\\.", "", Vsegments))
names(Vsegments_nogaps) <- names(Vsegments)
### Perform 147 V alignments and get the score only (NO IMGT GAPS) ###
v_alignment1 <- pairwiseAlignment(pattern = Vsegments_nogaps,
subject = read,
type = "global-local",
scoreOnly = TRUE,
gapOpening = gapOpening)
### Select the segment with the best score ###
best_v <- Vsegments[which.max(v_alignment1)]
v_match <- names(best_v)
### Perform a complete pairwise alignment with the best match (W/ IMGT GAPS, for Cys location) ###
v_alignment2 <- pairwiseAlignment(pattern = best_v,
subject = read,
type = "global-local",
scoreOnly = FALSE,
gapOpening = gapOpening)
### Perform a complete pairwise alignment with the best match (NO IMGT GAPS) ###
best_v_nogaps <- DNAString(gsub("\\.", "", best_v))
v_alignment3 <- pairwiseAlignment(pattern = best_v_nogaps,
subject = read,
type = "global-local",
scoreOnly = FALSE,
gapOpening = gapOpening)
### Get V alignment information ###
v_score <- score(v_alignment3) # Score of alignment with no gaps
v_identity <- pid(v_alignment3, type = "PID3")
v_start <- start(Views(v_alignment3))
v_end <- end(Views(v_alignment3))
if (v_score < 0) { # If V score is negative, Cys check nor J alignment are performed
cys_check <- NA
cys_status <- NA
j_match <- NA
j_score <- NA
j_identity <- NA
j_start <- NA
j_end <- NA
phe_check <- NA
read_cys_start <- NA
HVR_seq <- NA
HVR_qual <- NA
HVR_frame <- NA
} else { # Only find Cys and align J if V score is positive
#########################################
####### FIND CYS, HANDLE INDELS #########
#########################################
# Cys coordinates in the alignment, ideally
cys_start <- 310
cys_end <- 312
# If there is any insertion in the read, Cys coordinates will be shifted in the alignment
v_ins <- unlist(deletion(v_alignment2)) # IRanges with V insertions
alg_cys_start <- cys_start + sum(width(v_ins[start(v_ins) <= cys_start])) # Cys start in alignment
alg_cys_end <- cys_end + sum(width(v_ins[start(v_ins) <= cys_end])) # Cys end in alignment
message(alg_cys_start, " ", alg_cys_end)
# Find Cys coordinates in the read
v_del <- unlist(insertion(v_alignment2)) # IRanges with deletions in subject (mostly IMGT gaps)
print(v_del)
num_v_del <- length(v_del) # Number of deletions (of any length)
shift <- c(0, cumsum(width(v_del))[-num_v_del])
v_del_sft <- shift(v_del, shift) # Fix coordinates
subject_gaps <- sum(width(v_del_sft[end(v_del_sft) < alg_cys_start])) + nchar(str_extract(alignedSubject(v_alignment2), "^-*")) # Number of gaps in subject before Cys in the alignment. The "nchar" term takes into account the gaps at the beginning of the subject, which are not considered by the "insertion" function.
read_cys_start <- v_start + alg_cys_start - subject_gaps -1 # Cys start in the read
message(read_cys_start)
# Get aligned Cys sequences in pattern and subject
cys_subject <- subseq(alignedSubject(v_alignment2), start = alg_cys_start, end = alg_cys_end) # Cys sequence in subject
message(cys_subject)
gap_subject <- unlist(gregexpr(pattern = "-", text = cys_subject)) # Gap position in subject (deletion), if any
cys_pattern <- subseq(alignedPattern(v_alignment2), start = alg_cys_start, end = alg_cys_end) # Cys sequence in pattern
gap_pattern <- unlist(gregexpr(pattern = "-", text = cys_pattern)) # Gap position in pattern (insertion), if any
# Check Cys sequence in subject
if (cys_subject %in% c("TGT", "TGC")) { # Cys codon found
cys_check <- TRUE
if (cys_subject == cys_pattern) { # Same Cys codon in pattern and subject
cys_status <- "Match"
} else { # Different Cys codon in pattern and subject
cys_status <- paste0("Pseudomatch (", cys_pattern," > ", cys_subject, ")")
}
} else if (all(gap_pattern > 0)) { # Insertion in subject, fix it by removing it
cys_status <- paste0("Insertion (", cys_pattern," > ", cys_subject, "), removed")
read_ins_pos <- read_cys_start + as.numeric(gap_pattern) -1 # Insertion position in read
# Delete insertion in read
fix_read <- as.character(read)
fix_qual <- quality(read)
stringr::str_sub(fix_read, read_ins_pos[1], read_ins_pos[length(read_ins_pos)]) <- "" # Remove Cys insertion in the read
stringr::str_sub(fix_qual, read_ins_pos[1], read_ins_pos[length(read_ins_pos)]) <- "" # Remove Cys insertion in the quality string
fix_qual <- as(fix_qual, new.quality.class) # Convert to PhredQuality class
read_id <- names(read)
read <- QualityScaledDNAStringSet(fix_read, fix_qual) # Rebuild QualityScaledDNAStringSet without Cys insertion
names(read) <- read_id
cys_codon <- subseq(read, start = read_cys_start, end = read_cys_start + 2)
cys_check <- ifelse(cys_codon %in% c("TGT", "TGC"), TRUE, FALSE)
} else if (all(gap_subject > 0)) { # Deletion in subject
cys_status <- paste0("Deletion (", cys_pattern," > ", cys_subject, ")")
cys_check <- FALSE
} else { # Mismatch
cys_status <- paste0("Mismatch (", cys_pattern," > ", cys_subject, ")")
cys_check <- FALSE
}
#########################################
############## J ALIGNMENT ##############
#########################################
### Remove V match from the read
remaining <- subseq(read, start = read_cys_start + 3) # Align J from Cys codon (not included)
### Perform 16 J alignments and get the score only
j_alignment1 <- pairwiseAlignment(pattern = Jsegments,
subject = remaining,
type = "global-local",
scoreOnly = TRUE,
gapOpening = gapOpening)
### Select the segment with the best score
best_j <- Jsegments[which.max(j_alignment1)]
j_match <- names(best_j)
### Perform a complete pairwise alignment with the best match
j_alignment2 <- pairwiseAlignment(pattern = best_j,
subject = remaining,
type = "global-local",
scoreOnly = FALSE,
gapOpening = gapOpening)
### Get J alignment information
j_score <- score(j_alignment2)
j_identity <- pid(j_alignment2, type = "PID3")
j_start <- start(Views(j_alignment2)) + read_cys_start + 2 # Get the coordinates **in the read**, not in the sequence without the V match
j_end <- end(Views(j_alignment2)) + read_cys_start + 2
### Check if the first codon is a Phe
phe_codon <- subseq(read, start = j_start, end = j_start + 2)
phe_check <- phe_codon %in% c("TTT", "TTC")
#########################################
############## HVR LOCATION #############
#########################################
### Get HVR sequence and sequencing quality
HVR <- subseq(read, start = read_cys_start, end = j_start + 2)
HVR_seq <- as.character(HVR)
HVR_qual <- as.character(quality(HVR))
# Find out HVR frame (0: Cys and Phe in frame; 1 OR 2: out of frame)
HVR_frame <- nchar(HVR_seq) %% 3
if (HVR_frame != 0) { # If the remainder is 1, frame = 2 and viceversa
HVR_frame <- ifelse(HVR_frame == 1, 2, 1)
}
}
#########################################
################# OUTPUT ################
#########################################
df <- data.frame(
read_ID = read_id,
read_length = width(read),
V_match = v_match,
V_score = v_score,
V_identity = v_identity,
V_start = v_start,
Cys_check = cys_check,
Cys_status = cys_status,
J_match = j_match,
J_score = j_score,
J_identity = j_identity,
J_end = j_end,
Phe_check = phe_check,
HVR_start = read_cys_start,
HVR_end = j_start + 2,
HVR_sequence = HVR_seq,
HVR_quality = HVR_qual,
HVR_frame = HVR_frame
) %>% mutate(read_ID = as.character(read_ID),
V_match = as.character(V_match),
J_match = as.character(J_match),
Cys_status = as.character(Cys_status),
HVR_sequence = as.character(HVR_sequence),
HVR_quality = as.character(HVR_quality))
return(df)
}))
}
Load input:
v_segments
and j_segments
: both are DNAStringset
that contains the reference sequences for V (147 sequences: all 64 segments + alleles) and J segments (16 sequences).examples
: QualityScaledDNAStringSet
that contains three example reads and their sequencing qualities. The reads have been previously reverse-complemented (!!!).###############################################################################################
####################### CONVERT .fastq TO QualityScaledDNAStringSet ###########################
###############################################################################################
# curesim_fq <- ShortRead::readFastq("software/CuReSim/output/00_first_test.fastq") # Read .fq file and store in a ShortReadQ class object
# reads_sim <- ShortRead::sread(curesim_fq) # DNAStringSet with the reads
# quality_sim <- quality(curesim_fq) # BStringSet with the sequencing qualities
#
# new.quality.class <- switch(class(quality_sim), # Convert from BStringSet to XStringQuality
# SFastqQuality="SolexaQuality",
# FastqQuality="PhredQuality",
# "XStringQuality")
# quality_sim <- as(quality_sim, new.quality.class)
#
# sample_sim <- QualityScaledDNAStringSet(reads_sim, quality_sim) # Get QualityScaledDNAStringSet to use as a pairwiseAlignment input
# names(sample_sim) <- as.character(curesim_fq@id) # Character vector with the reads IDs
#
# sample_sim_rc <- reverseComplement(sample_sim) # V-CDR3-J
(!!!) ImmuneSIM-generated reads have the following structure: V-CDR3-J. To mimic the sequencing process of our collaborators, before CuReSim, reads need to be reverse-complemented (J-CDR3-V), so that CDR3 lies at the beginning of the read, where sequencing quality is better. However, to perform V and J alignment, the reads must be reverse-complemented again (V-CDR3-J) for the alignment to perform correctly. Reverse complement can be performed with Biostrings::reverseComplement()
function.
# Load reference sequences (Class: DNAStringSet)
v_segments <- readRDS("v_segments.rds")
j_segments <- readRDS("j_segments_phe.rds")
# Load sample (Class: QualityScaledDNAStringSet)
examples <- readRDS("example_reads.rds")
# Run alignment
start_time <- Sys.time()
vj_alg <- vj_alignment(reads = examples,
Vsegments = v_segments,
Jsegments = j_segments,
numCores = detectCores() - 4) ### Alignment parallelization (alignment of 1 read vs all reference sequences per thread)
end_time <- Sys.time()
# Show results
DT::datatable(vj_alg,
class = 'nowrap compact order-column',
extensions = c('Buttons', 'FixedColumns'),
options = list(dom = 'Bfrtip',
buttons = c('csv', 'excel'),
scrollX = TRUE,
fixedColumns = TRUE))
end_time - start_time
## Time difference of 2.013929 secs
vj_align_vis
functionvj_align_vis <- function(alignmentOutput, readsDSS, vseg = v_segments, jseg = j_segments, whichSegment, gapOpening = 1, seqLogo = FALSE, widgHeight = 100){
row <- alignmentOutput
segment <- paste0(whichSegment, "_match")
read <- readsDSS[row[1, "read_ID"]]
match <- row[1, segment]
HVR_start <- row[1, "HVR_start"]
if (whichSegment == "V") {
best <- vseg[match]
} else {
best <- jseg[match]
read <- subseq(read, start = HVR_start + 3)
}
alg <- pairwiseAlignment(pattern = best,
subject = read,
type = "global-local",
scoreOnly = FALSE,
gapOpening = gapOpening)
seq <- DNAStringSet(c(alignedSubject(alg), alignedPattern(alg)))
widget <- msaR(msa = seq,
overviewbox = FALSE,
seqlogo = seqLogo,
alignmentHeight = 100,
height = widgHeight)
return(widget)
}
To easily see sequence conservation enable sequence logo (Vis.elements > Show sequence logo).
rownum <- 1 # Row of the vj_alg dataframe to visualize
V alignment
vj_align_vis(alignmentOutput = vj_alg[rownum,],
readsDSS = examples,
whichSegment = "V")
J alignment
vj_align_vis(alignmentOutput = vj_alg[rownum,],
readsDSS = examples,
whichSegment = "J")
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] dplyr_1.0.2 stringr_1.4.0 msa_1.16.0
## [4] msaR_0.3.0 Biostrings_2.52.0 XVector_0.24.0
## [7] IRanges_2.18.3 S4Vectors_0.22.1 BiocGenerics_0.30.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.5 compiler_3.6.1 pillar_1.4.6 tools_3.6.1
## [5] zlibbioc_1.30.0 digest_0.6.27 jsonlite_1.7.1 evaluate_0.14
## [9] lifecycle_0.2.0 tibble_3.0.4 nlme_3.1-140 lattice_0.20-38
## [13] pkgconfig_2.0.3 rlang_0.4.8 crosstalk_1.1.0.1 yaml_2.2.1
## [17] xfun_0.19 knitr_1.30 generics_0.1.0 vctrs_0.3.4
## [21] htmlwidgets_1.5.2 grid_3.6.1 DT_0.16 tidyselect_1.1.0
## [25] glue_1.4.2 R6_2.5.0 rmarkdown_2.5 purrr_0.3.4
## [29] magrittr_1.5 htmltools_0.5.0 ellipsis_0.3.1 ape_5.4-1
## [33] stringi_1.5.3 crayon_1.3.4