From 5afe040592cdb943b955022ed452e5bbb863c919 Mon Sep 17 00:00:00 2001 From: coolneng Date: Mon, 3 May 2021 21:15:40 +0200 Subject: [PATCH] Isolate HVR sequence and save it to a file --- shell.nix | 1 + src/alignment.r | 49 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/shell.nix b/shell.nix index 576771b..4624240 100644 --- a/shell.nix +++ b/shell.nix @@ -7,6 +7,7 @@ mkShell { R rPackages.immuneSIM rPackages.Biostrings + rPackages.stringr jdk # Development tools rPackages.languageserver diff --git a/src/alignment.r b/src/alignment.r index 11f4808..83d8eeb 100644 --- a/src/alignment.r +++ b/src/alignment.r @@ -50,26 +50,51 @@ align_sequence <- function(sequence, vdj_segment) { )) } -locate_cysteine <- function(sequence, v_segment) { - codons <- Biostrings::DNAStringSet(c("TGT", "TGC")) - matches <- Biostrings::matchPDict( - pdict = codons, - subject = DNAString(toString(v_segment)) - ) - position <- as.data.frame(intersect(matches[[1]], matches[[2]])) - return(position) +handle_indels <- function(insertion, deletion, cys, alignment) { + ins_start <- sum(Biostrings::width(deletion[start(deletion) <= cys$start])) + ins_end <- sum(Biostrings::width(deletion[end(deletion) <= cys$end])) + shift_num <- c(0, cumsum(Biostrings::width(insertion))[-length(ins_start)]) + shifted_ins <- IRanges::shift(insertion, shift_num) + gaps <- sum(width(shifted_ins[end(shifted_ins) < cys$start + ins_start])) + + nchar(stringr::str_extract(alignedSubject(alignment), "^-*")) + return(list("start" = ins_start - gaps, "end" = ins_end - gaps)) } -# TODO Extract CDR3 +get_cys_coordinates <- function(alignment) { + cys <- list("start" = 310, "end" = 312) + insertion <- unlist(Biostrings::insertion(alignment)) + deletion <- unlist(Biostrings::deletion(alignment)) + delta_coordinates <- handle_indels(insertion, deletion, cys, alignment) + cys_start <- cys$start + delta_coordinates$start + cys_end <- cys$end + delta_coordinates$end + return(list("start" = cys_start, "end" = cys_end)) +} + +# TODO Refactor this mess get_hvr_sequences <- function(sequences, vdj_segments) { df <- fetch_vj_sequences(sequences, vdj_segments) v_alignment <- parallel::mcmapply(sequences, df$v_seq, FUN = align_sequence) - hvr_start <- parallel::mcmapply(sequences, v_alignment, FUN = locate_cysteine) - hvr_start_df <- as.data.frame(t(hvr_start)) + j_alignment <- parallel::mcmapply(sequences, df$j_seq, FUN = align_sequence) + cys_coordinates <- parallel::mclapply(v_alignment, FUN = get_cys_coordinates) + cys_df <- as.data.frame(do.call(rbind, cys_coordinates)) + j_start <- parallel::mclapply( + j_alignment, + function(x) start(Biostrings::Views(x)) + ) + hvr <- Biostrings::subseq(sequences, + start = unlist(cys_df$start), + end = unlist(j_start) + 2 + ) + return(hvr) +} + +save_data <- function(data) { + Biostrings::writeXStringSet(data, "data/CuReSim-HVR.fastq", format = "fastq") } data <- parse_data(file = "data/curesim_sequence.fastq") -hvr_sequences <- get_hvr_sequences( +hvr <- get_hvr_sequences( sequences = data[[1]], vdj_segments = data[[2]] ) +Biostrings::writeXStringSet(hvr, "data/CuReSim-HVR.fastq", format = "fastq") \ No newline at end of file