From 38b35f7d12a2765ad5afc2fb63d4208d015d15f3 Mon Sep 17 00:00:00 2001 From: coolneng Date: Wed, 7 Apr 2021 18:31:39 +0200 Subject: [PATCH] Align full sequences efficiently --- src/alignment.r | 56 +++++++++++++++++++++++++++++++++++++----------- src/repertoire.r | 3 ++- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/src/alignment.r b/src/alignment.r index 596c06a..50cc1ff 100644 --- a/src/alignment.r +++ b/src/alignment.r @@ -1,14 +1,14 @@ library(Biostrings) library(parallel) -construct_dataframe <- function(data) { - vdj_string_set <- lapply(data, FUN = Biostrings::DNAStringSet) - vdj_dataframe <- as.data.frame(vdj_string_set) - vdj_dataframe$hvr_region <- paste(vdj_dataframe$v_sequence, +construct_hvr_sequence <- function(data) { + vdj_dataframe <- as.data.frame(data) + hvr_sequence <- paste(vdj_dataframe$v_sequence, vdj_dataframe$d_sequence, vdj_dataframe$j_sequence, sep = "" ) + vdj_dataframe$hvr <- hvr_sequence return(vdj_dataframe) } @@ -16,8 +16,33 @@ parse_data <- function(files) { reversed_sequences <- Biostrings::readQualityScaledDNAStringSet(files[1]) sequences <- Biostrings::reverseComplement(reversed_sequences) vdj_alignment <- read.csv(files[2]) - vdj_dataframe <- construct_dataframe(vdj_alignment) - return(list(sequences, vdj_dataframe)) + vdj_dataframe <- construct_hvr_sequence(vdj_alignment) + vj_segments <- union( + readRDS("data/v_segments.rds"), + readRDS("data/j_segments_phe.rds") + ) + return(list(sequences, vj_segments, vdj_dataframe)) +} + +# TODO Test if grep can return more than one match +get_vj_sequence <- function(identifier, names, sequences) { + row <- grep(names, pattern = identifier) + return(as.character(sequences[row])) +} + +construct_full_sequences <- function(vdj_segments, metadata) { + v_sequences <- lapply(metadata$v_call, + names(vdj_segments), + vdj_segments, + FUN = get_vj_sequence + ) + j_sequences <- lapply(metadata$j_call, + names(vdj_segments), + vdj_segments, + FUN = get_vj_sequence + ) + full_sequence <- paste(v_sequences, metadata$hvr, j_sequences, sep = "") + return(Biostrings::DNAStringSet(full_sequence)) } align_sequence <- function(sequence, vdj_segment) { @@ -25,20 +50,25 @@ align_sequence <- function(sequence, vdj_segment) { pattern = sequence, subject = vdj_segment, type = "global-local", - gapOpening = 1 + gapOpening = 1, )) } -perform_alignment <- function(sequences, vdj_segments) { - sequence_alignment <- mcmapply(sequences, - vdj_segments$hvr_region, +perform_alignment <- function(sequences, vdj_segments, metadata) { + vj_sequences <- construct_full_sequences(vdj_segments, metadata) + sequence_alignment <- mcmapply(vj_sequences, + vdj_segments, FUN = align_sequence, - mc.cores = 4 + mc.cores = detectCores() ) return(sequence_alignment) } input_files <- c("data/curesim_sequence.fastq", "data/vdj_alignment.csv") data <- parse_data(input_files) -alignment <- perform_alignment(sequences = data[[1]], vdj_segments = data[[2]]) -print(alignment) \ No newline at end of file +alignment <- perform_alignment( + sequences = data[[1]], + vdj_segments = data[[2]], + metadata = data[[3]] +) +print(alignment) diff --git a/src/repertoire.r b/src/repertoire.r index c8107e6..cb60947 100644 --- a/src/repertoire.r +++ b/src/repertoire.r @@ -20,7 +20,8 @@ save_data <- function(data, reads) { process_data <- function(repertoire, reads) { columns <- c( "sequence", "v_sequence_alignment", - "d_sequence_alignment", "j_sequence_alignment" + "d_sequence_alignment", "j_sequence_alignment", + "v_call", "j_call" ) data <- repertoire[, columns] dna_sequence <- Biostrings::DNAStringSet(data$sequence)