diff --git a/src/alignment.r b/src/alignment.r index 25facbb..7175e86 100644 --- a/src/alignment.r +++ b/src/alignment.r @@ -1,36 +1,39 @@ library(Biostrings) library(parallel) -parse_data <- function(files) { - reversed_sequences <- Biostrings::readQualityScaledDNAStringSet(files[1]) +parse_data <- function(file) { + reversed_sequences <- Biostrings::readQualityScaledDNAStringSet(file) sequences <- Biostrings::reverseComplement(reversed_sequences) - vdj_metadata <- read.csv(files[2]) vj_segments <- union( readRDS("data/v_segments.rds"), readRDS("data/j_segments_phe.rds") ) - return(list(sequences, vj_segments, vdj_metadata)) + return(list(sequences, vj_segments)) } -get_vj_sequence <- function(identifier, names, sequences) { - matches <- grep(names, pattern = identifier) +match_id_sequence <- function(names, sequences, id) { + matches <- grep(names, pattern = id) row <- matches[1] return(as.character(sequences[row])) } -construct_full_sequences <- function(vdj_segments, metadata) { - v_sequences <- lapply(metadata$v_call, +get_vj_sequence <- function(sequences, names, vdj_segments) { + metadata <- unlist(strsplit(sequences, split = " ")) + v_identifier <- metadata[2] + j_identifier <- metadata[3] + v_sequence <- match_id_sequence(names, vdj_segments, id = v_identifier) + j_sequence <- match_id_sequence(names, vdj_segments, id = j_identifier) + return(c(v_sequence, j_sequence)) +} + +fetch_vj_sequences <- function(vdj_segments, sequences) { + vj_sequences <- mclapply(names(sequences), names(vdj_segments), vdj_segments, - FUN = get_vj_sequence + FUN = get_vj_sequence, + mc.cores = detectCores() ) - j_sequences <- lapply(metadata$j_call, - names(vdj_segments), - vdj_segments, - FUN = get_vj_sequence - ) - full_sequence <- paste(v_sequences, metadata$junction, j_sequences, sep = "") - return(Biostrings::DNAStringSet(full_sequence)) + return(c(vj_sequences[1], vj_sequences[2])) } align_sequence <- function(sequence, vdj_segment) { @@ -38,25 +41,24 @@ align_sequence <- function(sequence, vdj_segment) { subject = sequence, pattern = vdj_segment, type = "global-local", - gapOpening = 1, + gapOpening = 1 )) } -perform_alignment <- function(sequences, vdj_segments, metadata) { - vj_sequences <- construct_full_sequences(vdj_segments, metadata) - sequence_alignment <- mcmapply(vj_sequences, - vdj_segments, - FUN = align_sequence, - mc.cores = detectCores() +get_hvr_sequences <- function(sequences, vdj_segments) { + vj_sequences <- fetch_vj_sequences(vdj_segments, sequences) + v_alignment <- parallel::mcmapply(sequences, + vj_sequences[1], + FUN = align_sequence + ) + j_alignment <- parallel::mcmapply(sequences, + vj_sequences[2], + FUN = align_sequence ) - return(sequence_alignment) } -input_files <- c("data/curesim_sequence.fastq", "data/vdj_metadata.csv") -data <- parse_data(files = input_files) -alignment <- perform_alignment( +data <- parse_data(file = "data/curesim_sequence.fastq") +hvr_sequences <- get_hvr_sequences( sequences = data[[1]], - vdj_segments = data[[2]], - metadata = data[[3]] -) -print(alignment) \ No newline at end of file + vdj_segments = data[[2]] +) \ No newline at end of file