diff --git a/src/alignment.r b/src/alignment.r index d5f3e17..f4bbee4 100644 --- a/src/alignment.r +++ b/src/alignment.r @@ -1,27 +1,15 @@ library(Biostrings) library(parallel) -construct_hvr_sequence <- function(data) { - vdj_dataframe <- as.data.frame(data) - hvr_sequence <- paste(vdj_dataframe$v_sequence, - vdj_dataframe$d_sequence, - vdj_dataframe$j_sequence, - sep = "" - ) - vdj_dataframe$hvr <- hvr_sequence - return(vdj_dataframe) -} - parse_data <- function(files) { reversed_sequences <- Biostrings::readQualityScaledDNAStringSet(files[1]) sequences <- Biostrings::reverseComplement(reversed_sequences) - vdj_alignment <- read.csv(files[2]) - vdj_dataframe <- construct_hvr_sequence(vdj_alignment) + vdj_metadata <- read.csv(files[2]) vj_segments <- union( readRDS("data/v_segments.rds"), readRDS("data/j_segments_phe.rds") ) - return(list(sequences, vj_segments, vdj_dataframe)) + return(list(sequences, vj_segments, vdj_metadata)) } get_vj_sequence <- function(identifier, names, sequences) { @@ -41,7 +29,7 @@ construct_full_sequences <- function(vdj_segments, metadata) { vdj_segments, FUN = get_vj_sequence ) - full_sequence <- paste(v_sequences, metadata$hvr, j_sequences, sep = "") + full_sequence <- paste(v_sequences, metadata$junction, j_sequences, sep = "") return(Biostrings::DNAStringSet(full_sequence)) } @@ -64,11 +52,11 @@ perform_alignment <- function(sequences, vdj_segments, metadata) { return(sequence_alignment) } -input_files <- c("data/curesim_sequence.fastq", "data/vdj_alignment.csv") -data <- parse_data(input_files) +input_files <- c("data/curesim_sequence.fastq", "data/vdj_metadata.csv") +data <- parse_data(files = input_files) alignment <- perform_alignment( sequences = data[[1]], vdj_segments = data[[2]], metadata = data[[3]] ) -print(alignment) +print(alignment) \ No newline at end of file diff --git a/src/repertoire.r b/src/repertoire.r index cb60947..8904ad7 100644 --- a/src/repertoire.r +++ b/src/repertoire.r @@ -14,15 +14,11 @@ save_data <- function(data, reads) { Biostrings::writeXStringSet(data$sequence, "data/sequence.fasta") vdj_sequences <- data[-1] amplified_vdj <- vdj_sequences[rep(seq_len(nrow(vdj_sequences)), reads), ] - write.csv(amplified_vdj, "data/vdj_alignment.csv", row.names = FALSE) + write.csv(amplified_vdj, "data/vdj_metadata.csv", row.names = FALSE) } process_data <- function(repertoire, reads) { - columns <- c( - "sequence", "v_sequence_alignment", - "d_sequence_alignment", "j_sequence_alignment", - "v_call", "j_call" - ) + columns <- c("sequence", "junction", "v_call", "j_call") data <- repertoire[, columns] dna_sequence <- Biostrings::DNAStringSet(data$sequence) data$sequence <- Biostrings::reverseComplement(dna_sequence)