diff --git a/src/preprocessing.py b/src/preprocessing.py
index 74d4cca..e86eeac 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -22,19 +22,6 @@ def align_sequences(sequence, label) -> Tuple[str, str]:
     return aligned_seq, aligned_label
 
 
-def generate_example(sequence, label) -> bytes:
-    """
-    Create a binary-string for each sequence containing the sequence and the bases' counts
-    """
-    aligned_seq, aligned_label = align_sequences(sequence, label)
-    schema = {
-        "sequence": Feature(int64_list=Int64List(value=encode_sequence(aligned_seq))),
-        "label": Feature(int64_list=Int64List(value=encode_sequence(aligned_label))),
-    }
-    example = Example(features=Features(feature=schema))
-    return example.SerializeToString()
-
-
 def encode_sequence(sequence) -> List[int]:
     """
     Encode the DNA sequence using the indices of the BASES constant
@@ -43,6 +30,29 @@ def encode_sequence(sequence) -> List[int]:
     return encoded_sequence
 
 
+def prepare_sequences(sequence, label) -> Tuple[List[int], List[int]]:
+    """
+    Align the sequence with its label and encode both, in order to obtain outputs
+    that can be batched; returns the encoded sequence followed by the encoded label
+    """
+    aligned_seq, aligned_label = align_sequences(sequence, label)
+    return encode_sequence(aligned_seq), encode_sequence(aligned_label)
+
+
+def generate_example(sequence, label) -> bytes:
+    """
+    Create a binary-string for each sequence: a serialized Example holding the
+    encoded sequence and the encoded label as int64 features
+    """
+    processed_seq, processed_label = prepare_sequences(sequence, label)
+    schema = {
+        "sequence": Feature(int64_list=Int64List(value=processed_seq)),
+        "label": Feature(int64_list=Int64List(value=processed_label)),
+    }
+    example = Example(features=Features(feature=schema))
+    return example.SerializeToString()
+
+
 def read_fastq(hyperparams) -> List[bytes]:
     """
     Parses a data and a label FASTQ files and generates a List of serialized Examples