diff --git a/src/constants.py b/src/constants.py
index e4c3746..1aaf633 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -1,4 +1,4 @@
-BASES = "ACGT"
+BASES = "ACGT-"
 TRAIN_DATASET = "data/train_data.tfrecords"
 TEST_DATASET = "data/test_data.tfrecords"
 EVAL_DATASET = "data/eval_data.tfrecords"
diff --git a/src/preprocessing.py b/src/preprocessing.py
index 9912dee..d4a8734 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -1,5 +1,6 @@
 from typing import List, Tuple
 
+from Bio.pairwise2 import align
 from Bio.SeqIO import parse
 from numpy.random import random
 from tensorflow import Tensor, int64
@@ -10,13 +11,31 @@ from tensorflow.train import Example, Feature, Features, Int64List
 from constants import *
 
 
+def align_sequences(sequence, label) -> Tuple[str, str]:
+    """
+    Align the altered sequence with the reference sequence to obtain a same length output
+
+    :return: the aligned `sequence` and the aligned `label`, in that order
+    """
+    # pairwise2 alignments are (seqA, seqB, score, begin, end) tuples in call order,
+    # so the aligned label comes first. Only the first alignment is used, hence
+    # one_alignment_only avoids enumerating the other equally-scored ones.
+    alignments = align.globalxx(label, sequence, one_alignment_only=True)
+    # NOTE(review): pairwise2 seems to return an empty list for empty input, which
+    # would raise IndexError here - confirm callers never pass empty sequences.
+    aligned_label, aligned_seq, _, _, _ = alignments[0]
+    return aligned_seq, aligned_label
+
+
 def generate_example(sequence, label) -> bytes:
     """
-    Create a binary-string for each sequence containing the sequence and the bases' counts
+    Create a binary-string for each sequence containing the aligned sequence and label
     """
+    # Align first so that both encoded features have the same length
+    aligned_seq, aligned_label = align_sequences(sequence, label)
     schema = {
-        "sequence": Feature(int64_list=Int64List(value=encode_sequence(sequence))),
-        "label": Feature(int64_list=Int64List(value=encode_sequence(label))),
+        "sequence": Feature(int64_list=Int64List(value=encode_sequence(aligned_seq))),
+        "label": Feature(int64_list=Int64List(value=encode_sequence(aligned_label))),
     }
     example = Example(features=Features(feature=schema))
     return example.SerializeToString()