diff --git a/src/constants.py b/src/constants.py
index e4c3746..1aaf633 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -1,4 +1,4 @@
-BASES = "ACGT"
+BASES = "ACGT-"  # "-" is the gap symbol; pairwise2 alignment pads sequences with it by default
 TRAIN_DATASET = "data/train_data.tfrecords"
 TEST_DATASET = "data/test_data.tfrecords"
 EVAL_DATASET = "data/eval_data.tfrecords"
diff --git a/src/preprocessing.py b/src/preprocessing.py
index 9912dee..d4a8734 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -1,5 +1,6 @@
 from typing import List, Tuple
 
+from Bio.pairwise2 import align
 from Bio.SeqIO import parse
 from numpy.random import random
 from tensorflow import Tensor, int64
@@ -10,13 +11,24 @@ from tensorflow.train import Example, Feature, Features, Int64List
 from constants import *
 
 
+def align_sequences(sequence, label) -> Tuple[str, str]:
+    """
+    Globally align the altered sequence with its reference label so both have the same length.
+    Returns (aligned_sequence, aligned_label); gaps are filled by pairwise2 ("-" by default).
+    """
+    # globalxx returns (seqA, seqB, score, begin, end) tuples in argument order: label first.
+    aligned_label, aligned_seq, *_ = align.globalxx(label, sequence)[0]
+    return aligned_seq, aligned_label
+
+
 def generate_example(sequence, label) -> bytes:
     """
-    Create a binary-string for each sequence containing the sequence and the bases' counts
+    Align sequence and label, then serialize both as int64 features of a binary tf.train.Example
     """
+    aligned_seq, aligned_label = align_sequences(sequence, label)
     schema = {
-        "sequence": Feature(int64_list=Int64List(value=encode_sequence(sequence))),
-        "label": Feature(int64_list=Int64List(value=encode_sequence(label))),
+        "sequence": Feature(int64_list=Int64List(value=encode_sequence(aligned_seq))),
+        "label": Feature(int64_list=Int64List(value=encode_sequence(aligned_label))),
     }
     example = Example(features=Features(feature=schema))
     return example.SerializeToString()