Align altered sequence with the reference sequence

This commit is contained in:
coolneng 2021-06-23 18:28:34 +02:00
parent 0912600fdc
commit c9466baa68
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
2 changed files with 15 additions and 3 deletions

View File

@ -1,4 +1,4 @@
# Alphabet of symbols that may appear in a sequence: the four nucleotide
# bases plus "-", presumably the gap symbol introduced by sequence alignment.
BASES = "ACGT-"
# Paths of the TFRecord files holding the train / test / evaluation splits.
TRAIN_DATASET = "data/train_data.tfrecords"
TEST_DATASET = "data/test_data.tfrecords"
EVAL_DATASET = "data/eval_data.tfrecords"

View File

@ -1,5 +1,6 @@
from typing import List, Tuple
from Bio.pairwise2 import align
from Bio.SeqIO import parse
from numpy.random import random
from tensorflow import Tensor, int64
@ -10,13 +11,24 @@ from tensorflow.train import Example, Feature, Features, Int64List
from constants import *
def align_sequences(sequence, label) -> Tuple[str, str]:
    """
    Align the altered sequence with the reference sequence to obtain a same length output

    Performs a global pairwise alignment (match score 1, no mismatch or gap
    penalties) and keeps the first alignment returned.

    Returns a tuple ``(aligned_sequence, aligned_label)``.

    NOTE(review): ``alignments[0]`` raises IndexError if the aligner returns
    no alignments (presumably the case for an empty input) - confirm whether
    callers can pass empty sequences.
    """
    alignments = align.globalxx(label, sequence)
    best_alignment = alignments[0]
    # pairwise2 alignment tuples are (seqA, seqB, score, begin, end), with
    # seqA/seqB in the order the inputs were passed to globalxx. The label is
    # passed first, so the first element is the aligned label.
    aligned_label, aligned_seq, _, _, _ = best_alignment
    return aligned_seq, aligned_label
def generate_example(sequence, label) -> bytes:
    """
    Create a binary-string for each sequence containing the aligned sequence and its aligned label

    The sequence and label are first aligned with each other, then each is
    encoded with ``encode_sequence`` and stored as an int64 list feature under
    the keys "sequence" and "label". The resulting Example is returned in its
    serialized form.
    """
    aligned_seq, aligned_label = align_sequences(sequence, label)
    # Only the aligned versions are stored; each key appears exactly once so
    # no entry is silently overwritten.
    schema = {
        "sequence": Feature(int64_list=Int64List(value=encode_sequence(aligned_seq))),
        "label": Feature(int64_list=Int64List(value=encode_sequence(aligned_label))),
    }
    example = Example(features=Features(feature=schema))
    return example.SerializeToString()