Align altered sequence with the reference sequence

This commit is contained in:
coolneng 2021-06-23 18:28:34 +02:00
parent 0912600fdc
commit c9466baa68
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
2 changed files with 15 additions and 3 deletions

View File

@ -1,4 +1,4 @@
# Alphabet of symbols that can appear in a sequence.
# NOTE(review): "-" was added in this commit, presumably so gap characters
# produced by the alignment step can be encoded — confirm against encode_sequence.
BASES = "ACGT-"
# Locations of the serialized TFRecord datasets.
TRAIN_DATASET = "data/train_data.tfrecords"
TEST_DATASET = "data/test_data.tfrecords"
EVAL_DATASET = "data/eval_data.tfrecords"

View File

@ -1,5 +1,6 @@
from typing import List, Tuple from typing import List, Tuple
from Bio.pairwise2 import align
from Bio.SeqIO import parse from Bio.SeqIO import parse
from numpy.random import random from numpy.random import random
from tensorflow import Tensor, int64 from tensorflow import Tensor, int64
@ -10,13 +11,24 @@ from tensorflow.train import Example, Feature, Features, Int64List
from constants import * from constants import *
def align_sequences(sequence, label) -> Tuple[str, str]:
    """
    Globally align the altered sequence with the reference sequence so both outputs have the same length

    Returns the pair (aligned sequence, aligned label), in that order.
    Raises IndexError when the aligner returns no alignment at all.
    """
    alignments = align.globalxx(label, sequence)
    best_alignment = alignments[0]
    # The alignment tuple is ordered like the arguments of the call above:
    # the label comes first, the sequence second. Unpack in that order so the
    # two strings are not swapped on return.
    aligned_label, aligned_seq, _score, _begin, _end = best_alignment
    return aligned_seq, aligned_label
def generate_example(sequence, label) -> bytes:
    """
    Create a binary-string for each sequence containing the aligned sequence and its aligned label
    """
    # Align first so both features are built from the aligned strings.
    aligned_seq, aligned_label = align_sequences(sequence, label)
    schema = {
        "sequence": Feature(int64_list=Int64List(value=encode_sequence(aligned_seq))),
        "label": Feature(int64_list=Int64List(value=encode_sequence(aligned_label))),
    }
    example = Example(features=Features(feature=schema))
    return example.SerializeToString()