Align altered sequence with the reference sequence
This commit is contained in:
parent
0912600fdc
commit
c9466baa68
|
@ -1,4 +1,4 @@
|
|||
# Alphabet of symbols allowed in a sequence: the four nucleotides plus "-".
# NOTE(review): "-" is presumably the gap symbol inserted by the alignment
# step — confirm against the encoder that consumes BASES.
BASES = "ACGT-"

# Locations of the TFRecord files for each dataset split.
TRAIN_DATASET = "data/train_data.tfrecords"
TEST_DATASET = "data/test_data.tfrecords"
EVAL_DATASET = "data/eval_data.tfrecords"
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from typing import List, Tuple
|
||||
|
||||
from Bio.pairwise2 import align
|
||||
from Bio.SeqIO import parse
|
||||
from numpy.random import random
|
||||
from tensorflow import Tensor, int64
|
||||
|
@ -10,13 +11,24 @@ from tensorflow.train import Example, Feature, Features, Int64List
|
|||
from constants import *
|
||||
|
||||
|
||||
def align_sequences(sequence, label) -> Tuple[str, str]:
    """
    Globally align the altered sequence with the reference sequence.

    The alignment is computed with ``align.globalxx(label, sequence)`` and
    only the first alignment returned is used. Global alignment pads the
    inputs with gap characters so both outputs have the same length.

    Args:
        sequence: The altered sequence.
        label: The reference sequence.

    Returns:
        A ``(aligned_seq, aligned_label)`` tuple: the aligned altered
        sequence followed by the aligned reference sequence.

    Raises:
        IndexError: If the aligner returns no alignment.
    """
    alignments = align.globalxx(label, sequence)
    # Each alignment is ordered (seqA, seqB, score, begin, end), where seqA
    # and seqB follow the argument order of the call above. `label` was passed
    # first, so the aligned label comes first and the aligned sequence second.
    aligned_label, aligned_seq, *_ = alignments[0]
    return aligned_seq, aligned_label
|
||||
|
||||
|
||||
def generate_example(sequence, label) -> bytes:
    """
    Build a serialized Example holding an aligned sequence/label pair.

    The inputs are first aligned with ``align_sequences``; each aligned
    string is passed through ``encode_sequence`` and stored as an int64-list
    feature under the keys ``"sequence"`` and ``"label"``.

    Args:
        sequence: The altered sequence.
        label: The reference sequence.

    Returns:
        The Example serialized to a binary string.
    """
    aligned_seq, aligned_label = align_sequences(sequence, label)
    # Only the aligned strings are encoded: entries for the raw, unaligned
    # inputs under the same keys would be overwritten and are not built.
    schema = {
        "sequence": Feature(int64_list=Int64List(value=encode_sequence(aligned_seq))),
        "label": Feature(int64_list=Int64List(value=encode_sequence(aligned_label))),
    }
    example = Example(features=Features(feature=schema))
    return example.SerializeToString()
|
||||
|
|
Loading…
Reference in New Issue