Align altered sequence with the reference sequence
This commit is contained in:
parent
0912600fdc
commit
c9466baa68
|
@ -1,4 +1,4 @@
|
||||||
BASES = "ACGT"
|
BASES = "ACGT-"
|
||||||
TRAIN_DATASET = "data/train_data.tfrecords"
|
TRAIN_DATASET = "data/train_data.tfrecords"
|
||||||
TEST_DATASET = "data/test_data.tfrecords"
|
TEST_DATASET = "data/test_data.tfrecords"
|
||||||
EVAL_DATASET = "data/eval_data.tfrecords"
|
EVAL_DATASET = "data/eval_data.tfrecords"
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
from Bio.pairwise2 import align
|
||||||
from Bio.SeqIO import parse
|
from Bio.SeqIO import parse
|
||||||
from numpy.random import random
|
from numpy.random import random
|
||||||
from tensorflow import Tensor, int64
|
from tensorflow import Tensor, int64
|
||||||
|
@ -10,13 +11,24 @@ from tensorflow.train import Example, Feature, Features, Int64List
|
||||||
from constants import *
|
from constants import *
|
||||||
|
|
||||||
|
|
||||||
|
def align_sequences(sequence, label) -> Tuple[str, str]:
    """
    Globally align the altered sequence with the reference sequence so that both
    outputs have the same length.

    :param sequence: the altered sequence
    :param label: the reference sequence
    :return: the aligned altered sequence followed by the aligned reference sequence
    """
    alignments = align.globalxx(label, sequence)
    # Each alignment lists its aligned strings in the same order as the arguments
    # passed to globalxx, so the reference (label) comes first and the altered
    # sequence second. Unpacking them the other way round swaps input and label.
    aligned_label, aligned_seq, *_ = alignments[0]
    return aligned_seq, aligned_label
|
||||||
|
|
||||||
|
|
||||||
def generate_example(sequence, label) -> bytes:
    """
    Build the serialized binary-string Example holding the encoded aligned sequence
    and the encoded aligned label
    """
    aligned_seq, aligned_label = align_sequences(sequence, label)
    # Encode both aligned strings first, then wrap each one in an int64 feature
    encoded = {
        "sequence": encode_sequence(aligned_seq),
        "label": encode_sequence(aligned_label),
    }
    features = Features(
        feature={
            name: Feature(int64_list=Int64List(value=values))
            for name, values in encoded.items()
        }
    )
    return Example(features=features).SerializeToString()
|
||||||
|
|
Loading…
Reference in New Issue