Refactor sequence preprocessing

This commit is contained in:
coolneng 2021-07-05 19:54:48 +02:00
parent 72e3de945a
commit 70363a82a0
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
1 changed file with 24 additions and 13 deletions

View File

@ -22,19 +22,6 @@ def align_sequences(sequence, label) -> Tuple[str, str]:
return aligned_seq, aligned_label
def generate_example(sequence, label) -> bytes:
    """
    Serialize a sequence/label pair into the binary string of an Example

    Both inputs are aligned with align_sequences and encoded with encode_sequence,
    then stored as int64 lists under the "sequence" and "label" feature keys
    """
    aligned_seq, aligned_label = align_sequences(sequence, label)
    # The sequence is encoded before the label, as in the original dict literal
    sequence_feature = Feature(int64_list=Int64List(value=encode_sequence(aligned_seq)))
    label_feature = Feature(int64_list=Int64List(value=encode_sequence(aligned_label)))
    features = Features(feature={"sequence": sequence_feature, "label": label_feature})
    return Example(features=features).SerializeToString()
def encode_sequence(sequence) -> List[int]:
"""
Encode the DNA sequence using the indices of the BASES constant
@ -43,6 +30,30 @@ def encode_sequence(sequence) -> List[int]:
return encoded_sequence
def prepare_sequences(sequence, label) -> Tuple[List[int], List[int]]:
    """
    Align the sequence with its label and encode both of them

    The pair is first aligned with align_sequences and each aligned string is then
    encoded with encode_sequence. Returns the encoded sequence followed by the
    encoded label
    """
    aligned_seq, aligned_label = align_sequences(sequence, label)
    # Build the pair directly instead of appending to a temporary list and indexing it
    return encode_sequence(aligned_seq), encode_sequence(aligned_label)
def generate_example(sequence, label) -> bytes:
    """
    Build an Example from a sequence/label pair and return it as a binary string

    The pair is aligned and encoded by prepare_sequences; the results are stored as
    int64 lists under the "sequence" and "label" feature keys
    """
    processed_seq, processed_label = prepare_sequences(sequence, label)
    sequence_feature = Feature(int64_list=Int64List(value=processed_seq))
    label_feature = Feature(int64_list=Int64List(value=processed_label))
    record = Example(
        features=Features(
            feature={"sequence": sequence_feature, "label": label_feature}
        )
    )
    return record.SerializeToString()
def read_fastq(hyperparams) -> List[bytes]:
"""
Parses a data and a label FASTQ files and generates a List of serialized Examples