Document the preprocessing module

This commit is contained in:
coolneng 2021-06-01 18:46:17 +02:00
parent 5ac81c049f
commit 44ff69dc9e
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
1 changed files with 13 additions and 3 deletions

View File

@ -3,9 +3,13 @@ from Bio.SeqIO import parse
from numpy.random import random from numpy.random import random
from tensorflow.io import TFRecordWriter from tensorflow.io import TFRecordWriter
from tensorflow.train import BytesList, Example, Feature, Features, FloatList from tensorflow.train import BytesList, Example, Feature, Features, FloatList
from typing import List
def generate_example(sequence, weight_matrix): def generate_example(sequence, weight_matrix) -> bytes:
"""
Create a binary string for each sequence containing the sequence and the bases' frequencies
"""
schema = { schema = {
"sequence": Feature(bytes_list=BytesList(value=[sequence.encode()])), "sequence": Feature(bytes_list=BytesList(value=[sequence.encode()])),
"A_counts": Feature(float_list=FloatList(value=[weight_matrix["A"][0]])), "A_counts": Feature(float_list=FloatList(value=[weight_matrix["A"][0]])),
@ -17,7 +21,10 @@ def generate_example(sequence, weight_matrix):
return example.SerializeToString() return example.SerializeToString()
def parse_data(filepath): def parse_data(filepath) -> List[bytes]:
"""
Parse a FASTQ file and generate a List of serialized Examples
"""
examples = [] examples = []
with open(filepath) as handle: with open(filepath) as handle:
for row in parse(handle, "fastq"): for row in parse(handle, "fastq"):
@ -28,7 +35,10 @@ def parse_data(filepath):
return examples return examples
def create_dataset(filepath): def create_dataset(filepath) -> None:
"""
Create training and test datasets with a 70/30 split, respectively
"""
data = parse_data(filepath) data = parse_data(filepath)
train_test_split = 0.7 train_test_split = 0.7
with TFRecordWriter("data/train_data.tfrecords") as train_writer, TFRecordWriter( with TFRecordWriter("data/train_data.tfrecords") as train_writer, TFRecordWriter(