Document the preprocessing module

coolneng 2021-06-01 18:46:17 +02:00
parent 5ac81c049f
commit 44ff69dc9e
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
1 changed file with 13 additions and 3 deletions


@@ -3,9 +3,13 @@ from Bio.SeqIO import parse
 from numpy.random import random
 from tensorflow.io import TFRecordWriter
 from tensorflow.train import BytesList, Example, Feature, Features, FloatList
+from typing import List


-def generate_example(sequence, weight_matrix):
+def generate_example(sequence, weight_matrix) -> bytes:
+    """
+    Create a binary string for each sequence containing the sequence and the bases' frequencies
+    """
     schema = {
         "sequence": Feature(bytes_list=BytesList(value=[sequence.encode()])),
         "A_counts": Feature(float_list=FloatList(value=[weight_matrix["A"][0]])),
@@ -17,7 +21,10 @@ def generate_example(sequence, weight_matrix):
     return example.SerializeToString()


-def parse_data(filepath):
+def parse_data(filepath) -> List[bytes]:
+    """
+    Parse a FASTQ file and generate a List of serialized Examples
+    """
     examples = []
     with open(filepath) as handle:
         for row in parse(handle, "fastq"):
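
The parsing loop relies on Biopython: parse(handle, "fastq") yields one SeqRecord per read, with the read itself available as record.seq. The loop body and the weight-matrix computation are not visible in this diff, so the sketch below uses a hypothetical count_bases helper and input path purely to show the expected shapes.

    # Hedged sketch of iterating FASTQ records; count_bases and the path are
    # hypothetical stand-ins, since the actual loop body is not shown in the diff.
    from Bio.SeqIO import parse

    def count_bases(sequence: str) -> dict:
        # Toy weight matrix: one frequency value per base, indexed like weight_matrix["A"][0]
        length = len(sequence) or 1
        return {base: [sequence.count(base) / length] for base in "ACGT"}

    with open("data/reads.fastq") as handle:  # hypothetical input file
        for record in parse(handle, "fastq"):
            sequence = str(record.seq)
            weight_matrix = count_bases(sequence)
            # each pair would then go through generate_example(sequence, weight_matrix)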
@@ -28,7 +35,10 @@ def parse_data(filepath):
     return examples


-def create_dataset(filepath):
+def create_dataset(filepath) -> None:
+    """
+    Create a training and a test dataset with a 70/30 split
+    """
     data = parse_data(filepath)
     train_test_split = 0.7
     with TFRecordWriter("data/train_data.tfrecords") as train_writer, TFRecordWriter(
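
create_dataset is cut off here, but the visible pieces (the 0.7 ratio, the numpy random import, and the two TFRecordWriter context managers) suggest a per-example random split into a training and a test file. A minimal sketch of that pattern, with the test file name assumed:

    # Sketch of a 70/30 random split over serialized examples; the test file name
    # and the per-example random draw are assumptions based on the visible imports.
    from numpy.random import random
    from tensorflow.io import TFRecordWriter

    def write_split(examples, train_ratio=0.7):
        with TFRecordWriter("data/train_data.tfrecords") as train_writer, TFRecordWriter(
            "data/test_data.tfrecords"
        ) as test_writer:
            for example in examples:
                # roughly train_ratio of the records end up in the training file
                if random() < train_ratio:
                    train_writer.write(example)
                else:
                    test_writer.write(example)

Either file can then be read back with tf.data.TFRecordDataset for training and evaluation.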