From 44ff69dc9e7191dfea7e28a2ec8787d9d2d28be6 Mon Sep 17 00:00:00 2001
From: coolneng
Date: Tue, 1 Jun 2021 18:46:17 +0200
Subject: [PATCH] Document the preprocessing module

---
 src/preprocessing.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/preprocessing.py b/src/preprocessing.py
index 93fa5b6..9904d73 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -3,9 +3,13 @@ from Bio.SeqIO import parse
 from numpy.random import random
 from tensorflow.io import TFRecordWriter
 from tensorflow.train import BytesList, Example, Feature, Features, FloatList
+from typing import List
 
 
-def generate_example(sequence, weight_matrix):
+def generate_example(sequence, weight_matrix) -> bytes:
+    """
+    Create a binary-string for each sequence containing the sequence and the bases' frequency
+    """
     schema = {
         "sequence": Feature(bytes_list=BytesList(value=[sequence.encode()])),
         "A_counts": Feature(float_list=FloatList(value=[weight_matrix["A"][0]])),
@@ -17,7 +21,10 @@ def generate_example(sequence, weight_matrix):
     return example.SerializeToString()
 
 
-def parse_data(filepath):
+def parse_data(filepath) -> List[bytes]:
+    """
+    Parse a FASTQ file and generate a List of serialized Examples
+    """
     examples = []
     with open(filepath) as handle:
         for row in parse(handle, "fastq"):
@@ -28,7 +35,10 @@ def parse_data(filepath):
     return examples
 
 
-def create_dataset(filepath):
+def create_dataset(filepath) -> None:
+    """
+    Create a training and test dataset with a 70/30 split respectively
+    """
     data = parse_data(filepath)
     train_test_split = 0.7
     with TFRecordWriter("data/train_data.tfrecords") as train_writer, TFRecordWriter(
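
Note (not part of the patch): a quick way to sanity-check the records written by
create_dataset is to read them back with TensorFlow's TFRecord reader. The sketch
below is only an illustration: just the "sequence" and "A_counts" features appear
in the hunks above, so the remaining base-count features are assumed to be declared
the same way as "A_counts".

    # Minimal read-back sketch, assuming create_dataset has already written
    # data/train_data.tfrecords as shown in the last hunk
    import tensorflow as tf

    feature_description = {
        "sequence": tf.io.FixedLenFeature([], tf.string),
        "A_counts": tf.io.FixedLenFeature([], tf.float32),
        # Assumption: the other base-count features (not visible in the diff)
        # would be added here with the same float declaration
    }

    def decode(serialized_example):
        # Map the serialized Example bytes back into a dict of tensors
        return tf.io.parse_single_example(serialized_example, feature_description)

    dataset = tf.data.TFRecordDataset("data/train_data.tfrecords").map(decode)
    for record in dataset.take(1):
        print(record["sequence"].numpy(), record["A_counts"].numpy())

Features not listed in feature_description are simply ignored by
tf.io.parse_single_example, so this check works even with the truncated schema.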