Document the preprocessing module
This commit is contained in:
parent
5ac81c049f
commit
44ff69dc9e
|
@ -3,9 +3,13 @@ from Bio.SeqIO import parse
|
||||||
from numpy.random import random
|
from numpy.random import random
|
||||||
from tensorflow.io import TFRecordWriter
|
from tensorflow.io import TFRecordWriter
|
||||||
from tensorflow.train import BytesList, Example, Feature, Features, FloatList
|
from tensorflow.train import BytesList, Example, Feature, Features, FloatList
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
def generate_example(sequence, weight_matrix):
|
def generate_example(sequence, weight_matrix) -> bytes:
|
||||||
|
"""
|
||||||
|
Create a binary string for a sequence, containing the sequence and its bases' frequencies
|
||||||
|
"""
|
||||||
schema = {
|
schema = {
|
||||||
"sequence": Feature(bytes_list=BytesList(value=[sequence.encode()])),
|
"sequence": Feature(bytes_list=BytesList(value=[sequence.encode()])),
|
||||||
"A_counts": Feature(float_list=FloatList(value=[weight_matrix["A"][0]])),
|
"A_counts": Feature(float_list=FloatList(value=[weight_matrix["A"][0]])),
|
||||||
|
@ -17,7 +21,10 @@ def generate_example(sequence, weight_matrix):
|
||||||
return example.SerializeToString()
|
return example.SerializeToString()
|
||||||
|
|
||||||
|
|
||||||
def parse_data(filepath):
|
def parse_data(filepath) -> List[bytes]:
|
||||||
|
"""
|
||||||
|
Parse a FASTQ file and generate a List of serialized Examples
|
||||||
|
"""
|
||||||
examples = []
|
examples = []
|
||||||
with open(filepath) as handle:
|
with open(filepath) as handle:
|
||||||
for row in parse(handle, "fastq"):
|
for row in parse(handle, "fastq"):
|
||||||
|
@ -28,7 +35,10 @@ def parse_data(filepath):
|
||||||
return examples
|
return examples
|
||||||
|
|
||||||
|
|
||||||
def create_dataset(filepath):
|
def create_dataset(filepath) -> None:
|
||||||
|
"""
|
||||||
|
Create training and test datasets with a 70/30 split, respectively
|
||||||
|
"""
|
||||||
data = parse_data(filepath)
|
data = parse_data(filepath)
|
||||||
train_test_split = 0.7
|
train_test_split = 0.7
|
||||||
with TFRecordWriter("data/train_data.tfrecords") as train_writer, TFRecordWriter(
|
with TFRecordWriter("data/train_data.tfrecords") as train_writer, TFRecordWriter(
|
||||||
|
|
Loading…
Reference in New Issue