Create a dataset and write it to TFRecords files

This commit is contained in:
coolneng 2021-06-01 18:26:13 +02:00
parent 59aa61112e
commit 16c01afbe7
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
1 changed file with 46 additions and 0 deletions

46
src/preprocessing.py Normal file
View File

@@ -0,0 +1,46 @@
from Bio.motifs import create
from Bio.SeqIO import parse
from numpy.random import random
from tensorflow.io import TFRecordWriter
from tensorflow.train import BytesList, Example, Feature, Features, FloatList
# The four letters A, C, G and T -- presumably the DNA alphabet whose
# per-base values generate_example() stores. NOTE(review): nothing in the
# visible code references this constant; confirm whether it is still needed.
BASES = "ACGT"
def generate_example(sequence, weight_matrix):
    """Serialize one sequence and its per-base values as a tf.train.Example.

    Args:
        sequence: The sequence as a ``str``; it is stored as encoded bytes
            under the ``"sequence"`` key.
        weight_matrix: Object indexable first by base letter (``"A"``,
            ``"C"``, ``"G"``, ``"T"``) and then by position. Only position 0
            of each row is read. The caller in this file passes the ``pwm``
            attribute of a ``Bio.motifs`` motif.

    Returns:
        The serialized ``Example`` message as a byte string.
    """

    def first_position(base):
        # Wrap the position-0 entry of this base's row in a float feature.
        return Feature(float_list=FloatList(value=[weight_matrix[base][0]]))

    encoded_sequence = BytesList(value=[sequence.encode()])
    features = Features(
        feature={
            "sequence": Feature(bytes_list=encoded_sequence),
            "A_counts": first_position("A"),
            "C_counts": first_position("C"),
            "G_counts": first_position("G"),
            "T_counts": first_position("T"),
        }
    )
    return Example(features=features).SerializeToString()
def parse_data(filepath):
    """Read a FASTQ file and turn every record into a serialized example.

    For each record, a motif is built from the record's sequence with
    ``Bio.motifs.create`` and its ``pwm`` is handed, together with the
    sequence text, to ``generate_example``.

    Args:
        filepath: Path of the FASTQ file to read.

    Returns:
        A list with one serialized example per record, in file order.
    """
    with open(filepath) as fastq_file:
        return [
            generate_example(
                sequence=str(record.seq),
                weight_matrix=create(record.seq).pwm,
            )
            for record in parse(fastq_file, "fastq")
        ]
def create_dataset(
    filepath,
    train_test_split=0.7,
    train_path="data/train_data.tfrecords",
    test_path="data/test_data.tfrecords",
):
    """Split the examples parsed from a FASTQ file into two TFRecords files.

    Every serialized example returned by ``parse_data`` is written to exactly
    one of the two output files. The assignment is drawn independently per
    example, so the resulting proportions only approximate
    ``train_test_split``, and they vary between runs because no seed is set
    here.

    Args:
        filepath: Path of the FASTQ file to read.
        train_test_split: Probability, between 0 and 1 inclusive, that an
            example goes to the training file.
        train_path: Destination of the training TFRecords file.
        test_path: Destination of the test TFRecords file.

    Raises:
        ValueError: If ``train_test_split`` is not within ``[0, 1]``.
    """
    # Fail before parsing or opening any output: an out-of-range fraction
    # would otherwise silently send every example to a single file.
    if not 0.0 <= train_test_split <= 1.0:
        raise ValueError(
            f"train_test_split must be between 0 and 1, got {train_test_split!r}"
        )
    data = parse_data(filepath)
    with TFRecordWriter(train_path) as train_writer, TFRecordWriter(
        test_path
    ) as test_writer:
        for element in data:
            # One uniform draw per example decides which file receives it.
            writer = train_writer if random() < train_test_split else test_writer
            writer.write(element)
# Runs whenever this module is executed or imported (there is no __main__
# guard): builds the train/test TFRecords files from this FASTQ file.
create_dataset("data/curesim-HVR.fastq")