Create a dataset and write it to TFRecords files
This commit is contained in:
parent
59aa61112e
commit
16c01afbe7
|
@ -0,0 +1,46 @@
|
||||||
|
from Bio.motifs import create
|
||||||
|
from Bio.SeqIO import parse
|
||||||
|
from numpy.random import random
|
||||||
|
from tensorflow.io import TFRecordWriter
|
||||||
|
from tensorflow.train import BytesList, Example, Feature, Features, FloatList
|
||||||
|
|
||||||
|
# The four base letters; generate_example emits one "<base>_counts" feature
# for each of them.
BASES = "ACGT"
|
||||||
|
|
||||||
|
|
||||||
|
def generate_example(sequence, weight_matrix):
    """Serialize one sequence and its per-base weights as a ``tf.train.Example``.

    Args:
        sequence: The sequence as a ``str``; it is stored encoded to bytes
            under the ``"sequence"`` feature.
        weight_matrix: Object indexable by base letter (every letter in
            ``BASES``) whose rows are themselves indexable. Only position
            ``0`` of each row is read.

    Returns:
        The result of ``Example.SerializeToString()`` for an example holding
        the ``"sequence"`` bytes feature plus one single-valued float feature
        named ``"<base>_counts"`` per base.
    """
    schema = {
        "sequence": Feature(bytes_list=BytesList(value=[sequence.encode()])),
    }
    # Derive the per-base features from BASES instead of repeating one line
    # per letter; insertion order (A, C, G, T) matches the alphabet order.
    for base in BASES:
        schema[base + "_counts"] = Feature(
            float_list=FloatList(value=[weight_matrix[base][0]])
        )
    example = Example(features=Features(feature=schema))
    return example.SerializeToString()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_data(filepath):
    """Read a FASTQ file and return one serialized example per record.

    Args:
        filepath: Path of the FASTQ file to open and parse.

    Returns:
        A list with the value returned by ``generate_example`` for every
        record, in file order.
    """

    def _serialize(record):
        # NOTE(review): create() receives a single sequence rather than a
        # list of instances; presumably each base is then treated as its own
        # length-1 instance, so column 0 of the PWM would hold the base
        # frequencies of the sequence -- confirm against the Biopython
        # version in use.
        return generate_example(
            sequence=str(record.seq),
            weight_matrix=create(record.seq).pwm,
        )

    with open(filepath) as fastq_file:
        return [_serialize(record) for record in parse(fastq_file, "fastq")]
|
||||||
|
|
||||||
|
|
||||||
|
def create_dataset(
    filepath,
    train_test_split=0.7,
    train_path="data/train_data.tfrecords",
    test_path="data/test_data.tfrecords",
):
    """Parse a FASTQ file and split its examples into two TFRecord files.

    Each serialized example is assigned independently at random: it goes to
    the training file when a fresh ``random()`` draw is below
    ``train_test_split`` and to the test file otherwise, so the actual
    proportions vary from run to run.

    Args:
        filepath: Path of the FASTQ file handed to ``parse_data``.
        train_test_split: Probability, in ``[0, 1]``, that an example is
            written to the training file. Defaults to ``0.7``.
        train_path: Output path of the training TFRecord file.
        test_path: Output path of the test TFRecord file.

    Raises:
        ValueError: If ``train_test_split`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_test_split <= 1.0:
        raise ValueError(
            "train_test_split must be between 0 and 1, got %r" % (train_test_split,)
        )

    data = parse_data(filepath)
    with TFRecordWriter(train_path) as train_writer, TFRecordWriter(
        test_path
    ) as test_writer:
        for element in data:
            # Exactly one random draw per example decides its destination.
            writer = train_writer if random() < train_test_split else test_writer
            writer.write(element)
|
||||||
|
|
||||||
|
|
||||||
|
# Runs whenever this module is executed or imported: builds the train/test
# TFRecord files from the FASTQ file below.
# NOTE(review): consider an `if __name__ == "__main__":` guard if this module
# is ever meant to be imported without side effects.
create_dataset(filepath="data/curesim-HVR.fastq")
|
Loading…
Reference in New Issue