# Reconstructed from the diff that adds src/preprocessing.py (new file).
"""Turn FASTQ reads into serialized ``tf.train.Example`` records.

Each read becomes one example holding the raw sequence plus one float per
base in ``BASES``; the examples are then randomly divided between a training
and a test TFRecord file.
"""
from Bio.motifs import create
from Bio.SeqIO import parse
from numpy.random import random
from tensorflow.io import TFRecordWriter
from tensorflow.train import BytesList, Example, Feature, Features, FloatList

BASES = "ACGT"


def generate_example(sequence, weight_matrix):
    """Serialize one read and its per-base values as a ``tf.train.Example``.

    Args:
        sequence: The read as a ``str``; stored encoded under ``"sequence"``.
        weight_matrix: Mapping indexable as ``weight_matrix[base][0]`` for
            every base in ``BASES``.

    Returns:
        The serialized example (result of ``Example.SerializeToString()``).
    """
    schema = {
        "sequence": Feature(bytes_list=BytesList(value=[sequence.encode()])),
    }
    # One "<base>_counts" float feature per base, taken from column 0.
    # NOTE(review): callers in this file pass a motif's ``.pwm``, so despite
    # the "_counts" key these are presumably normalized frequencies - confirm.
    for base in BASES:
        schema[base + "_counts"] = Feature(
            float_list=FloatList(value=[weight_matrix[base][0]])
        )
    example = Example(features=Features(feature=schema))
    return example.SerializeToString()


def parse_data(filepath):
    """Read a FASTQ file and return one serialized example per record.

    Args:
        filepath: Path of the FASTQ file to read.

    Returns:
        A list of serialized examples, in file order.
    """
    examples = []
    with open(filepath) as handle:
        for row in parse(handle, "fastq"):
            sequence = str(row.seq)
            # NOTE(review): ``create`` is given a single sequence rather than
            # a collection of sequences; presumably each base is treated as a
            # length-1 instance, making column 0 the read's base composition.
            # TODO confirm against the installed Biopython version.
            motifs = create(row.seq)
            examples.append(
                generate_example(sequence=sequence, weight_matrix=motifs.pwm)
            )
    return examples


def create_dataset(
    filepath,
    train_fraction=0.7,
    train_path="data/train_data.tfrecords",
    test_path="data/test_data.tfrecords",
    seed=None,
):
    """Parse ``filepath`` and write a random train/test split as TFRecords.

    Every example is assigned independently: it goes to the training file
    when a uniform draw falls below ``train_fraction``, otherwise to the
    test file.

    Args:
        filepath: Path of the FASTQ file to read.
        train_fraction: Probability, in ``[0, 1]``, that an example is
            written to the training file.
        train_path: Destination of the training TFRecord file.
        test_path: Destination of the test TFRecord file.
        seed: Optional seed for a private random generator so the split can
            be reproduced. When ``None`` the global ``numpy.random.random``
            is used.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(
            "train_fraction must be between 0 and 1, got %r" % (train_fraction,)
        )

    if seed is None:
        draw = random
    else:
        # Local import: only needed when a reproducible split is requested.
        from numpy.random import RandomState

        draw = RandomState(seed).random_sample

    data = parse_data(filepath)
    with TFRecordWriter(train_path) as train_writer, TFRecordWriter(
        test_path
    ) as test_writer:
        for element in data:
            if draw() < train_fraction:
                train_writer.write(element)
            else:
                test_writer.write(element)


# NOTE(review): this runs whenever the module is imported, not only when it is
# executed as a script; consider an ``if __name__ == "__main__":`` guard once
# it is confirmed that nothing relies on the import-time side effect.
create_dataset("data/curesim-HVR.fastq")