From 38903c5737211040f8f6016114ef00b37b8d74dd Mon Sep 17 00:00:00 2001 From: coolneng Date: Sun, 6 Jun 2021 00:03:15 +0200 Subject: [PATCH] Rename ref_sequence to label --- src/preprocessing.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/preprocessing.py b/src/preprocessing.py index c16a6fa..b9353d1 100644 --- a/src/preprocessing.py +++ b/src/preprocessing.py @@ -11,7 +11,7 @@ from tensorflow.train import Example, Feature, Features, FloatList, Int64List from constants import * -def generate_example(sequence, reference_sequence, weight_matrix) -> bytes: +def generate_example(sequence, label, weight_matrix) -> bytes: """ Create a binary-string for each sequence containing the sequence and the bases' frequency """ @@ -19,9 +19,7 @@ def generate_example(sequence, reference_sequence, weight_matrix) -> bytes: "sequence": Feature( int64_list=Int64List(value=list(encode_sequence(sequence))) ), - "reference_sequence": Feature( - int64_list=Int64List(value=list(encode_sequence(reference_sequence))) - ), + "label": Feature(int64_list=Int64List(value=list(encode_sequence(label)))), "A_counts": Feature(float_list=FloatList(value=weight_matrix["A"])), "C_counts": Feature(float_list=FloatList(value=weight_matrix["C"])), "G_counts": Feature(float_list=FloatList(value=weight_matrix["G"])), @@ -49,14 +47,14 @@ def read_fastq(data_file, label_file) -> List[bytes]: motifs = create([element.seq]) example = generate_example( sequence=str(element.seq), - reference_sequence=str(label.seq), + label=str(label.seq), weight_matrix=motifs.pwm, ) examples.append(example) return examples -def create_dataset(filepath) -> None: +def create_dataset(data_file, label_file) -> None: """ Create a training and test dataset with a 70/30 split respectively """ @@ -76,7 +74,7 @@ def process_input(byte_string) -> Example: """ schema = { "sequence": FixedLenFeature(shape=[], dtype=int64), - "reference_sequence": FixedLenFeature(shape=[], dtype=int64), + "label": FixedLenFeature(shape=[], dtype=int64), "A_counts": FixedLenFeature(shape=[], dtype=float32), "C_counts": FixedLenFeature(shape=[], dtype=float32), "G_counts": FixedLenFeature(shape=[], dtype=float32),