Create a validation set

Rename ref_sequence to label
2021-06-06 00:03:39 +02:00 · 2021-06-06 00:03:15 +02:00
1 changed file with 18 additions and 13 deletions
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -11,7 +11,7 @@ from tensorflow.train import Example, Feature, Features, FloatList, Int64List
 from constants import *


-def generate_example(sequence, reference_sequence, weight_matrix) -> bytes:
+def generate_example(sequence, label, weight_matrix) -> bytes:
    """
    Create a binary-string for each sequence containing the sequence and the bases' frequency
    """
@@ -19,9 +19,7 @@ def generate_example(sequence, reference_sequence, weight_matrix) -> bytes:
        "sequence": Feature(
            int64_list=Int64List(value=list(encode_sequence(sequence)))
        ),
-        "reference_sequence": Feature(
-            int64_list=Int64List(value=list(encode_sequence(reference_sequence)))
-        ),
+        "label": Feature(int64_list=Int64List(value=list(encode_sequence(label)))),
        "A_counts": Feature(float_list=FloatList(value=weight_matrix["A"])),
        "C_counts": Feature(float_list=FloatList(value=weight_matrix["C"])),
        "G_counts": Feature(float_list=FloatList(value=weight_matrix["G"])),
@@ -49,23 +47,27 @@ def read_fastq(data_file, label_file) -> List[bytes]:
            motifs = create([element.seq])
            example = generate_example(
                sequence=str(element.seq),
-                reference_sequence=str(label.seq),
+                label=str(label.seq),
                weight_matrix=motifs.pwm,
            )
            examples.append(example)
    return examples


-def create_dataset(filepath) -> None:
+def create_dataset(data_file, label_file) -> None:
    """
    Create a training and test dataset with a 70/30 split respectively
    """
    data = read_fastq(data_file, label_file)
-    train_test_split = 0.7
-    with TFRecordWriter(TRAIN_DATASET) as train, TFRecordWriter(TEST_DATASET) as test:
+    train_eval_test_split = [0.8, 0.1, 0.1]
+    with TFRecordWriter(TRAIN_DATASET) as training, TFRecordWriter(
+        TEST_DATASET
+    ) as test, TFRecordWriter(EVAL_DATASET) as evaluation:
        for element in data:
-            if random() < train_test_split:
-                train.write(element)
+            if random() < train_eval_test_split[0]:
+                training.write(element)
+            elif random() < train_eval_test_split[0] + train_eval_test_split[1]:
+                evaluation.write(element)
            else:
                test.write(element)

@@ -76,7 +78,7 @@ def process_input(byte_string) -> Example:
    """
    schema = {
        "sequence": FixedLenFeature(shape=[], dtype=int64),
-        "reference_sequence": FixedLenFeature(shape=[], dtype=int64),
+        "label": FixedLenFeature(shape=[], dtype=int64),
        "A_counts": FixedLenFeature(shape=[], dtype=float32),
        "C_counts": FixedLenFeature(shape=[], dtype=float32),
        "G_counts": FixedLenFeature(shape=[], dtype=float32),
@@ -96,8 +98,11 @@ def read_dataset(filepath) -> TFRecordDataset:
    return batched_dataset


-def dataset_creation(data_file, label_file) -> Tuple[TFRecordDataset, TFRecordDataset]:
+def dataset_creation(
+    data_file, label_file
+) -> Tuple[TFRecordDataset, TFRecordDataset, TFRecordDataset]:
    create_dataset(data_file, label_file)
    train_data = read_dataset(TRAIN_DATASET)
+    eval_data = read_dataset(EVAL_DATASET)
    test_data = read_dataset(TEST_DATASET)
-    return train_data, test_data
+    return train_data, eval_data, test_data
Author	SHA1	Message	Date
coolneng	2920db70b4	Create a validation set	2021-06-06 00:03:39 +02:00
coolneng	38903c5737	Rename ref_sequence to label	2021-06-06 00:03:15 +02:00