Update documentation about data splits

2021-06-06 00:13:37 +02:00 · 2021-06-06 00:13:37 +02:00 · 168a68b50d
commit 168a68b50d
parent 8870da8543
1 changed files with 7 additions and 3 deletions
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -54,12 +54,13 @@ def read_fastq(data_file, label_file) -> List[bytes]:
    return examples


-def create_dataset(data_file, label_file) -> None:
+def create_dataset(
+    data_file, label_file, train_eval_test_split=[0.8, 0.1, 0.1]
+) -> None:
    """
-    Create a training and test dataset with a 70/30 split respectively
+    Create a training, evaluation and test dataset with an 80/10/10 split respectively
    """
    data = read_fastq(data_file, label_file)
-    train_eval_test_split = [0.8, 0.1, 0.1]
    with TFRecordWriter(TRAIN_DATASET) as training, TFRecordWriter(
        TEST_DATASET
    ) as test, TFRecordWriter(EVAL_DATASET) as evaluation:
@@ -101,6 +102,9 @@ def read_dataset(filepath) -> TFRecordDataset:
 def dataset_creation(
    data_file, label_file
 ) -> Tuple[TFRecordDataset, TFRecordDataset, TFRecordDataset]:
+    """
+    Generate the TFRecord files and split them into training, evaluation and test data
+    """
    create_dataset(data_file, label_file)
    train_data = read_dataset(TRAIN_DATASET)
    eval_data = read_dataset(EVAL_DATASET)