From 168a68b50d850b029dae066df84514aea9258fa7 Mon Sep 17 00:00:00 2001 From: coolneng Date: Sun, 6 Jun 2021 00:13:37 +0200 Subject: [PATCH] Update documentation about data splits --- src/preprocessing.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/preprocessing.py b/src/preprocessing.py index a08b6cf..102135c 100644 --- a/src/preprocessing.py +++ b/src/preprocessing.py @@ -54,12 +54,13 @@ def read_fastq(data_file, label_file) -> List[bytes]: return examples -def create_dataset(data_file, label_file) -> None: +def create_dataset( + data_file, label_file, train_eval_test_split=[0.8, 0.1, 0.1] +) -> None: """ - Create a training and test dataset with a 70/30 split respectively + Create a training, evaluation and test dataset with an 80/10/10 split respectively """ data = read_fastq(data_file, label_file) - train_eval_test_split = [0.8, 0.1, 0.1] with TFRecordWriter(TRAIN_DATASET) as training, TFRecordWriter( TEST_DATASET ) as test, TFRecordWriter(EVAL_DATASET) as evaluation: @@ -101,6 +102,9 @@ def read_dataset(filepath) -> TFRecordDataset: def dataset_creation( data_file, label_file ) -> Tuple[TFRecordDataset, TFRecordDataset, TFRecordDataset]: + """ + Generate the TFRecord files and split them into training, validation and test data + """ create_dataset(data_file, label_file) train_data = read_dataset(TRAIN_DATASET) eval_data = read_dataset(EVAL_DATASET)