Update documentation about data splits

This commit is contained in:
coolneng 2021-06-06 00:13:37 +02:00
parent 8870da8543
commit 168a68b50d
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
1 changed file with 7 additions and 3 deletions

View File

@ -54,12 +54,13 @@ def read_fastq(data_file, label_file) -> List[bytes]:
return examples
def create_dataset(data_file, label_file) -> None:
def create_dataset(
data_file, label_file, train_eval_test_split=[0.8, 0.1, 0.1]
) -> None:
"""
Create a training and test dataset with a 70/30 split respectively
Create a training, evaluation and test dataset with an 80/10/10 split respectively
"""
data = read_fastq(data_file, label_file)
train_eval_test_split = [0.8, 0.1, 0.1]
with TFRecordWriter(TRAIN_DATASET) as training, TFRecordWriter(
TEST_DATASET
) as test, TFRecordWriter(EVAL_DATASET) as evaluation:
@ -101,6 +102,9 @@ def read_dataset(filepath) -> TFRecordDataset:
def dataset_creation(
data_file, label_file
) -> Tuple[TFRecordDataset, TFRecordDataset, TFRecordDataset]:
"""
Generate the TFRecord files and split them into training, evaluation and test data
"""
create_dataset(data_file, label_file)
train_data = read_dataset(TRAIN_DATASET)
eval_data = read_dataset(EVAL_DATASET)