Update documentation about data splits
This commit is contained in:
parent
8870da8543
commit
168a68b50d
|
@ -54,12 +54,13 @@ def read_fastq(data_file, label_file) -> List[bytes]:
|
|||
return examples
|
||||
|
||||
|
||||
def create_dataset(data_file, label_file) -> None:
|
||||
def create_dataset(
|
||||
data_file, label_file, train_eval_test_split=[0.8, 0.1, 0.1]
|
||||
) -> None:
|
||||
"""
|
||||
Create a training and test dataset with a 70/30 split respectively
|
||||
Create a training, evaluation and test dataset with an 80/10/10 split respectively
|
||||
"""
|
||||
data = read_fastq(data_file, label_file)
|
||||
train_eval_test_split = [0.8, 0.1, 0.1]
|
||||
with TFRecordWriter(TRAIN_DATASET) as training, TFRecordWriter(
|
||||
TEST_DATASET
|
||||
) as test, TFRecordWriter(EVAL_DATASET) as evaluation:
|
||||
|
@ -101,6 +102,9 @@ def read_dataset(filepath) -> TFRecordDataset:
|
|||
def dataset_creation(
|
||||
data_file, label_file
|
||||
) -> Tuple[TFRecordDataset, TFRecordDataset, TFRecordDataset]:
|
||||
"""
|
||||
Generate the TFRecord files and split them into training, validation and test data
|
||||
"""
|
||||
create_dataset(data_file, label_file)
|
||||
train_data = read_dataset(TRAIN_DATASET)
|
||||
eval_data = read_dataset(EVAL_DATASET)
|
||||
|
|
Loading…
Reference in New Issue