Update documentation about data splits
This commit is contained in:
parent
8870da8543
commit
168a68b50d
|
@ -54,12 +54,13 @@ def read_fastq(data_file, label_file) -> List[bytes]:
|
||||||
return examples
|
return examples
|
||||||
|
|
||||||
|
|
||||||
def create_dataset(data_file, label_file) -> None:
|
def create_dataset(
|
||||||
|
data_file, label_file, train_eval_test_split=[0.8, 0.1, 0.1]
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Create a training and test dataset with a 70/30 split respectively
|
Create a training, evaluation and test dataset with an 80/10/10 split respectively
|
||||||
"""
|
"""
|
||||||
data = read_fastq(data_file, label_file)
|
data = read_fastq(data_file, label_file)
|
||||||
train_eval_test_split = [0.8, 0.1, 0.1]
|
|
||||||
with TFRecordWriter(TRAIN_DATASET) as training, TFRecordWriter(
|
with TFRecordWriter(TRAIN_DATASET) as training, TFRecordWriter(
|
||||||
TEST_DATASET
|
TEST_DATASET
|
||||||
) as test, TFRecordWriter(EVAL_DATASET) as evaluation:
|
) as test, TFRecordWriter(EVAL_DATASET) as evaluation:
|
||||||
|
@ -101,6 +102,9 @@ def read_dataset(filepath) -> TFRecordDataset:
|
||||||
def dataset_creation(
|
def dataset_creation(
|
||||||
data_file, label_file
|
data_file, label_file
|
||||||
) -> Tuple[TFRecordDataset, TFRecordDataset, TFRecordDataset]:
|
) -> Tuple[TFRecordDataset, TFRecordDataset, TFRecordDataset]:
|
||||||
|
"""
|
||||||
|
Generate the TFRecord files and split them into training, validation and test data
|
||||||
|
"""
|
||||||
create_dataset(data_file, label_file)
|
create_dataset(data_file, label_file)
|
||||||
train_data = read_dataset(TRAIN_DATASET)
|
train_data = read_dataset(TRAIN_DATASET)
|
||||||
eval_data = read_dataset(EVAL_DATASET)
|
eval_data = read_dataset(EVAL_DATASET)
|
||||||
|
|
Loading…
Reference in New Issue