Compare commits
No commits in common. "c9de0c83207a66057c5e0c46d23a354c099d004a" and "f8c1a54be3d7f61ad37a60d4b1b4cbbb298ea1ac" have entirely different histories.
c9de0c8320
...
f8c1a54be3
|
@ -3,5 +3,3 @@ TRAIN_DATASET = "data/train_data.tfrecords"
|
||||||
TEST_DATASET = "data/test_data.tfrecords"
|
TEST_DATASET = "data/test_data.tfrecords"
|
||||||
EPOCHS = 1000
|
EPOCHS = 1000
|
||||||
BATCH_SIZE = 256
|
BATCH_SIZE = 256
|
||||||
LEARNING_RATE = 0.004
|
|
||||||
L2 = 0.001
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ def encode_sequence(sequence) -> List[int]:
|
||||||
return encoded_sequence
|
return encoded_sequence
|
||||||
|
|
||||||
|
|
||||||
def read_fastq(filepath) -> List[bytes]:
|
def parse_data(filepath) -> List[bytes]:
|
||||||
"""
|
"""
|
||||||
Parse a FASTQ file and generate a List of serialized Examples
|
Parse a FASTQ file and generate a List of serialized Examples
|
||||||
"""
|
"""
|
||||||
|
@ -54,7 +54,7 @@ def create_dataset(filepath) -> None:
|
||||||
"""
|
"""
|
||||||
Create a training and test dataset with a 70/30 split respectively
|
Create a training and test dataset with a 70/30 split respectively
|
||||||
"""
|
"""
|
||||||
data = read_fastq(filepath)
|
data = parse_data(filepath)
|
||||||
train_test_split = 0.7
|
train_test_split = 0.7
|
||||||
with TFRecordWriter(TRAIN_DATASET) as train, TFRecordWriter(TEST_DATASET) as test:
|
with TFRecordWriter(TRAIN_DATASET) as train, TFRecordWriter(TEST_DATASET) as test:
|
||||||
for element in data:
|
for element in data:
|
||||||
|
@ -64,10 +64,7 @@ def create_dataset(filepath) -> None:
|
||||||
test.write(element)
|
test.write(element)
|
||||||
|
|
||||||
|
|
||||||
def process_input(byte_string) -> Example:
|
def process_input(byte_string):
|
||||||
"""
|
|
||||||
Parse a byte-string into an Example object
|
|
||||||
"""
|
|
||||||
schema = {
|
schema = {
|
||||||
"sequence": FixedLenFeature(shape=[], dtype=int64),
|
"sequence": FixedLenFeature(shape=[], dtype=int64),
|
||||||
"A_counts": FixedLenFeature(shape=[], dtype=float32),
|
"A_counts": FixedLenFeature(shape=[], dtype=float32),
|
||||||
|
@ -78,12 +75,13 @@ def process_input(byte_string) -> Example:
|
||||||
return parse_single_example(byte_string, features=schema)
|
return parse_single_example(byte_string, features=schema)
|
||||||
|
|
||||||
|
|
||||||
def read_dataset() -> TFRecordDataset:
|
def read_dataset():
|
||||||
"""
|
|
||||||
Read TFRecords files and generate a dataset
|
|
||||||
"""
|
|
||||||
data_input = TFRecordDataset(filenames=[TRAIN_DATASET, TEST_DATASET])
|
data_input = TFRecordDataset(filenames=[TRAIN_DATASET, TEST_DATASET])
|
||||||
dataset = data_input.map(map_func=process_input)
|
dataset = data_input.map(map_func=process_input)
|
||||||
shuffled_dataset = dataset.shuffle(buffer_size=10000, reshuffle_each_iteration=True)
|
shuffled_dataset = dataset.shuffle(buffer_size=10000, reshuffle_each_iteration=True)
|
||||||
batched_dataset = shuffled_dataset.batch(batch_size=BATCH_SIZE).repeat(count=EPOCHS)
|
batched_dataset = shuffled_dataset.batch(batch_size=BATCH_SIZE).repeat(count=EPOCHS)
|
||||||
return batched_dataset
|
return batched_dataset
|
||||||
|
|
||||||
|
|
||||||
|
create_dataset("data/curesim-HVR.fastq")
|
||||||
|
dataset = read_dataset()
|
||||||
|
|
Loading…
Reference in New Issue