diff --git a/src/constants.py b/src/constants.py
deleted file mode 100644
index a0c9606..0000000
--- a/src/constants.py
+++ /dev/null
@@ -1,9 +0,0 @@
-BASES = "ACGT-"
-TRAIN_DATASET = "data/train_data.tfrecords"
-TEST_DATASET = "data/test_data.tfrecords"
-EVAL_DATASET = "data/eval_data.tfrecords"
-EPOCHS = 1000
-BATCH_SIZE = 1
-LEARNING_RATE = 0.004
-L2 = 0.001
-LOG_DIR = "logs"
diff --git a/src/hyperparameters.py b/src/hyperparameters.py
new file mode 100644
index 0000000..e7b5543
--- /dev/null
+++ b/src/hyperparameters.py
@@ -0,0 +1,24 @@
+class Hyperparameters:
+    def __init__(
+        self,
+        data_file,
+        label_file,
+        train_dataset="data/train_data.tfrecords",
+        test_dataset="data/test_data.tfrecords",
+        eval_dataset="data/eval_data.tfrecords",
+        epochs=1000,
+        batch_size=256,
+        learning_rate=0.004,
+        l2_rate=0.001,
+        log_directory="logs",
+    ):
+        self.data_file = data_file
+        self.label_file = label_file
+        self.train_dataset = train_dataset
+        self.eval_dataset = eval_dataset
+        self.test_dataset = test_dataset
+        self.epochs = epochs
+        self.batch_size = batch_size
+        self.learning_rate = learning_rate
+        self.l2_rate = l2_rate
+        self.log_directory = log_directory
diff --git a/src/model.py b/src/model.py
index 364ae3c..58bd4b9 100644
--- a/src/model.py
+++ b/src/model.py
@@ -2,17 +2,16 @@ from random import seed
 
 from tensorflow.keras import Model, Sequential
 from tensorflow.keras.layers import *
-from tensorflow.keras.callbacks import TensorBoard
 from tensorflow.keras.losses import categorical_crossentropy
 from tensorflow.keras.optimizers import Adam
 from tensorflow.keras.regularizers import l2
 from tensorflow.random import set_seed
 
-from constants import *
-from preprocessing import dataset_creation
+from hyperparameters import Hyperparameters
+from preprocessing import BASES, dataset_creation
 
 
-def build_model() -> Model:
+def build_model(hyperparams) -> Model:
     """
     Build the CNN model
     """
@@ -20,23 +19,33 @@ def build_model() -> Model:
         [
             Input(shape=(None, len(BASES))),
             Conv1D(
-                filters=16, kernel_size=5, activation="relu", kernel_regularizer=l2(L2)
+                filters=16,
+                kernel_size=5,
+                activation="relu",
+                kernel_regularizer=l2(hyperparams.l2_rate),
             ),
             MaxPool1D(pool_size=3, strides=1),
             Conv1D(
-                filters=16, kernel_size=3, activation="relu", kernel_regularizer=l2(L2)
+                filters=16,
+                kernel_size=3,
+                activation="relu",
+                kernel_regularizer=l2(hyperparams.l2_rate),
            ),
             MaxPool1D(pool_size=3, strides=1),
             GlobalAveragePooling1D(),
-            Dense(units=16, activation="relu", kernel_regularizer=l2(L2)),
+            Dense(
+                units=16, activation="relu", kernel_regularizer=l2(hyperparams.l2_rate)
+            ),
             Dropout(rate=0.3),
-            Dense(units=16, activation="relu", kernel_regularizer=l2(L2)),
+            Dense(
+                units=16, activation="relu", kernel_regularizer=l2(hyperparams.l2_rate)
+            ),
             Dropout(rate=0.3),
             Dense(units=len(BASES), activation="softmax"),
         ]
     )
     model.compile(
-        optimizer=Adam(LEARNING_RATE),
+        optimizer=Adam(hyperparams.learning_rate),
         loss=categorical_crossentropy,
         metrics=["accuracy"],
     )
@@ -59,17 +68,12 @@ def run(data_file, label_file, seed_value=42) -> None:
     """
     seed(seed_value)
     set_seed(seed_value)
-    train_data, eval_data, test_data = dataset_creation(data_file, label_file)
-    tensorboard = TensorBoard(log_dir=LOG_DIR, histogram_freq=1, profile_batch=0)
-    model = build_model()
+    hyperparams = Hyperparameters(data_file=data_file, label_file=label_file)
+    train_data, eval_data, test_data = dataset_creation(hyperparams)
+    model = build_model(hyperparams)
     print("Training the model")
-    model.fit(
-        train_data,
-        epochs=EPOCHS,
-        validation_data=eval_data,
-        callbacks=[tensorboard],
-    )
-    print("Training complete. Obtaining final metrics...")
+    model.fit(train_data, epochs=hyperparams.epochs, validation_data=eval_data)
+    print("Training complete. Obtaining the model's metrics...")
     show_metrics(model, eval_data, test_data)
diff --git a/src/preprocessing.py b/src/preprocessing.py
index 756300f..74d4cca 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -9,7 +9,7 @@ from tensorflow.io import TFRecordWriter, VarLenFeature, parse_single_example
 from tensorflow.sparse import to_dense
 from tensorflow.train import Example, Feature, Features, Int64List
 
-from constants import *
+BASES = "ACGT-"
 
 
 def align_sequences(sequence, label) -> Tuple[str, str]:
@@ -43,26 +43,26 @@ def encode_sequence(sequence) -> List[int]:
     return encoded_sequence
 
 
-def read_fastq(data_file, label_file) -> List[bytes]:
+def read_fastq(hyperparams) -> List[bytes]:
     """
     Parses a data and a label FASTQ files and generates a List of serialized Examples
     """
     examples = []
-    with open(data_file) as data, open(label_file) as labels:
+    with open(hyperparams.data_file) as data, open(hyperparams.label_file) as labels:
         for element, label in zip(parse(data, "fastq"), parse(labels, "fastq")):
             example = generate_example(sequence=str(element.seq), label=str(label.seq))
             examples.append(example)
     return examples
 
 
-def create_dataset(data_file, label_file, dataset_split=[0.8, 0.1, 0.1]) -> None:
+def create_dataset(hyperparams, dataset_split=[0.8, 0.1, 0.1]) -> None:
     """
     Create a training, evaluation and test dataset with a 80/10/10 split respectively
     """
-    data = read_fastq(data_file, label_file)
-    with TFRecordWriter(TRAIN_DATASET) as training, TFRecordWriter(
-        TEST_DATASET
-    ) as test, TFRecordWriter(EVAL_DATASET) as evaluation:
+    data = read_fastq(hyperparams)
+    with TFRecordWriter(hyperparams.train_dataset) as training, TFRecordWriter(
+        hyperparams.test_dataset
+    ) as test, TFRecordWriter(hyperparams.eval_dataset) as evaluation:
         for element in data:
             if random() < dataset_split[0]:
                 training.write(element)
@@ -97,25 +97,27 @@ def process_input(byte_string) -> Tuple[Tensor, Tensor]:
     return features["sequence"], features["label"]
 
 
-def read_dataset(filepath) -> TFRecordDataset:
+def read_dataset(filepath, hyperparams) -> TFRecordDataset:
     """
     Read TFRecords files and generate a dataset
     """
     data_input = TFRecordDataset(filenames=filepath)
     dataset = data_input.map(map_func=process_input, num_parallel_calls=AUTOTUNE)
     shuffled_dataset = dataset.shuffle(buffer_size=10000, seed=42)
-    batched_dataset = shuffled_dataset.batch(batch_size=BATCH_SIZE).repeat(count=EPOCHS)
+    batched_dataset = shuffled_dataset.batch(batch_size=hyperparams.batch_size).repeat(
+        count=hyperparams.epochs
+    )
     return batched_dataset
 
 
 def dataset_creation(
-    data_file, label_file
+    hyperparams,
 ) -> Tuple[TFRecordDataset, TFRecordDataset, TFRecordDataset]:
     """
     Generate the TFRecord files and split them into training, validation and test data
     """
-    create_dataset(data_file, label_file)
-    train_data = read_dataset(TRAIN_DATASET)
-    eval_data = read_dataset(EVAL_DATASET)
-    test_data = read_dataset(TEST_DATASET)
+    create_dataset(hyperparams)
+    train_data = read_dataset(hyperparams.train_dataset, hyperparams)
+    eval_data = read_dataset(hyperparams.eval_dataset, hyperparams)
+    test_data = read_dataset(hyperparams.test_dataset, hyperparams)
     return train_data, eval_data, test_data
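A minimal usage sketch of how the refactored modules fit together after this patch (not part of the diff; the FASTQ paths and the override values are hypothetical placeholders, and the imports assume the scripts are run from src/ as in the code above):

# Sketch only: uses the public pieces introduced in this patch.
from hyperparameters import Hyperparameters
from model import build_model, run
from preprocessing import dataset_creation

# Simplest path: run() builds its own Hyperparameters with the defaults shown above.
run(data_file="data/reads.fastq", label_file="data/labels.fastq")  # hypothetical paths

# Custom settings: pass a single Hyperparameters instance through the whole pipeline.
hyperparams = Hyperparameters(
    data_file="data/reads.fastq",    # hypothetical path
    label_file="data/labels.fastq",  # hypothetical path
    batch_size=128,                  # example override of the 256 default
    learning_rate=0.001,             # example override of the 0.004 default
)
train_data, eval_data, test_data = dataset_creation(hyperparams)
model = build_model(hyperparams)
model.fit(train_data, epochs=hyperparams.epochs, validation_data=eval_data)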