From 1237394bb1fa4e126d48d7d744034e199360e360 Mon Sep 17 00:00:00 2001 From: coolneng Date: Fri, 25 Jun 2021 00:04:01 +0200 Subject: [PATCH] Perform one hot encoding on the sequences --- src/preprocessing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/preprocessing.py b/src/preprocessing.py index e2995b4..0b0c52a 100644 --- a/src/preprocessing.py +++ b/src/preprocessing.py @@ -3,8 +3,7 @@ from typing import Dict, List, Tuple from Bio.pairwise2 import align from Bio.SeqIO import parse from numpy.random import random -from tensorflow import Tensor, int64 -from tensorflow.data import TFRecordDataset +from tensorflow import Tensor, int64, one_hot from tensorflow.data import AUTOTUNE, TFRecordDataset from tensorflow.io import TFRecordWriter, VarLenFeature, parse_single_example from tensorflow.sparse import to_dense @@ -78,12 +77,13 @@ def create_dataset(data_file, label_file, dataset_split=[0.8, 0.1, 0.1]) -> None def transform_features(parsed_features) -> Dict[str, Tensor]: """ - Transform the parsed features of an Example into a list of dense Tensors + Transform the parsed features of an Example into a list of dense one hot encoded Tensors """ features = {} sparse_features = ["sequence", "label"] for element in sparse_features: features[element] = to_dense(parsed_features[element]) + features[element] = one_hot(features[element], depth=len(BASES)) return features