Perform one hot encoding on the sequences

2021-06-25 00:04:01 +02:00 · 2021-06-25 00:04:01 +02:00 · 1237394bb1
parent e9582d0883
commit 1237394bb1
1 changed files with 3 additions and 3 deletions
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@ -3,8 +3,7 @@ from typing import Dict, List, Tuple
 from Bio.pairwise2 import align
 from Bio.SeqIO import parse
 from numpy.random import random
-from tensorflow import Tensor, int64
-from tensorflow.data import TFRecordDataset
+from tensorflow import Tensor, int64, one_hot
 from tensorflow.data import AUTOTUNE, TFRecordDataset
 from tensorflow.io import TFRecordWriter, VarLenFeature, parse_single_example
 from tensorflow.sparse import to_dense
@ -78,12 +77,13 @@ def create_dataset(data_file, label_file, dataset_split=[0.8, 0.1, 0.1]) -> None

 def transform_features(parsed_features) -> Dict[str, Tensor]:
    """
-    Transform the parsed features of an Example into a list of dense Tensors
+    Transform the parsed features of an Example into a list of dense one hot encoded Tensors
    """
    features = {}
    sparse_features = ["sequence", "label"]
    for element in sparse_features:
        features[element] = to_dense(parsed_features[element])
+        features[element] = one_hot(features[element], depth=len(BASES))
    return features