from argparse import ArgumentParser
from glob import glob
from typing import List

from pandas import DataFrame, Series, read_html

# NOTE(review): find_html_files() and parse_arguments() are only partially
# visible in this excerpt and are intentionally not reproduced here.


def extract_adapters(files) -> Series:
    """Collect the unique adapter sequences found in a set of HTML reports.

    Each entry of *files* is parsed with ``pandas.read_html``; the
    ``Sequence`` column of the second table (index 1) is taken from every
    file, and the combined result is cleaned by ``preprocess_dataframe``.

    Args:
        files: Iterable of paths (or anything ``read_html`` accepts).

    Returns:
        A Series of unique, non-missing sequences. It is empty when *files*
        is empty.

    Raises:
        IndexError: If a file contains fewer than two tables.
        AttributeError: If the second table has no ``Sequence`` column.
    """
    # One row per input file. ``DataFrame.append`` (used previously) was
    # removed in pandas 2.0 and copied the whole frame on every iteration,
    # so the rows are gathered first and the frame is built once.
    sequence_rows = [read_html(entry)[1].Sequence for entry in files]
    all_adapters = DataFrame(sequence_rows)
    return preprocess_dataframe(all_adapters)


def preprocess_dataframe(adapters) -> Series:
    """Flatten a frame of sequences into a Series without NaNs or duplicates.

    Args:
        adapters: DataFrame whose cells hold sequences; rows may have
            different numbers of filled cells (the remainder being NaN).

    Returns:
        A Series indexed by ``(row label, column label)`` holding the first
        occurrence of every distinct non-missing value, in row-major order.
    """
    if adapters.empty:
        # Nothing to stack; hand back an explicitly typed empty Series.
        return Series(dtype=object)
    # Missing cells are dropped *after* stacking. ``dropna(axis=1)`` on the
    # frame would discard a whole column as soon as one row is shorter than
    # the others, losing valid sequences from the longer rows.
    stacked_adapters = adapters.stack().dropna()
    return stacked_adapters.drop_duplicates()


def save_to_file(filename, adapters) -> None:
    """Write *adapters* to *filename* as FASTA-style records.

    Every ``(index, value)`` pair of the Series becomes two lines:
    ``>index`` followed by ``value``.

    Args:
        filename: Path of the file to create or overwrite.
        adapters: Series (or any object with a dict-like ``items()``) of
            sequences keyed by their record label.
    """
    # ``Series.iteritems`` was removed in pandas 2.0; ``items`` is the
    # equivalent that exists in every pandas version.
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(
            f">{index}\n{value}\n" for index, value in adapters.items()
        )