"""Collect the ``Sequence`` column of HTML report tables into a text file.

Scans a directory for files whose names end in ``fastqc.html``, reads the
second HTML table of each file, and writes every value of its ``Sequence``
column to a plain-text file, one value per line.

Requires pandas. The change set that introduced this module also edits
``shell.nix`` so that ``buildInputs`` lists ``python39Packages.pandas``
instead of ``python39Packages.beautifulsoup4``.
"""
import os
from glob import glob
from typing import List

from pandas import DataFrame, isna, read_html


def find_html_files(path) -> List[str]:
    """Return the files directly inside *path* whose names end in ``fastqc.html``.

    Args:
        path: Directory to search; a ``str`` or any ``os.PathLike`` object.

    Returns:
        The matching paths, sorted so that the result does not depend on the
        order in which the filesystem lists directory entries.
    """
    pattern = os.path.join(os.fspath(path), "*fastqc.html")
    return sorted(glob(pattern))


def extract_adapters(files) -> DataFrame:
    """Gather the ``Sequence`` column of the second HTML table of each file.

    Args:
        files: Iterable of paths to HTML files readable by ``pandas.read_html``.

    Returns:
        A frame with one row per input file (each row labelled ``Sequence``)
        and one column per position in that file's ``Sequence`` column. Files
        that provide fewer values than the longest one are padded with NaN;
        rows that contain no value at all are removed.

    Raises:
        IndexError: If a file contains fewer than two HTML tables.
        AttributeError: If the second table has no ``Sequence`` column.
    """
    # Build the frame once from all collected Series: ``DataFrame.append`` no
    # longer exists in pandas 2.x and copied the whole frame on every call.
    per_file_sequences = [read_html(entry)[1].Sequence for entry in files]
    all_adapters = DataFrame(per_file_sequences)
    # A plain ``dropna()`` would discard every file that has fewer sequences
    # than the longest one, because its row is padded with NaN.
    return all_adapters.dropna(how="all")


def save_to_file(filename, adapters) -> None:
    """Write every non-missing value of *adapters* to *filename*, one per line.

    Args:
        filename: Path of the text file to create or overwrite.
        adapters: DataFrame (or Series) of values. Cells are written in
            row-major order and missing cells (NaN/None) are skipped.
    """
    # Flatten row by row so wide (one row per file) and single-column frames
    # are both written as one value per line.
    cells = DataFrame(adapters).to_numpy().ravel()
    with open(filename, "w", encoding="utf-8") as handle:
        handle.writelines(f"{cell}\n" for cell in cells if not isna(cell))


def main(data_dir="data", output="placeholder.txt") -> None:
    """Extract the sequences from every report in *data_dir* into *output*.

    Args:
        data_dir: Directory searched by ``find_html_files``.
        output: Path of the text file written by ``save_to_file``.
    """
    file_list = find_html_files(data_dir)
    adapters = extract_adapters(file_list)
    save_to_file(output, adapters)


if __name__ == "__main__":
    main()