Implement HTML parsing and output to a file

2021-11-13 14:21:24 +01:00 · 2021-11-13 14:21:24 +01:00 · a987e662b5
parent 36c525fffd
commit a987e662b5
2 changed files with 33 additions and 1 deletion
--- a/shell.nix
+++ b/shell.nix
@@ -2,4 +2,4 @@
 with pkgs;
-mkShell { buildInputs = [ python39 python39Packages.beautifulsoup4 ]; }
+mkShell { buildInputs = [ python39 python39Packages.pandas ]; }
--- a/src/parser.py
+++ b/src/parser.py
@@ -0,0 +1,32 @@
 from glob import glob
 from typing import List
 from pandas import DataFrame, read_html
 def find_html_files(path) -> List:
    """Return the entries directly inside ``path`` ending in ``fastqc.html``.

    The search pattern is ``path`` joined to ``*fastqc.html`` with a forward
    slash and handed to ``glob``; the order of the result is whatever
    ``glob`` yields.
    """
    search_pattern = "/".join((path, "*fastqc.html"))
    return glob(search_pattern)
 def extract_adapters(files) -> DataFrame:
    """Collect the ``Sequence`` column of the second table of every file.

    Each entry of ``files`` is parsed with ``read_html``; the ``Sequence``
    column of the table at index 1 is taken, and the columns of all files
    are stacked into one single-column frame (one sequence per row) with
    missing values removed.

    Args:
        files: Iterable of paths (or anything else ``read_html`` accepts).

    Returns:
        A DataFrame with a single ``Sequence`` column, or an empty
        DataFrame when ``files`` yields nothing.

    Raises:
        IndexError: If a file contains fewer than two tables.
        AttributeError: If the second table has no ``Sequence`` column.
    """
    # Function-scope import so the block works without touching the
    # file-level import list.
    from pandas import concat

    # NOTE(review): index 1 is presumably the sequence table of a FastQC
    # report -- confirm against real input files.
    per_file = [read_html(entry)[1].Sequence for entry in files]

    # concat() raises on an empty list, so handle "no files" explicitly.
    if not per_file:
        return DataFrame()

    # DataFrame.append was removed in pandas 2.0; concat is its replacement.
    # Stacking the values row-wise (instead of one wide row per file) makes
    # dropna() remove only the missing entries rather than discarding every
    # sequence of a file that has fewer entries than the longest one.
    stacked = concat(per_file, ignore_index=True).dropna()
    return stacked.to_frame(name="Sequence")
 def save_to_file(filename, adapters) -> None:
    """Write every cell of ``adapters`` to ``filename``, one value per line.

    Cells are emitted in row-major order without header or index. Missing
    values become empty lines.

    Args:
        filename: Path of the text file to create or overwrite.
        adapters: DataFrame (any shape) holding the values to write.
    """
    # Function-scope import so the block works without touching the
    # file-level import list.
    from pandas import isna

    # Written directly instead of via to_csv(sep="\n"): a newline is not a
    # valid csv delimiter on newer Python releases, and plain line output is
    # all that is needed here.
    cells = adapters.to_numpy().ravel()
    with open(filename, "w", encoding="utf-8") as handle:
        handle.writelines(
            ("" if isna(cell) else str(cell)) + "\n" for cell in cells
        )
 def main(data_dir="data", output_file="placeholder.txt"):
    """Find the HTML reports, extract their sequences and save them.

    Args:
        data_dir: Directory passed to ``find_html_files``. Defaults to
            ``"data"``.
        output_file: Path passed to ``save_to_file``. Defaults to
            ``"placeholder.txt"``.
    """
    html_reports = find_html_files(data_dir)
    adapters = extract_adapters(html_reports)
    save_to_file(output_file, adapters)
 # Run main() only when this file is executed as a script, not when imported.
 if __name__ == "__main__":
    main()
`@ -2,4 +2,4 @@`

	`with pkgs;`	`with pkgs;`

	`mkShell { buildInputs = [ python39 python39Packages.beautifulsoup4 ]; }`	`mkShell { buildInputs = [ python39 python39Packages.pandas ]; }`