Implement HTML parsing and output to a file
This commit is contained in:
parent
36c525fffd
commit
a987e662b5
|
@ -2,4 +2,4 @@
|
|||
|
||||
with pkgs;
|
||||
|
||||
mkShell { buildInputs = [ python39 python39Packages.beautifulsoup4 ]; }
|
||||
mkShell { buildInputs = [ python39 python39Packages.pandas ]; }
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
from glob import glob
|
||||
from typing import List
|
||||
|
||||
from pandas import DataFrame, read_html
|
||||
|
||||
|
||||
def find_html_files(path) -> List[str]:
    """Return the FastQC HTML reports located directly inside *path*.

    Args:
        path: Directory to search, given as a string or an ``os.PathLike``
            object. It is interpolated into the glob pattern unescaped, so
            glob metacharacters in it act as wildcards.

    Returns:
        The paths matching ``<path>/*fastqc.html``, sorted so the result
        does not depend on the order in which the filesystem lists entries.
        The list is empty when nothing matches.
    """
    # An f-string (instead of ``path + "/..."``) also accepts Path objects.
    # glob() yields matches in arbitrary order, hence the explicit sort.
    return sorted(glob(f"{path}/*fastqc.html"))
|
||||
|
||||
|
||||
def extract_adapters(files) -> DataFrame:
    """Collect the ``Sequence`` column of every report into one table.

    Args:
        files: Iterable of HTML report locations; each entry is handed to
            ``pandas.read_html`` as-is.

    Returns:
        A DataFrame with a single ``Sequence`` column holding one row per
        sequence, in the order the reports were given. Missing values are
        removed and the index is renumbered from zero. The frame is empty
        when *files* is empty.

    Raises:
        IndexError: If a report contains fewer than two HTML tables.
        KeyError: If the second table of a report has no ``Sequence`` column.
        ImportError: If no HTML parser backend usable by ``read_html`` is
            installed.
    """
    # NOTE(review): the table at index 1 is assumed to be the one listing
    # the sequences of interest -- confirm against real FastQC reports.
    sequences = [
        sequence
        for entry in files
        for sequence in read_html(entry)[1]["Sequence"]
    ]
    # Build the frame in one go: DataFrame.append was removed in pandas 2.0.
    # Keeping one row per sequence (rather than one row per report) means
    # dropna() discards only missing entries, never a whole report whose
    # table happens to be shorter than another's.
    return DataFrame({"Sequence": sequences}).dropna().reset_index(drop=True)
|
||||
|
||||
|
||||
def save_to_file(filename, adapters) -> None:
    """Write every value of *adapters* to *filename*, one value per line.

    Args:
        filename: Path of the text file to create or overwrite.
        adapters: DataFrame whose cells are written row by row, left to
            right. Both a single-column frame and a wide frame work.

    Missing cells (None/NaN) are skipped instead of producing blank lines.
    No header and no index are written.
    """
    # Flatten in row-major order so the output order matches reading the
    # frame row by row; the mask marks the cells that actually hold a value.
    values = adapters.to_numpy().ravel()
    present = adapters.notna().to_numpy().ravel()
    # Plain line-oriented output instead of to_csv(sep="\n"): a newline is
    # not a real field delimiter, and going through the CSV writer would
    # also apply CSV quoting rules to the values.
    with open(filename, "w", encoding="utf-8") as handle:
        handle.writelines(
            f"{value}\n" for value, keep in zip(values, present) if keep
        )
|
||||
|
||||
|
||||
def main(input_dir="data", output_file="placeholder.txt") -> None:
    """Run the whole pipeline: find reports, extract sequences, save them.

    Args:
        input_dir: Directory passed to ``find_html_files``. Defaults to
            ``"data"``.
        output_file: Destination passed to ``save_to_file``. Defaults to
            ``"placeholder.txt"``.
    """
    file_list = find_html_files(input_dir)
    adapters = extract_adapters(file_list)
    save_to_file(output_file, adapters)
|
||||
|
||||
|
||||
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue