Implement HTML parsing and output to a file

This commit is contained in:
coolneng 2021-11-13 14:21:24 +01:00
parent 36c525fffd
commit a987e662b5
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
2 changed files with 33 additions and 1 deletions

View File

@ -2,4 +2,4 @@
with pkgs;
mkShell { buildInputs = [ python39 python39Packages.beautifulsoup4 ]; }
mkShell { buildInputs = [ python39 python39Packages.pandas ]; }

32
src/parser.py Normal file
View File

@ -0,0 +1,32 @@
from glob import glob
from typing import List
from pandas import DataFrame, read_html
def find_html_files(path) -> List:
    """Return the HTML reports whose name ends in ``fastqc.html`` under *path*.

    Args:
        path: Directory to search, as a string. Only files directly
            inside it are matched; subdirectories are not traversed.

    Returns:
        The matching paths, in whatever order ``glob`` yields them. The
        list is empty when nothing matches.
    """
    pattern = path + "/*fastqc.html"
    return glob(pattern)
def extract_adapters(files) -> DataFrame:
    """Collect the ``Sequence`` column of the second table of each report.

    Every file is parsed with ``read_html``; the ``Sequence`` column of
    the table at index 1 becomes one row of the result, so the returned
    frame has one row per input file.

    Args:
        files: Iterable of paths to the HTML reports to parse.

    Returns:
        A DataFrame with one row per file and one column per sequence
        position, with rows containing missing values removed. The frame
        is empty when *files* is empty.

    Raises:
        IndexError: If a report contains fewer than two tables.
        AttributeError: If the second table has no ``Sequence`` column.
    """
    per_file_sequences = [read_html(entry)[1].Sequence for entry in files]
    # Build the frame in one step: ``DataFrame.append`` no longer exists in
    # pandas 2.x, and calling it in a loop copied the whole frame each time.
    all_adapters = DataFrame(per_file_sequences)
    # NOTE(review): rows are padded with NaN up to the longest table, so
    # dropna() discards every file that lists fewer sequences than the
    # longest one -- confirm that this is the intended filtering.
    return all_adapters.dropna()
def save_to_file(filename, adapters) -> None:
    """Write every value of *adapters* to *filename*, one value per line.

    Values are emitted row by row, left to right, without index or
    header.

    Args:
        filename: Destination passed on to ``DataFrame.to_csv``.
        adapters: DataFrame holding the values to write.
    """
    # Flatten to a single column instead of passing sep="\n": this gives the
    # same one-value-per-line layout without depending on the csv writer
    # accepting a newline as its field delimiter.
    one_per_line = DataFrame(adapters.to_numpy().ravel())
    one_per_line.to_csv(filename, index=False, header=False)
def main(input_dir="data", output_file="placeholder.txt"):
    """Extract the sequences from every report in a directory and save them.

    Args:
        input_dir: Directory searched for reports by ``find_html_files``.
            Defaults to ``"data"``.
        output_file: File the sequences are written to by
            ``save_to_file``. Defaults to ``"placeholder.txt"``.
    """
    reports = find_html_files(input_dir)
    adapters = extract_adapters(reports)
    save_to_file(output_file, adapters)
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()