"""Collect the sequences listed in FastQC HTML reports into a text file.

Usage: ``script.py INPUT_DIR OUTPUT_FILE``. Every ``*fastqc.html`` file
directly inside ``INPUT_DIR`` is parsed and the values of the ``Sequence``
column of its second HTML table are written to ``OUTPUT_FILE``, one per line.
"""

import os
from argparse import ArgumentParser
from glob import escape, glob
from typing import List, Union

from pandas import DataFrame, concat, notna, read_html


def find_html_files(path) -> List:
    """Return the ``*fastqc.html`` files found directly inside *path*.

    Args:
        path: Directory to search (not searched recursively).

    Returns:
        A sorted list of matching file paths; empty when nothing matches.
    """
    # Escape the directory part so characters such as "[" or "*" in its name
    # are matched literally instead of being interpreted as glob syntax.
    pattern = os.path.join(escape(path), "*fastqc.html")
    # Sorting makes the output order independent of directory listing order.
    return sorted(glob(pattern))


def extract_adapters(files) -> Union[DataFrame, None]:
    """Gather the ``Sequence`` column of the second table of every report.

    Args:
        files: Iterable of paths to HTML reports readable by
            ``pandas.read_html``.

    Returns:
        A DataFrame with a single ``Sequence`` column holding one sequence
        per row, in the order the reports were given. Missing values are
        removed. The frame is empty when no report provides such a column;
        ``None`` is never returned (the annotation is kept for
        compatibility).
    """
    collected = []
    for entry in files:
        tables = read_html(entry)
        # NOTE(review): the second table is presumably FastQC's
        # "Overrepresented sequences" table - confirm. Reports that lack it
        # are skipped instead of aborting the whole run.
        if len(tables) < 2 or "Sequence" not in tables[1].columns:
            continue
        collected.append(tables[1]["Sequence"])

    if not collected:
        return DataFrame(columns=["Sequence"])

    # Stack the sequences vertically. The previous implementation appended
    # each report as one wide row (via the removed ``DataFrame.append``), so
    # ``dropna`` discarded every report shorter than the longest one.
    sequences = concat(collected, ignore_index=True).dropna()
    return sequences.reset_index(drop=True).to_frame(name="Sequence")


def save_to_file(filename, adapters) -> None:
    """Write every value of *adapters* to *filename*, one value per line.

    Values are written row by row, left to right, without index or header.
    Missing values (NaN/None) are skipped.

    Args:
        filename: Path of the text file to create or overwrite.
        adapters: DataFrame holding the sequences.
    """
    # Lines are written directly rather than through ``to_csv(sep="\n")``,
    # which depends on the CSV writer accepting a newline as its delimiter.
    with open(filename, "w", encoding="utf-8") as handle:
        for row in adapters.itertuples(index=False, name=None):
            handle.writelines(f"{value}\n" for value in row if notna(value))


def parse_arguments():
    """Parse the command line.

    Returns:
        The ``argparse`` namespace with the attributes ``input`` and
        ``output``.
    """
    parser = ArgumentParser()
    parser.add_argument("input", help="directory containing the fastqc reports")
    parser.add_argument("output", help="file where to export the sequences")
    return parser.parse_args()


def main():
    """Find the reports, extract their sequences and save them to a file."""
    args = parse_arguments()
    file_list = find_html_files(args.input)
    adapters = extract_adapters(file_list)
    save_to_file(args.output, adapters)


if __name__ == "__main__":
    main()