"""Extract adapter sequences from FastQC HTML reports into a FASTA file."""

from argparse import ArgumentParser
from glob import glob
from re import sub
from typing import Iterable, List, Tuple

from pandas import DataFrame, Series, read_html


def remove_parenthesis(identifier) -> str:
    """Build a flat sequence identifier without parentheses.

    Args:
        identifier: An iterable of index components (for example the
            ``(row label, column label)`` tuple yielded when iterating a
            stacked Series) or a plain string.

    Returns:
        The components converted to ``str`` and concatenated, with every
        ``(`` and ``)`` character removed.
    """
    sequence_str = "".join(map(str, identifier))
    return sub(r"[()]", "", sequence_str)


def extract_adapters(files: Iterable[str]) -> Tuple[Series, List]:
    """Extract the adapter sequences and length statistics from the files.

    Args:
        files: Paths of the HTML reports to parse.

    Returns:
        A tuple ``(adapters, stats)`` where ``adapters`` is the Series
        produced by :func:`preprocess_dataframe` and ``stats`` is
        ``[mean length, standard deviation of length]``. Both statistics
        are NaN when no sequence was found.
    """
    rows = []
    for entry in files:
        tables = read_html(entry)
        # Only reports holding a second table contribute; its "Sequence"
        # column becomes one row of the combined frame.
        if len(tables) > 1:
            rows.append(tables[1].Sequence)

    # Build the frame once from all rows: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame on every iteration.
    all_adapters = DataFrame(rows)
    processed_adapters = preprocess_dataframe(all_adapters)

    if processed_adapters.empty:
        # The .str accessor is not usable on an empty, non-string Series.
        nan = float("nan")
        return processed_adapters, [nan, nan]

    lengths = processed_adapters.str.len()
    return processed_adapters, [lengths.mean(), lengths.std()]


def preprocess_dataframe(adapters: DataFrame) -> Series:
    """Remove empty sequences and duplicates.

    Args:
        adapters: Frame with one row per report and one column per
            sequence position.

    Returns:
        A Series of unique sequences indexed by (row label, column label).
        An empty object Series is returned when nothing is left.
    """
    # NOTE(review): this drops every column containing at least one missing
    # value, so positions not present in all reports are discarded entirely.
    # Kept as-is to preserve the existing output; confirm this is intended.
    na_free_adapters = adapters.dropna(axis=1)
    if na_free_adapters.empty:
        return Series(dtype=object)
    return na_free_adapters.stack().drop_duplicates()


def save_to_file(filename: str, adapters: Series) -> None:
    """Save the adapter sequences as a FASTA file.

    Args:
        filename: Path of the file to write (overwritten if it exists).
        adapters: Series whose index labels become the record identifiers
            and whose values become the sequences.
    """
    with open(filename, "w", encoding="utf-8") as f:
        # Series.items replaces Series.iteritems, removed in pandas 2.0.
        # NOTE(review): identifiers derive from the index labels only, so
        # sequences from different reports can share the same identifier.
        for index, value in adapters.items():
            sequence_id = remove_parenthesis(index)
            f.write(f">{sequence_id}\n{value}\n")


def parse_arguments():
    """Parse the command-line arguments.

    Returns:
        The parsed namespace with the ``input`` and ``output`` attributes.
    """
    parser = ArgumentParser()
    parser.add_argument("input", help="directory containing the fastqc reports")
    parser.add_argument("output", help="file where to export the sequences")
    return parser.parse_args()


def main():
    """Extract the adapters from FASTQC reports to a FASTA file and show
    sequence length statistics.
    """
    args = parse_arguments()
    # Sorted so the output does not depend on directory listing order.
    file_list = sorted(glob(args.input + "/*fastqc.html"))
    adapters, stats = extract_adapters(file_list)
    save_to_file(args.output, adapters)
    print(
        f"Mean of sequence length: {stats[0]}, standard deviation of sequence length {stats[1]}"
    )


if __name__ == "__main__":
    main()