diff --git a/parser.py b/parser.py index 447e43a..9870580 100644 --- a/parser.py +++ b/parser.py @@ -7,11 +7,17 @@ from pandas import DataFrame, read_html, Series def remove_parenthesis(identifier): + """ + Remove parentheses from the sequence identifier + """ sequence_str = "".join(map(str, identifier)) return sub(r"[()]", "", sequence_str) def extract_adapters(files) -> Tuple[Series, List]: + """ + Extract the adapter sequences and statistics from the files + """ all_adapters = DataFrame() for entry in files: tables = read_html(entry) @@ -27,6 +33,9 @@ def extract_adapters(files) -> Tuple[Series, List]: def preprocess_dataframe(adapters) -> Series: + """ + Remove empty sequences and duplicates + """ na_free_adapters = adapters.dropna(axis=1) stacked_adapters = na_free_adapters.stack() duplicate_free_adapters = stacked_adapters.drop_duplicates() @@ -34,6 +43,9 @@ def preprocess_dataframe(adapters) -> Series: def save_to_file(filename, adapters) -> None: + """ + Save the adapter sequences as a FASTA file + """ with open(filename, "w") as f: for index, value in adapters.iteritems(): sequence_id = remove_parenthesis(index) @@ -42,6 +54,9 @@ def save_to_file(filename, adapters) -> None: def parse_arguments(): + """ + Parse the command-line arguments + """ parser = ArgumentParser() parser.add_argument("input", help="directory containing the fastqc reports") parser.add_argument("output", help="file where to export the sequences") @@ -49,6 +64,9 @@ def parse_arguments(): def main(): + """ + Extract the adapters from FASTQC reports to a FASTA file and show sequence length statistics + """ args = parse_arguments() file_list = glob(args.input + "/*fastqc.html") adapters, stats = extract_adapters(file_list)