diff --git a/parser.py b/parser.py
index 0c4ec9d..6770128 100644
--- a/parser.py
+++ b/parser.py
@@ -1,6 +1,6 @@
 from argparse import ArgumentParser
 from glob import glob
-from typing import List
+from typing import List, Tuple
 
 from pandas import DataFrame, read_html, Series
 
@@ -10,14 +10,18 @@ def find_html_files(path) -> List:
     return file_list
 
 
-def extract_adapters(files) -> Series:
+def extract_adapters(files) -> Tuple[Series, List[float]]:
     all_adapters = DataFrame()
     for entry in files:
         tables = read_html(entry)
         adapter_sequences = tables[1].Sequence
         all_adapters = all_adapters.append(adapter_sequences)
     processed_adapters = preprocess_dataframe(all_adapters)
-    return processed_adapters
+    stats = [
+        processed_adapters.str.len().mean(),
+        processed_adapters.str.len().std(),
+    ]
+    return processed_adapters, stats
 
 
 def preprocess_dataframe(adapters) -> Series:
@@ -44,8 +48,11 @@ def parse_arguments():
 def main():
     args = parse_arguments()
     file_list = find_html_files(args.input)
-    adapters = extract_adapters(file_list)
+    adapters, stats = extract_adapters(file_list)
     save_to_file(args.output, adapters)
+    print(
+        f"Mean of sequence length: {stats[0]}, standard deviation of sequence length {stats[1]}"
+    )
 
 
 if __name__ == "__main__":