From 776aba466c2d941c8756f48ac2ef9ffa46c736c4 Mon Sep 17 00:00:00 2001
From: coolneng
Date: Sat, 13 Nov 2021 18:35:59 +0100
Subject: [PATCH] Show mean and standard deviation of the lengths

---
Review notes (text in this section is ignored when the patch is applied):
- extract_adapters now returns (adapters, stats), where stats is a
  two-element list: [mean of sequence lengths, standard deviation of
  sequence lengths]. main() unpacks the pair and prints both values.
- The return annotation was corrected from Tuple[Series, Dict] to
  Tuple[Series, List], because the function builds and returns a list.
  Dict is therefore no longer imported from typing.
- The post-image blob id in the index line below predates this
  correction.

 parser.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/parser.py b/parser.py
index 0c4ec9d..6770128 100644
--- a/parser.py
+++ b/parser.py
@@ -1,6 +1,6 @@
 from argparse import ArgumentParser
 from glob import glob
-from typing import List
+from typing import List, Tuple
 
 from pandas import DataFrame, read_html, Series
 
@@ -10,14 +10,18 @@ def find_html_files(path) -> List:
     return file_list
 
 
-def extract_adapters(files) -> Series:
+def extract_adapters(files) -> Tuple[Series, List]:
     all_adapters = DataFrame()
     for entry in files:
         tables = read_html(entry)
         adapter_sequences = tables[1].Sequence
         all_adapters = all_adapters.append(adapter_sequences)
     processed_adapters = preprocess_dataframe(all_adapters)
-    return processed_adapters
+    stats = [
+        processed_adapters.str.len().mean(),
+        processed_adapters.str.len().std(),
+    ]
+    return processed_adapters, stats
 
 
 def preprocess_dataframe(adapters) -> Series:
@@ -44,8 +48,11 @@ def parse_arguments():
 def main():
     args = parse_arguments()
     file_list = find_html_files(args.input)
-    adapters = extract_adapters(file_list)
+    adapters, stats = extract_adapters(file_list)
     save_to_file(args.output, adapters)
+    print(
+        f"Mean of sequence length: {stats[0]}, standard deviation of sequence length {stats[1]}"
+    )
 
 
 if __name__ == "__main__":