Show mean and standard deviation of the lengths

This commit is contained in:
coolneng 2021-11-13 18:35:59 +01:00
parent a38534fcd0
commit 776aba466c
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
1 changed file with 11 additions and 4 deletions

View File

@@ -1,6 +1,6 @@
from argparse import ArgumentParser
from glob import glob
from typing import List
from typing import Dict, List, Tuple
from pandas import DataFrame, read_html, Series
@@ -10,14 +10,18 @@ def find_html_files(path) -> List:
return file_list
def extract_adapters(files) -> Series:
def extract_adapters(files) -> Tuple[Series, Dict]:
all_adapters = DataFrame()
for entry in files:
tables = read_html(entry)
adapter_sequences = tables[1].Sequence
all_adapters = all_adapters.append(adapter_sequences)
processed_adapters = preprocess_dataframe(all_adapters)
return processed_adapters
stats = [
processed_adapters.str.len().mean(),
processed_adapters.str.len().std(),
]
return processed_adapters, stats
def preprocess_dataframe(adapters) -> Series:
@@ -44,8 +48,11 @@ def parse_arguments():
def main():
args = parse_arguments()
file_list = find_html_files(args.input)
adapters = extract_adapters(file_list)
adapters, stats = extract_adapters(file_list)
save_to_file(args.output, adapters)
print(
f"Mean of sequence length: {stats[0]}, standard deviation of sequence length {stats[1]}"
)
if __name__ == "__main__":