Show mean and standard deviation of the lengths

This commit is contained in:
coolneng 2021-11-13 18:35:59 +01:00
parent a38534fcd0
commit 776aba466c
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
1 changed file with 11 additions and 4 deletions

View File

@@ -1,6 +1,6 @@
from argparse import ArgumentParser from argparse import ArgumentParser
from glob import glob from glob import glob
from typing import List from typing import Dict, List, Tuple
from pandas import DataFrame, read_html, Series from pandas import DataFrame, read_html, Series
@@ -10,14 +10,18 @@ def find_html_files(path) -> List:
return file_list return file_list
def extract_adapters(files) -> Series: def extract_adapters(files) -> Tuple[Series, Dict]:
all_adapters = DataFrame() all_adapters = DataFrame()
for entry in files: for entry in files:
tables = read_html(entry) tables = read_html(entry)
adapter_sequences = tables[1].Sequence adapter_sequences = tables[1].Sequence
all_adapters = all_adapters.append(adapter_sequences) all_adapters = all_adapters.append(adapter_sequences)
processed_adapters = preprocess_dataframe(all_adapters) processed_adapters = preprocess_dataframe(all_adapters)
return processed_adapters stats = [
processed_adapters.str.len().mean(),
processed_adapters.str.len().std(),
]
return processed_adapters, stats
def preprocess_dataframe(adapters) -> Series: def preprocess_dataframe(adapters) -> Series:
@@ -44,8 +48,11 @@ def parse_arguments():
def main(): def main():
args = parse_arguments() args = parse_arguments()
file_list = find_html_files(args.input) file_list = find_html_files(args.input)
adapters = extract_adapters(file_list) adapters, stats = extract_adapters(file_list)
save_to_file(args.output, adapters) save_to_file(args.output, adapters)
print(
f"Mean of sequence length: {stats[0]}, standard deviation of sequence length {stats[1]}"
)
if __name__ == "__main__": if __name__ == "__main__":