Show mean and standard deviation of the adapter sequence lengths
This commit is contained in:
parent
a38534fcd0
commit
776aba466c
15
parser.py
15
parser.py
|
@ -1,6 +1,6 @@
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from glob import glob
|
from glob import glob
|
||||||
from typing import List
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
from pandas import DataFrame, read_html, Series
|
from pandas import DataFrame, read_html, Series
|
||||||
|
|
||||||
|
@ -10,14 +10,18 @@ def find_html_files(path) -> List:
|
||||||
return file_list
|
return file_list
|
||||||
|
|
||||||
|
|
||||||
def extract_adapters(files) -> Series:
|
def extract_adapters(files) -> Tuple[Series, Dict]:
|
||||||
all_adapters = DataFrame()
|
all_adapters = DataFrame()
|
||||||
for entry in files:
|
for entry in files:
|
||||||
tables = read_html(entry)
|
tables = read_html(entry)
|
||||||
adapter_sequences = tables[1].Sequence
|
adapter_sequences = tables[1].Sequence
|
||||||
all_adapters = all_adapters.append(adapter_sequences)
|
all_adapters = all_adapters.append(adapter_sequences)
|
||||||
processed_adapters = preprocess_dataframe(all_adapters)
|
processed_adapters = preprocess_dataframe(all_adapters)
|
||||||
return processed_adapters
|
stats = [
|
||||||
|
processed_adapters.str.len().mean(),
|
||||||
|
processed_adapters.str.len().std(),
|
||||||
|
]
|
||||||
|
return processed_adapters, stats
|
||||||
|
|
||||||
|
|
||||||
def preprocess_dataframe(adapters) -> Series:
|
def preprocess_dataframe(adapters) -> Series:
|
||||||
|
@ -44,8 +48,11 @@ def parse_arguments():
|
||||||
def main():
|
def main():
|
||||||
args = parse_arguments()
|
args = parse_arguments()
|
||||||
file_list = find_html_files(args.input)
|
file_list = find_html_files(args.input)
|
||||||
adapters = extract_adapters(file_list)
|
adapters, stats = extract_adapters(file_list)
|
||||||
save_to_file(args.output, adapters)
|
save_to_file(args.output, adapters)
|
||||||
|
print(
|
||||||
|
f"Mean of sequence length: {stats[0]}, standard deviation of sequence length {stats[1]}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Loading…
Reference in New Issue