adapter-parser/parser.py

64 lines
1.9 KiB
Python
Raw Normal View History

2021-11-13 14:40:07 +01:00
from argparse import ArgumentParser
from glob import glob
2021-11-13 18:46:37 +01:00
from typing import List, Tuple
2021-12-01 18:52:51 +01:00
from re import sub
2021-11-13 17:44:20 +01:00
from pandas import DataFrame, read_html, Series
def find_html_files(path) -> List:
    """Return the FastQC HTML reports located directly inside *path*.

    A file counts as a report when its name ends in ``fastqc.html``.
    Sub-directories are not searched.

    Args:
        path: Directory to search, as a string or ``os.PathLike``. An empty
            string means the current working directory.

    Returns:
        The matching file paths as strings, sorted so that downstream
        processing does not depend on directory listing order. The list is
        empty when nothing matches or the directory does not exist.
    """
    # Local imports: at file level this module only imports ``glob`` itself.
    from glob import escape
    from os import fspath
    from os.path import join

    # Escape the directory part so characters such as "[" or "*" in its name
    # are matched literally instead of being interpreted as glob syntax.
    pattern = join(escape(fspath(path)), "*fastqc.html")
    return sorted(glob(pattern))
2021-11-13 18:46:37 +01:00
def extract_adapters(files) -> Tuple[Series, List]:
    """Collect the sequences listed in a set of FastQC HTML reports.

    Every entry of *files* is parsed with ``pandas.read_html``. When a report
    yields more than one table, the ``Sequence`` column of the second table
    becomes one row of an intermediate frame; reports with fewer tables are
    skipped. The frame is then passed to ``preprocess_dataframe`` and length
    statistics are computed on the result.

    Args:
        files: Iterable of report locations accepted by ``read_html``.

    Returns:
        A tuple ``(adapters, stats)`` where ``adapters`` is the Series
        returned by ``preprocess_dataframe`` and ``stats`` is
        ``[mean_length, std_length]`` of the sequences in that Series.
    """
    # ``DataFrame.append`` was removed in pandas 2.0, so rows are gathered in
    # a list and concatenated once. Imported locally because the module only
    # imports DataFrame, read_html and Series at file level.
    from pandas import concat

    rows = []
    for entry in files:
        tables = read_html(entry)
        if len(tables) > 1:
            # NOTE(review): tables[1] is presumably the table that lists the
            # adapter/overrepresented sequences -- confirm against a report.
            # to_frame().T yields a single row labelled "Sequence", the same
            # shape DataFrame.append(Series) used to add.
            rows.append(tables[1].Sequence.to_frame().T)

    # Keep the original empty-frame starting point when nothing was found.
    all_adapters = concat(rows) if rows else DataFrame()

    processed_adapters = preprocess_dataframe(all_adapters)
    lengths = processed_adapters.str.len()
    stats = [lengths.mean(), lengths.std()]
    return processed_adapters, stats
2021-11-13 17:44:20 +01:00
def preprocess_dataframe(adapters) -> Series:
    """Flatten a frame of sequences into a Series of unique values.

    Columns containing any missing value are removed, the remaining cells
    are stacked into a single Series, and repeated values are discarded
    (the first occurrence is kept).

    Args:
        adapters: DataFrame whose cells hold the sequences.

    Returns:
        Series of the distinct remaining values, indexed by the stacked
        (row label, column label) pairs.
    """
    # NOTE(review): dropping along the column axis removes a whole column as
    # soon as one row lacks a value there, so rows longer than the shortest
    # one lose their trailing entries -- confirm this is intended.
    return (
        adapters
        .dropna(axis="columns")
        .stack()
        .drop_duplicates()
    )
def save_to_file(filename, adapters) -> None:
    """Write the sequences in *adapters* to *filename* in FASTA format.

    Each entry produces two lines: a header ``>ID`` followed by the sequence.
    The ID is the entry's index label with its parts concatenated and any
    parentheses removed.

    Args:
        filename: Path of the output file; it is created or overwritten.
        adapters: Series mapping index labels (tuples for a MultiIndex, or
            scalars) to sequence values.
    """
    with open(filename, "w", encoding="utf-8") as f:
        # ``Series.iteritems`` was removed in pandas 2.0; ``items`` is the
        # equivalent that exists in both old and new releases.
        for index, value in adapters.items():
            # MultiIndex labels arrive as tuples; wrap scalar labels so a
            # plain (e.g. integer) index is handled the same way.
            parts = index if isinstance(index, tuple) else (index,)
            sequence_str = "".join(map(str, parts))
            sequence_id = sub(r"[()]", "", sequence_str)
            f.write(f">{sequence_id}\n{value}\n")
2021-11-13 14:40:07 +01:00
def parse_arguments():
    """Parse the command line and return the resulting namespace.

    Two positional arguments are required: ``input`` and ``output``.

    Returns:
        The ``argparse.Namespace`` with ``input`` and ``output`` attributes.
    """
    cli = ArgumentParser()
    positionals = (
        ("input", "directory containing the fastqc reports"),
        ("output", "file where to export the sequences"),
    )
    for name, description in positionals:
        cli.add_argument(name, help=description)
    return cli.parse_args()
def main():
    """Run the command-line workflow end to end.

    Reads the command-line arguments, locates the report files in the input
    directory, extracts the sequences, writes them to the output file, and
    prints the mean and standard deviation of the sequence lengths.
    """
    options = parse_arguments()
    reports = find_html_files(options.input)
    adapters, stats = extract_adapters(reports)
    save_to_file(options.output, adapters)

    # stats is [mean, standard deviation] as returned by extract_adapters.
    summary = f"Mean of sequence length: {stats[0]}, standard deviation of sequence length {stats[1]}"
    print(summary)
# Run the workflow only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or file output.
if __name__ == "__main__":
    main()