adapter-parser/src/parser.py

42 lines
1.1 KiB
Python
Raw Normal View History

2021-11-13 14:40:07 +01:00
from argparse import ArgumentParser
from glob import glob
2021-11-13 14:28:39 +01:00
from typing import List, Union
from pandas import DataFrame, read_html
def find_html_files(path) -> List[str]:
    """Return the FastQC HTML reports located directly inside *path*.

    Args:
        path: Directory to search, as a string or ``os.PathLike``. Only
            files whose names end in ``fastqc.html`` are matched;
            subdirectories are not searched.

    Returns:
        The matching file paths as strings, sorted so the order is
        reproducible between runs. Empty when nothing matches.
    """
    # Imported locally: the module header only imports ``glob`` itself.
    from glob import escape
    from os import fspath
    from os.path import join

    # Escape the directory part so characters such as "[" or "*" in its name
    # are taken literally; only the file-name part is a wildcard pattern.
    pattern = join(escape(fspath(path)), "*fastqc.html")
    return sorted(glob(pattern))
2021-11-13 14:28:39 +01:00
def extract_adapters(files) -> DataFrame:
    """Collect the ``Sequence`` column of the second table of each report.

    Every file is parsed with ``pandas.read_html``; the table at index 1 is
    used (presumably the overrepresented-sequences table of a FastQC
    report -- confirm against real reports).

    Args:
        files: Iterable of paths to HTML reports.

    Returns:
        A DataFrame with a single ``Sequence`` column holding one row per
        sequence, in the order the reports were given. Missing values are
        dropped. The frame is empty when no sequence was found.

    Raises:
        ValueError: Propagated from ``read_html`` when a file contains no
            table at all.
    """
    sequences = []
    for entry in files:
        tables = read_html(entry)
        # Skip reports that have no second table or no "Sequence" column
        # instead of failing the whole run on one such report.
        if len(tables) < 2 or "Sequence" not in tables[1].columns:
            continue
        # Drop missing values per report; accumulating whole reports as rows
        # and calling dropna() afterwards would discard every sequence of a
        # report that lists fewer entries than the longest one.
        sequences.extend(tables[1]["Sequence"].dropna().tolist())
    # Built in one step: DataFrame.append is no longer available in
    # current pandas releases.
    return DataFrame({"Sequence": sequences})
def save_to_file(filename, adapters) -> None:
    """Write every value of *adapters* to *filename*, one per line.

    Args:
        filename: Path of the text file to create or overwrite.
        adapters: DataFrame of sequences. Values are written row by row,
            left to right, so both a single-column frame and a frame with
            one row per report produce a flat list. Missing values are
            written as empty lines.
    """
    # Flatten in row-major order. Writing the lines directly avoids relying
    # on the CSV writer accepting a newline as its field delimiter and keeps
    # CSV quoting out of the output.
    values = adapters.fillna("").to_numpy().ravel()
    with open(filename, "w", encoding="utf-8") as handle:
        handle.writelines(f"{value}\n" for value in values)
2021-11-13 14:40:07 +01:00
def parse_arguments(argv=None):
    """Parse the command-line arguments of the script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) lets argparse read
            ``sys.argv[1:]``, which is the previous behaviour.

    Returns:
        The parsed namespace with the attributes ``input`` (directory
        containing the reports) and ``output`` (file to write).
    """
    parser = ArgumentParser(
        description="Export the sequences found in FastQC HTML reports.",
    )
    parser.add_argument("input", help="directory containing the fastqc reports")
    parser.add_argument("output", help="file where to export the sequences")
    return parser.parse_args(argv)
def main():
    """Run the script: locate the reports, extract sequences, save them."""
    options = parse_arguments()
    # Reports are looked up in the input directory given on the command line.
    reports = find_html_files(options.input)
    save_to_file(options.output, extract_adapters(reports))
# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()