2021-11-13 14:40:07 +01:00
|
|
|
from argparse import ArgumentParser
|
2021-11-13 14:21:24 +01:00
|
|
|
from glob import glob
|
2021-11-13 18:46:37 +01:00
|
|
|
from typing import List, Tuple
|
2021-12-01 18:52:51 +01:00
|
|
|
from re import sub
|
2021-11-13 14:21:24 +01:00
|
|
|
|
2021-11-13 17:44:20 +01:00
|
|
|
from pandas import DataFrame, read_html, Series
|
2021-11-13 14:21:24 +01:00
|
|
|
|
|
|
|
|
2021-12-22 18:36:32 +01:00
|
|
|
def remove_parenthesis(identifier):
    """
    Strip every round bracket from a sequence identifier

    The items of the identifier are converted to text and glued together
    without a separator before the "(" and ")" characters are removed.
    """
    pieces = [str(piece) for piece in identifier]
    return sub(r"[()]", "", "".join(pieces))
|
2021-11-13 14:21:24 +01:00
|
|
|
|
|
|
|
|
2021-11-13 18:46:37 +01:00
|
|
|
def extract_adapters(files) -> Tuple[Series, List]:
    """
    Extract the adapters sequences and statistics from the files

    Every entry of ``files`` is parsed with ``read_html``. Entries yielding
    more than one table contribute the ``Sequence`` column of their second
    table; the other entries are skipped.

    Returns the preprocessed sequences together with a two-item list holding
    the mean and the standard deviation of the sequence lengths. Both
    statistics are NaN when no sequence was found.
    """
    # Collect one row per report and build the frame once: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    sequence_rows = []
    for entry in files:
        tables = read_html(entry)
        if len(tables) > 1:
            sequence_rows.append(tables[1].Sequence)
    all_adapters = DataFrame(sequence_rows)

    processed_adapters = preprocess_dataframe(all_adapters)
    if processed_adapters.empty:
        # Nothing to measure; the .str accessor may not be usable on an
        # empty result that does not hold strings.
        return processed_adapters, [float("nan"), float("nan")]

    lengths = processed_adapters.str.len()
    return processed_adapters, [lengths.mean(), lengths.std()]
|
2021-11-13 17:44:20 +01:00
|
|
|
|
|
|
|
|
|
|
|
def preprocess_dataframe(adapters) -> Series:
    """
    Drop incomplete columns and repeated sequences

    Columns holding a missing value are discarded, the remaining cells are
    stacked into a single series and only the first occurrence of each value
    is kept.
    """
    return adapters.dropna(axis=1).stack().drop_duplicates()
|
2021-11-13 14:21:24 +01:00
|
|
|
|
|
|
|
|
|
|
|
def save_to_file(filename, adapters) -> None:
    """
    Save the adapter sequences as a FASTA file

    Each (index, value) pair of ``adapters`` becomes one record: a header
    line made of ">" followed by the index passed through
    ``remove_parenthesis``, then a line holding the value. The file is opened
    in write mode, so existing content is replaced.
    """
    with open(filename, "w") as f:
        # Series.iteritems() was removed in pandas 2.0; items() yields the
        # same (index, value) pairs on both old and new releases.
        for index, value in adapters.items():
            sequence_id = remove_parenthesis(index)
            f.write(f">{sequence_id}\n{value}\n")
|
2021-11-13 14:21:24 +01:00
|
|
|
|
|
|
|
|
2021-11-13 14:40:07 +01:00
|
|
|
def parse_arguments():
    """
    Read the two positional command-line arguments

    Returns the namespace produced by argparse, carrying the ``input`` and
    ``output`` attributes.
    """
    positionals = (
        ("input", "directory containing the fastqc reports"),
        ("output", "file where to export the sequences"),
    )
    cli = ArgumentParser()
    for name, description in positionals:
        cli.add_argument(name, help=description)
    return cli.parse_args()
|
|
|
|
|
|
|
|
|
2021-11-13 14:21:24 +01:00
|
|
|
def main():
    """
    Export the adapters found in FASTQC reports and print their length statistics

    The reports are looked up in the input directory given on the command
    line, the extracted sequences are written to the output file, and the
    mean and standard deviation of the sequence lengths are printed.
    """
    options = parse_arguments()
    report_pattern = options.input + "/*fastqc.html"
    adapters, stats = extract_adapters(glob(report_pattern))
    save_to_file(options.output, adapters)
    summary = f"Mean of sequence length: {stats[0]}, standard deviation of sequence length {stats[1]}"
    print(summary)
|
2021-11-13 14:21:24 +01:00
|
|
|
|
|
|
|
|
|
|
|
# Run the extraction only when the file is executed as a script
if __name__ == "__main__":
    main()
|