2021-11-13 14:21:24 +01:00
|
|
|
from glob import glob
|
2021-11-13 14:28:39 +01:00
|
|
|
from typing import List, Union
|
2021-11-13 14:21:24 +01:00
|
|
|
|
|
|
|
from pandas import DataFrame, read_html
|
|
|
|
|
|
|
|
|
|
|
|
def find_html_files(path) -> List:
    """Return the FastQC HTML reports found directly inside *path*.

    Args:
        path: Directory to search, given as a string or any object whose
            string form is a filesystem path (e.g. ``pathlib.Path``).

    Returns:
        A sorted list of path strings whose file name ends in
        ``fastqc.html``. The list is empty when nothing matches or the
        directory does not exist. The search is not recursive.
    """
    # An f-string accepts both str and Path, unlike ``path + "..."``.
    pattern = f"{path}/*fastqc.html"
    # glob yields matches in arbitrary directory order; sorting keeps the
    # downstream output reproducible between runs and machines.
    return sorted(glob(pattern))
|
|
|
|
|
|
|
|
|
2021-11-13 14:28:39 +01:00
|
|
|
def extract_adapters(files) -> Union[DataFrame, None]:
    """Collect the ``Sequence`` column from the second table of each report.

    Every file is parsed with ``pandas.read_html``; the table at index 1 is
    selected and the non-missing values of its ``Sequence`` column are
    gathered in file order.
    NOTE(review): presumably index 1 is the overrepresented-sequences table
    of a FastQC report -- confirm against real reports.

    Args:
        files: Iterable of paths (or anything ``read_html`` accepts) to
            HTML reports.

    Returns:
        A ``DataFrame`` with a single ``Sequence`` column holding one
        sequence per row. It is empty when *files* is empty. ``None`` is
        never returned, although the annotation allows it.

    Raises:
        IndexError: If a report contains fewer than two tables.
        AttributeError: If the second table has no ``Sequence`` column.
    """
    sequences = []
    for entry in files:
        tables = read_html(entry)
        # Drop missing cells per value. Reports list different numbers of
        # sequences, so padding them into one row per file and dropping rows
        # with NaN would throw away every report shorter than the longest.
        sequences.extend(tables[1].Sequence.dropna().tolist())
    # Build the frame once; DataFrame.append no longer exists in pandas >= 2.0
    # and re-copied the whole frame on every iteration in older versions.
    return DataFrame({"Sequence": sequences})
|
|
|
|
|
|
|
|
|
|
|
|
def save_to_file(filename, adapters) -> None:
    """Write every value of *adapters* to *filename*, one value per line.

    The frame is flattened row by row, so a single-column frame (one value
    per row) and a wide frame (several values per row) both produce the same
    one-value-per-line layout. Missing cells (NaN/None) are skipped rather
    than written as blank lines. Values are written verbatim, without CSV
    quoting.

    Args:
        filename: Path of the text file to create or overwrite.
        adapters: ``DataFrame`` (or ``Series``) of values to write.
    """
    values = adapters.to_numpy().ravel()
    missing = adapters.isna().to_numpy().ravel()
    # Write the lines directly instead of asking the CSV writer to use "\n"
    # as its field delimiter, which conflates field and record separators.
    with open(filename, "w", encoding="utf-8") as handle:
        handle.writelines(
            f"{value}\n" for value, skip in zip(values, missing) if not skip
        )
|
|
|
|
|
|
|
|
|
|
|
|
def main(input_dir="data", output_file="placeholder.txt"):
    """Extract sequences from the reports in *input_dir* into *output_file*.

    Args:
        input_dir: Directory searched for ``*fastqc.html`` reports.
            Defaults to ``"data"``.
        output_file: Path of the text file to write. Defaults to
            ``"placeholder.txt"``.
    """
    file_list = find_html_files(input_dir)
    adapters = extract_adapters(file_list)
    save_to_file(output_file, adapters)
|
|
|
|
|
|
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|