Compare commits
No commits in common. "master" and "3cc8265a56a6af2988c42a04587000c7953d7ff5" have entirely different histories.
master
...
3cc8265a56
|
@ -1,2 +0,0 @@
|
||||||
*.html
|
|
||||||
data/multiqc_data
|
|
59
README.org
59
README.org
|
@ -1,59 +0,0 @@
|
||||||
* Adapter parser
|
|
||||||
|
|
||||||
This tool parses FastQC reports, extracts the adapter sequences from the *Overrepresented sequences* table and writes them to a text file. It also reports the mean and standard deviation of the sequence lengths.
|
|
||||||
|
|
||||||
** Technologies
|
|
||||||
|
|
||||||
- Pandas
|
|
||||||
|
|
||||||
** Installation
|
|
||||||
|
|
||||||
This project uses [[https://nixos.org][Nix]] to ensure reproducible
|
|
||||||
builds.
|
|
||||||
|
|
||||||
1. Install Nix (compatible with MacOS, Linux and
|
|
||||||
[[https://docs.microsoft.com/en-us/windows/wsl/about][WSL]]):
|
|
||||||
|
|
||||||
#+begin_src bash
|
|
||||||
curl -L https://nixos.org/nix/install | sh
|
|
||||||
#+end_src
|
|
||||||
|
|
||||||
2. Clone the repository:
|
|
||||||
|
|
||||||
#+begin_src bash
|
|
||||||
git clone https://git.coolneng.duckdns.org/coolneng/adapter-parser
|
|
||||||
#+end_src
|
|
||||||
|
|
||||||
3. Change the working directory to the project:
|
|
||||||
|
|
||||||
#+begin_src bash
|
|
||||||
cd adapter-parser
|
|
||||||
#+end_src
|
|
||||||
|
|
||||||
4. Enter the nix-shell:
|
|
||||||
|
|
||||||
#+begin_src bash
|
|
||||||
nix-shell
|
|
||||||
#+end_src
|
|
||||||
|
|
||||||
After running these commands, you will find yourself in a shell that
|
|
||||||
contains all the needed dependencies.
|
|
||||||
|
|
||||||
** Usage
|
|
||||||
|
|
||||||
The program expects a folder containing the FastQC reports as input and the path of an output file in which the sequences will be stored in FASTA format.
|
|
||||||
|
|
||||||
#+begin_src bash
|
|
||||||
python src/parser.py <input> <output>
|
|
||||||
#+end_src
|
|
||||||
|
|
||||||
#+RESULTS:
|
|
||||||
:
|
|
||||||
: usage: parser.py [-h] input output
|
|
||||||
:
|
|
||||||
: positional arguments:
|
|
||||||
: input directory containing the fastqc reports
|
|
||||||
: output file where to export the sequences
|
|
||||||
:
|
|
||||||
: optional arguments:
|
|
||||||
: -h, --help show this help message and exit
|
|
80
parser.py
80
parser.py
|
@ -1,80 +0,0 @@
|
||||||
from argparse import ArgumentParser
|
|
||||||
from glob import glob
|
|
||||||
from typing import List, Tuple
|
|
||||||
from re import sub
|
|
||||||
|
|
||||||
from pandas import DataFrame, read_html, Series
|
|
||||||
|
|
||||||
|
|
||||||
def remove_parenthesis(identifier):
    """
    Build a sequence identifier that contains no parentheses

    Every element of identifier is converted to a string, the pieces are
    concatenated and every "(" or ")" character is stripped from the result.
    """
    joined = "".join(str(part) for part in identifier)
    return sub(r"[()]", "", joined)
|
|
||||||
|
|
||||||
|
|
||||||
def extract_adapters(files) -> Tuple[Series, List]:
    """
    Extract the adapters sequences and statistics from the files

    Each entry of files is read with read_html. When a report yields more
    than one table, the Sequence column of the second table is kept as one
    row of the collected adapters. The collected adapters are then cleaned
    with preprocess_dataframe.

    Returns a tuple holding the processed adapters and a two-element list
    with the mean and the standard deviation of the sequence lengths.
    """
    # DataFrame.append was removed in pandas 2.0. Collecting the Series and
    # building the frame once yields the same one-row-per-report layout.
    collected_rows = []
    for entry in files:
        tables = read_html(entry)
        if len(tables) > 1:
            collected_rows.append(tables[1].Sequence)
    all_adapters = DataFrame(collected_rows)
    processed_adapters = preprocess_dataframe(all_adapters)
    sequence_lengths = processed_adapters.str.len()
    stats = [
        sequence_lengths.mean(),
        sequence_lengths.std(),
    ]
    return processed_adapters, stats
|
|
||||||
|
|
||||||
|
|
||||||
def preprocess_dataframe(adapters) -> Series:
    """
    Remove empty sequences and duplicates

    The frame is flattened into a single Series, missing cells are dropped
    and only the first occurrence of each sequence is kept.
    """
    # Flatten before removing missing values: dropping every column that
    # contains a missing cell would also discard the valid sequences stored
    # in that column by the other rows.
    stacked_adapters = adapters.stack()
    # Explicit dropna, so the result does not depend on whether stack()
    # itself discards missing cells.
    na_free_adapters = stacked_adapters.dropna()
    duplicate_free_adapters = na_free_adapters.drop_duplicates()
    return duplicate_free_adapters
|
|
||||||
|
|
||||||
|
|
||||||
def save_to_file(filename, adapters) -> None:
    """
    Save the adapter sequences as a FASTA file

    Every (index, value) pair of adapters is written as a two-line entry:
    a ">" header built from the index with remove_parenthesis, followed by
    the sequence itself. The file at filename is overwritten.
    """
    with open(filename, "w") as f:
        # Series.iteritems was removed in pandas 2.0; Series.items yields the
        # same (index, value) pairs.
        for index, value in adapters.items():
            sequence_id = remove_parenthesis(index)
            fasta_entry = f">{sequence_id}\n{value}\n"
            f.write(fasta_entry)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_arguments():
    """
    Read the two positional command-line arguments

    Returns the namespace produced by ArgumentParser, exposing the input
    and output attributes.
    """
    positionals = (
        ("input", "directory containing the fastqc reports"),
        ("output", "file where to export the sequences"),
    )
    cli = ArgumentParser()
    for name, description in positionals:
        cli.add_argument(name, help=description)
    return cli.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """
    Export the adapters found in FASTQC reports and print length statistics

    The reports are the files matching "*fastqc.html" inside the input
    directory; the sequences are written to the output file and the mean and
    standard deviation of their lengths are printed.
    """
    args = parse_arguments()
    report_pattern = args.input + "/*fastqc.html"
    adapters, stats = extract_adapters(glob(report_pattern))
    save_to_file(args.output, adapters)
    summary = f"Mean of sequence length: {stats[0]}, standard deviation of sequence length {stats[1]}"
    print(summary)
|
|
||||||
|
|
||||||
|
|
||||||
# Run the command-line entry point only when the file is executed as a script
if __name__ == "__main__":
    main()
|
|
Loading…
Reference in New Issue