Compare commits
15 Commits
3cc8265a56
...
master
Author | SHA1 | Date |
---|---|---|
coolneng | 93a403182b | |
coolneng | 7ed975c7ce | |
coolneng | fa23b1a950 | |
coolneng | e826d6f92b | |
coolneng | 52eaee4568 | |
coolneng | e59683b925 | |
coolneng | 4f2dfc68d3 | |
coolneng | 776aba466c | |
coolneng | a38534fcd0 | |
coolneng | af412da081 | |
coolneng | 0ed071c69c | |
coolneng | 02a561b4f6 | |
coolneng | d3f7677423 | |
coolneng | a987e662b5 | |
coolneng | 36c525fffd |
|
@ -0,0 +1,2 @@
|
||||||
|
*.html
|
||||||
|
data/multiqc_data
|
59
README.org
59
README.org
|
@ -0,0 +1,59 @@
|
||||||
|
* Adapter parser
|
||||||
|
|
||||||
|
This tool parses FastQC reports to extract the adapter sequences from the *Overrepresented sequences* section and writes them to a FASTA file. It also prints the mean and standard deviation of the sequence lengths.
|
||||||
|
|
||||||
|
** Technologies
|
||||||
|
|
||||||
|
- Pandas
|
||||||
|
|
||||||
|
** Installation
|
||||||
|
|
||||||
|
This project uses [[https://nixos.org][Nix]] to ensure reproducible
|
||||||
|
builds.
|
||||||
|
|
||||||
|
1. Install Nix (compatible with macOS, Linux and
|
||||||
|
[[https://docs.microsoft.com/en-us/windows/wsl/about][WSL]]):
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
curl -L https://nixos.org/nix/install | sh
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
2. Clone the repository:
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
git clone https://git.coolneng.duckdns.org/coolneng/adapter-parser
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
3. Change the working directory to the project:
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
cd adapter-parser
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
4. Enter the nix-shell:
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
nix-shell
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
After running these commands, you will find yourself in a shell that
|
||||||
|
contains all the needed dependencies.
|
||||||
|
|
||||||
|
** Usage
|
||||||
|
|
||||||
|
The program expects two arguments: a folder containing the FastQC reports as input, and an output file in which to store the sequences in FASTA format.
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
python src/parser.py <input> <output>
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
:
|
||||||
|
: usage: parser.py [-h] input output
|
||||||
|
:
|
||||||
|
: positional arguments:
|
||||||
|
: input directory containing the fastqc reports
|
||||||
|
: output file where to export the sequences
|
||||||
|
:
|
||||||
|
: optional arguments:
|
||||||
|
: -h, --help show this help message and exit
|
|
@ -0,0 +1,80 @@
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from glob import glob
|
||||||
|
from typing import List, Tuple
|
||||||
|
from re import sub
|
||||||
|
|
||||||
|
from pandas import DataFrame, read_html, Series
|
||||||
|
|
||||||
|
|
||||||
|
def remove_parenthesis(identifier):
    """
    Build a plain sequence identifier without parentheses

    Every element of identifier is converted to a string and the pieces are
    concatenated; any "(" or ")" characters are then stripped from the result
    """
    joined_parts = "".join(str(part) for part in identifier)
    return sub(r"[()]", "", joined_parts)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_adapters(files) -> Tuple[Series, List]:
    """
    Extract the adapters sequences and statistics from the files

    Each entry of files is parsed with read_html; when a report contains more
    than one table, the Sequence column of the second table is collected.
    Returns the preprocessed sequences together with a two-element list that
    holds the mean and the standard deviation of their lengths (both NaN when
    no sequence was found)
    """
    # DataFrame.append was removed in pandas 2.0, so the per-report columns
    # are gathered in a list and turned into a DataFrame in a single step.
    # Each Series becomes one row, which is the layout append used to build
    collected_sequences = []
    for entry in files:
        tables = read_html(entry)
        if len(tables) > 1:
            collected_sequences.append(tables[1].Sequence)
    all_adapters = DataFrame(collected_sequences)
    processed_adapters = preprocess_dataframe(all_adapters)
    if processed_adapters.empty:
        # The .str accessor may be unavailable on an empty result, depending
        # on its dtype, so the statistics are reported as NaN directly
        stats = [float("nan"), float("nan")]
    else:
        # Compute the lengths once and reuse them for both statistics
        sequence_lengths = processed_adapters.str.len()
        stats = [sequence_lengths.mean(), sequence_lengths.std()]
    return processed_adapters, stats
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_dataframe(adapters: DataFrame) -> Series:
    """
    Remove empty sequences and duplicates

    The frame is flattened into a single Series before the missing entries are
    discarded, so only the empty cells are removed. Dropping NaN column-wise
    beforehand would throw away every column with at least one missing value,
    losing valid sequences whenever the rows have different lengths
    """
    stacked_adapters = adapters.stack()
    # Explicit dropna: newer pandas stack implementations keep missing values
    na_free_adapters = stacked_adapters.dropna()
    duplicate_free_adapters = na_free_adapters.drop_duplicates()
    return duplicate_free_adapters
|
||||||
|
|
||||||
|
|
||||||
|
def save_to_file(filename, adapters) -> None:
    """
    Save the adapter sequences as a FASTA file

    Each entry of adapters is written as a header line (">" followed by the
    index label with the parentheses removed) and a line with the sequence
    """
    with open(filename, "w") as f:
        # Series.iteritems was removed in pandas 2.0; items is the equivalent
        for index, value in adapters.items():
            sequence_id = remove_parenthesis(index)
            fasta_entry = f">{sequence_id}\n{value}\n"
            f.write(fasta_entry)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_arguments():
    """
    Read the input directory and the output file from the command line
    """
    cli = ArgumentParser()
    positional_arguments = (
        ("input", "directory containing the fastqc reports"),
        ("output", "file where to export the sequences"),
    )
    for name, description in positional_arguments:
        cli.add_argument(name, help=description)
    return cli.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Collect the adapters of every FASTQC report found in the input directory,
    write them to the output FASTA file and print the sequence length statistics
    """
    options = parse_arguments()
    report_paths = glob(options.input + "/*fastqc.html")
    adapters, stats = extract_adapters(report_paths)
    save_to_file(options.output, adapters)
    summary = f"Mean of sequence length: {stats[0]}, standard deviation of sequence length {stats[1]}"
    print(summary)


if __name__ == "__main__":
    # Only run the program when the file is executed as a script
    main()
|
Loading…
Reference in New Issue