Compare commits

..

15 Commits

4 changed files with 142 additions and 1 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*.html
data/multiqc_data

View File

@ -0,0 +1,59 @@
* Adapter parser
This tool parses FastQC reports to extract the adapter sequences from the *Overrepresented sequences* section and writes them to a FASTA file. It also prints the mean and standard deviation of the sequence lengths.
** Technologies
- Pandas
** Installation
This project uses [[https://nixos.org][Nix]] to ensure reproducible
builds.
1. Install Nix (compatible with MacOS, Linux and
[[https://docs.microsoft.com/en-us/windows/wsl/about][WSL]]):
#+begin_src bash
curl -L https://nixos.org/nix/install | sh
#+end_src
2. Clone the repository:
#+begin_src bash
git clone https://git.coolneng.duckdns.org/coolneng/adapter-parser
#+end_src
3. Change the working directory to the project:
#+begin_src bash
cd adapter-parser
#+end_src
4. Enter the nix-shell:
#+begin_src bash
nix-shell
#+end_src
After running these commands, you will find yourself in a shell that
contains all the needed dependencies.
** Usage
The program expects two arguments: a directory containing the FastQC reports, and the path of the output file in which the sequences are stored in FASTA format.
#+begin_src bash
python src/parser.py <input> <output>
#+end_src
#+RESULTS:
:
: usage: parser.py [-h] input output
:
: positional arguments:
: input directory containing the fastqc reports
: output file where to export the sequences
:
: optional arguments:
: -h, --help show this help message and exit

80
parser.py Normal file
View File

@ -0,0 +1,80 @@
from argparse import ArgumentParser
from glob import glob
from typing import List, Tuple
from re import sub
from pandas import DataFrame, read_html, Series
def remove_parenthesis(identifier):
    """
    Turn a sequence identifier into a string without parenthesis

    Every item of the identifier is converted to text, the pieces are
    concatenated without a separator and all "(" and ")" characters are
    stripped from the result.
    """
    pieces = [str(part) for part in identifier]
    joined = "".join(pieces)
    return sub(r"[()]", "", joined)
def extract_adapters(files) -> Tuple[Series, List]:
    """
    Extract the adapters sequences and statistics from the files

    Every entry of files is parsed with read_html. Entries that yield a
    single table are skipped; for the others, the Sequence column of the
    second table is collected. The collected columns are then cleaned with
    preprocess_dataframe.

    Returns a tuple with the processed sequences and a two-item list holding
    the mean and the standard deviation of the sequence lengths. Both
    statistics are NaN when no sequence could be extracted.
    """
    collected = []
    for entry in files:
        tables = read_html(entry)
        # NOTE(review): presumably the second table of a report is the
        # overrepresented sequences one - confirm against the FastQC layout
        if len(tables) > 1:
            collected.append(tables[1].Sequence)
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every iteration. Building the frame once from the list of Series gives
    # the same layout: one row per report, labelled with the Series name.
    all_adapters = DataFrame(collected)
    processed_adapters = preprocess_dataframe(all_adapters)
    if processed_adapters.empty:
        # The .str accessor is not available on an empty non-string Series
        return processed_adapters, [float("nan"), float("nan")]
    lengths = processed_adapters.str.len()
    stats = [lengths.mean(), lengths.std()]
    return processed_adapters, stats
def preprocess_dataframe(adapters: DataFrame) -> Series:
    """
    Remove empty sequences and duplicates

    The dataframe is flattened into a single Series indexed by
    (row label, column label). Missing cells are discarded one by one, and
    only the first occurrence of each repeated sequence is kept.
    """
    stacked_adapters = adapters.stack()
    # Drop the missing cells after stacking. Dropping whole columns beforehand
    # (dropna(axis=1)) also discarded valid sequences whenever another row
    # had fewer entries. The explicit dropna also covers pandas versions
    # whose stack() keeps missing values.
    na_free_adapters = stacked_adapters.dropna()
    return na_free_adapters.drop_duplicates()
def save_to_file(filename, adapters) -> None:
    """
    Save the adapter sequences as a FASTA file

    Each (index, value) pair of adapters becomes one FASTA record: a header
    line made of ">" followed by the index without parenthesis, then a line
    with the sequence itself. An existing file is overwritten.
    """
    with open(filename, "w") as f:
        # Series.iteritems was removed in pandas 2.0; items() yields the same
        # (index, value) pairs and exists in every pandas version
        for index, value in adapters.items():
            sequence_id = remove_parenthesis(index)
            fasta_entry = f">{sequence_id}\n{value}\n"
            f.write(fasta_entry)
def parse_arguments():
    """
    Build the command-line interface and parse the arguments given to it

    Two positional arguments are declared, input and output, and the parsed
    namespace is returned.
    """
    positional_arguments = (
        ("input", "directory containing the fastqc reports"),
        ("output", "file where to export the sequences"),
    )
    cli = ArgumentParser()
    for name, description in positional_arguments:
        cli.add_argument(name, help=description)
    return cli.parse_args()
def main():
    """
    Run the whole pipeline: gather the FASTQC reports of the input directory,
    export their adapters to a FASTA file and print the mean and standard
    deviation of the sequence length
    """
    cli_args = parse_arguments()
    # Only the files of the input directory whose name ends in fastqc.html
    report_pattern = f"{cli_args.input}/*fastqc.html"
    reports = glob(report_pattern)
    adapters, stats = extract_adapters(reports)
    save_to_file(cli_args.output, adapters)
    length_mean, length_std = stats[0], stats[1]
    summary = f"Mean of sequence length: {length_mean}, standard deviation of sequence length {length_std}"
    print(summary)


if __name__ == "__main__":
    main()

View File

@ -2,4 +2,4 @@
with pkgs;
mkShell { buildInputs = [ python39 python39Packages.beautifulsoup4 ]; }
mkShell { buildInputs = [ python39 python39Packages.pandas ]; }