4 changed files with 1 additions and 142 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +0,0 @@
 *.html
 data/multiqc_data
--- a/README.org
+++ b/README.org
@ -1,59 +0,0 @@
 * Adapter parser
 This tool parses FastQC reports to extract the adapter sequences from the *Overrepresented sequences* table and writes them to a text file in FASTA format. It also prints the mean and standard deviation of the sequence lengths.
 ** Technologies
 - Pandas
 ** Installation
 This project uses [[https://nixos.org][Nix]] to ensure reproducible
 builds.
 1.  Install Nix (compatible with MacOS, Linux and
    [[https://docs.microsoft.com/en-us/windows/wsl/about][WSL]]):
 #+begin_src bash
 curl -L https://nixos.org/nix/install | sh
 #+end_src
 2.  Clone the repository:
 #+begin_src bash
 git clone https://git.coolneng.duckdns.org/coolneng/adapter-parser
 #+end_src
 3.  Change the working directory to the project:
 #+begin_src bash
 cd adapter-parser
 #+end_src
 4.  Enter the nix-shell:
 #+begin_src bash
 nix-shell
 #+end_src
 After running these commands, you will find yourself in a shell that
 contains all the needed dependencies.
 ** Usage
 The program expects two arguments: a folder containing the FastQC reports as input, and the path of the output file in which to store the sequences in FASTA format.
 #+begin_src bash
 python src/parser.py <input> <output>
 #+end_src
 #+RESULTS:
 :
 : usage: parser.py [-h] input output
 :
 : positional arguments:
 :   input       directory containing the fastqc reports
 :   output      file where to export the sequences
 :
 : optional arguments:
 :   -h, --help  show this help message and exit
--- a/parser.py
+++ b/parser.py
@ -1,80 +0,0 @@
 from argparse import ArgumentParser
 from glob import glob
 from typing import List, Tuple
 from re import sub
 from pandas import DataFrame, read_html, Series
 def remove_parenthesis(identifier):
    """
    Build a plain identifier string without any round brackets

    Every element of the identifier is converted to text, the pieces are
    glued together and the opening and closing parentheses are stripped.
    """
    pieces = [str(part) for part in identifier]
    joined = "".join(pieces)
    return sub(r"[()]", "", joined)
 def extract_adapters(files) -> Tuple[Series, List]:
    """
    Extract the adapters sequences and statistics from the files

    Each file is parsed with read_html; when a report yields more than one
    table, the Sequence column of the second table is collected. The
    collected columns become the rows of a DataFrame that is then cleaned
    by preprocess_dataframe.

    Returns the processed sequences together with a two-item list holding
    the mean and the standard deviation of the sequence lengths.
    """
    collected = []
    for entry in files:
        tables = read_html(entry)
        # Only reports that yield a second table contribute sequences
        if len(tables) > 1:
            collected.append(tables[1].Sequence)
    # DataFrame.append was removed in pandas 2.0. Building the frame from the
    # list of Series keeps each one as a row labelled with its name, which is
    # what the repeated append calls produced, and avoids copying the whole
    # frame on every iteration.
    all_adapters = DataFrame(collected)
    processed_adapters = preprocess_dataframe(all_adapters)
    # Compute the lengths once and derive both statistics from them
    lengths = processed_adapters.str.len()
    stats = [lengths.mean(), lengths.std()]
    return processed_adapters, stats
 def preprocess_dataframe(adapters) -> Series:
    """
    Remove empty sequences and duplicates

    The frame is flattened into a single Series, missing entries are
    discarded and only the first occurrence of each sequence is kept.
    """
    # dropna(axis=1) on the frame discarded every column that had a missing
    # value in any row, which also threw away the valid sequences sharing
    # that column. Missing entries are instead removed one by one after
    # flattening.
    stacked_adapters = adapters.stack()
    na_free_adapters = stacked_adapters.dropna()
    return na_free_adapters.drop_duplicates()
 def save_to_file(filename, adapters) -> None:
    """
    Save the adapter sequences as a FASTA file

    Every entry of adapters is written as two lines: a header made of ">"
    followed by the identifier that remove_parenthesis builds from the index
    label, and then the sequence itself.
    """
    with open(filename, "w") as f:
        # Series.iteritems was removed in pandas 2.0; items is the equivalent
        # that exists in both old and new releases
        for index, value in adapters.items():
            sequence_id = remove_parenthesis(index)
            f.write(f">{sequence_id}\n{value}\n")
 def parse_arguments():
    """
    Build the command-line interface and return the parsed arguments

    Two positional arguments are declared, in this order: input and output.
    """
    positionals = (
        ("input", "directory containing the fastqc reports"),
        ("output", "file where to export the sequences"),
    )
    cli = ArgumentParser()
    for name, description in positionals:
        cli.add_argument(name, help=description)
    return cli.parse_args()
 def main():
    """
    Collect the FASTQC reports of the input directory, export their adapters
    to a FASTA file and print the sequence length statistics
    """
    args = parse_arguments()
    pattern = args.input + "/*fastqc.html"
    reports = glob(pattern)
    adapters, stats = extract_adapters(reports)
    save_to_file(args.output, adapters)
    summary = f"Mean of sequence length: {stats[0]}, standard deviation of sequence length {stats[1]}"
    print(summary)
 # Run the command-line entry point only when the file is executed as a
 # script, so that importing the module has no side effects
 if __name__ == "__main__":
    main()
--- a/shell.nix
+++ b/shell.nix
@ -2,4 +2,4 @@
 with pkgs;
-mkShell { buildInputs = [ python39 python39Packages.pandas ]; }
+mkShell { buildInputs = [ python39 python39Packages.beautifulsoup4 ]; }
`@ -2,4 +2,4 @@`

	`with pkgs;`	`with pkgs;`

	`mkShell { buildInputs = [ python39 python39Packages.pandas ]; }`	`mkShell { buildInputs = [ python39 python39Packages.beautifulsoup4 ]; }`