docs: add extensive externally hosted docs #177

Open: wants to merge 49 commits into base: dev
Changes shown from 40 of 49 commits.

Commits
8003399
docs: describe ZARP output
fgypas Sep 18, 2024
5011f23
Add multiqc output files
fgypas Oct 8, 2024
19db053
Fix typo
fgypas Oct 8, 2024
2779ae9
Add output file description
fgypas Oct 8, 2024
6570e36
Update README
fgypas Oct 8, 2024
6b6a39a
Update README and citation
fgypas Oct 9, 2024
f85a648
Test deployment from branch
fgypas Oct 9, 2024
21c571d
docs:add installation guide
mkatsanto Oct 10, 2024
25d1291
Move description of output files in github page outputs.md
fgypas Oct 12, 2024
fc22d04
Add section in nav bar
fgypas Oct 12, 2024
dc26daa
Test images path
fgypas Oct 12, 2024
06b4ce1
Change path of images under guides
fgypas Oct 12, 2024
5724434
Fix paths
fgypas Oct 12, 2024
0b22ff3
Move execution documentation to the docs page
fgypas Oct 13, 2024
acef1cf
Update usage and outputs
fgypas Oct 13, 2024
ed8aeeb
Add outputs for HTSinfer
fgypas Oct 13, 2024
0f9a991
Add contribute section
fgypas Oct 13, 2024
ea99f3c
Add contribute options
fgypas Oct 13, 2024
caa9cad
Fix indentation
fgypas Oct 13, 2024
1e98a25
guides/contribute.md -> guides/parameterization.md
fgypas Oct 14, 2024
b631f14
Documentation
fgypas Oct 14, 2024
caefd94
Dockerfile, add docker test and update in samtools dependency
fgypas Oct 20, 2024
588d6a6
Remove TTY option and interactive mode from docker execution
fgypas Oct 21, 2024
2c4562c
Remove md5sum
fgypas Oct 21, 2024
caae76f
Debug data ownership in docker test
fgypas Oct 21, 2024
f5b780e
Add clean up step
fgypas Oct 22, 2024
062005f
Add docker instructions
fgypas Oct 27, 2024
984b266
Add section Expand zarp with a new package in documentation
fgypas Dec 2, 2024
3221b08
Fix header
fgypas Dec 2, 2024
339b9d2
Try to fix ssl issue
fgypas Dec 5, 2024
82cbf0c
Add openssl in yml
fgypas Dec 5, 2024
e9a7979
Disable openssl
fgypas Dec 5, 2024
8b2d32f
Disable lint test
fgypas Dec 5, 2024
32e37ca
Disable rulegraph test
fgypas Dec 5, 2024
f7162e3
Disable dag test
fgypas Dec 5, 2024
96f9bc0
Add pyopenssl
fgypas Dec 5, 2024
91a29f2
Use mamba version lower than 2
fgypas Dec 5, 2024
6a0a7aa
Show mamba version
fgypas Dec 5, 2024
1dc9bbe
Restrict to version 1 of mamba in tests
fgypas Dec 5, 2024
b67cb2b
Apply to all tests
fgypas Dec 5, 2024
40fa1cb
Remove docker from PR
fgypas Dec 19, 2024
fce4260
Simplify main README.md
fgypas Dec 19, 2024
95b18c7
Render list
fgypas Dec 19, 2024
3684cd1
Improve usage.md
fgypas Dec 19, 2024
d7af25e
Update documentation based on PR feedback
fgypas Dec 22, 2024
e6961e7
Fix small rendering issues
fgypas Dec 22, 2024
f1e00f5
Fix CI
fgypas Dec 22, 2024
625edfd
Update CI
fgypas Dec 22, 2024
53ed475
Specify exact version of base R
fgypas Dec 22, 2024
46 changes: 40 additions & 6 deletions .github/workflows/ci.yml
@@ -25,7 +25,7 @@ jobs:
uses: conda-incubator/setup-miniconda@v3
with:
python-version: "3.10"
mamba-version: "*"
mamba-version: "1"
channels: conda-forge
channel-priority: true
auto-update-conda: false
@@ -73,7 +73,7 @@ jobs:
uses: conda-incubator/setup-miniconda@v3
with:
python-version: "3.10"
mamba-version: "*"
mamba-version: "1"
channels: conda-forge
channel-priority: true
auto-update-conda: false
@@ -82,7 +82,9 @@
auto-activate-base: false

- name: Update zarp env with root. packages
run: mamba env update -p $CONDA_PREFIX -f install/environment.root.yml
run: |
mamba --version
mamba env update -p $CONDA_PREFIX -f install/environment.root.yml

- name: Update zarp env with dev. packages
run: mamba env update -p $CONDA_PREFIX -f install/environment.dev.yml
@@ -116,7 +118,7 @@ jobs:
uses: conda-incubator/setup-miniconda@v3
with:
python-version: "3.10"
mamba-version: "*"
mamba-version: "1"
channels: conda-forge
channel-priority: true
auto-update-conda: false
@@ -153,7 +155,7 @@ jobs:
uses: conda-incubator/setup-miniconda@v3
with:
python-version: "3.10"
mamba-version: "*"
mamba-version: "1"
channels: conda-forge
channel-priority: true
auto-update-conda: false
@@ -190,7 +192,7 @@ jobs:
uses: conda-incubator/setup-miniconda@v3
with:
python-version: "3.10"
mamba-version: "*"
mamba-version: "1"
channels: conda-forge
channel-priority: true
auto-update-conda: false
@@ -214,3 +216,35 @@ jobs:
- name: Run SRA downloads workflow
run: bash tests/test_sra_download_with_conda/test.local.sh

integration-docker:

Review comment (fgypas, Member Author, Dec 10, 2024):
Test that pulls a zarp docker image and runs a test.

Reply (Member):
Cool to add a Dockerfile, test and docs for that, but I think that should go in a separate PR, if you don't mind.

Reply (fgypas, Member Author):
OK, moving it to another PR.

needs:
- snakemake-graphs-format
runs-on: ubuntu-20.04
defaults:
run:
shell: bash -l {0}
steps:

- name: Checkout zarp repository
uses: actions/checkout@v4

- name: Setup miniconda & zarp env
uses: conda-incubator/setup-miniconda@v3
with:
python-version: "3.10"
mamba-version: "1"
channels: conda-forge
channel-priority: true
auto-update-conda: false
activate-environment: zarp
environment-file: install/environment.yml
auto-activate-base: false

- name: Update zarp env with dev. packages
run: mamba env update -p $CONDA_PREFIX -f install/environment.dev.yml

- name: Run test script
run: bash tests/test_integration_workflow_with_docker/test.local.sh

- name: Clean up
run: rm -rf data
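
The workflow changes above pin mamba to the 1.x series and print the version before updating the environment. For debugging the same constraint outside CI, a rough local equivalent might look like the sketch below; the channel setup and the use of the active environment prefix are assumptions, not taken from this PR.

```bash
# Pin mamba below 2 in the base environment, mirroring `mamba-version: "1"` in ci.yml.
conda install -n base -c conda-forge "mamba<2" --yes

# Confirm the pin, then update the activated zarp environment as the CI jobs do.
mamba --version
mamba env update -p "$CONDA_PREFIX" -f install/environment.root.yml
mamba env update -p "$CONDA_PREFIX" -f install/environment.dev.yml
```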
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
@@ -2,7 +2,7 @@ name: Docs

on:
push:
branches: [main, dev]
branches: [main, dev, docs-describe-zarp-outputs]
workflow_dispatch:

jobs:
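
The hunk above only adds a temporary branch to the deployment trigger; the build job itself is outside this excerpt. Assuming the site is built with MkDocs (suggested by the nav-bar and outputs.md commits, but not confirmed here), a local preview before pushing to the trigger branch might look like this sketch:

```bash
# Sketch only: assumes a mkdocs.yml at the repository root and a plain MkDocs setup;
# any theme or plugin requirements of the real docs are not covered here.
pip install mkdocs
mkdocs serve   # live preview at http://127.0.0.1:8000
mkdocs build   # write the static site into site/
```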
31 changes: 22 additions & 9 deletions CITATION.cff
@@ -1,9 +1,10 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
title: "ZARP: An automated workflow for processing of RNA-seq data"
version: 0.3.0
doi: 10.1101/2021.11.18.469017
date-released: 2021-11-18
title: "ZARP: A user-friendly and versatile RNA-seq analysis workflow"

version: 1.0.0
doi: 10.12688/f1000research.149237.1
date-released: 2024-05-24
url: "https://github.com/zavolanlab/zarp"
preferred-citation:
type: article
@@ -32,6 +33,18 @@ preferred-citation:
- family-names: "Ataman"
given-names: "Meric"
orcid: "https://orcid.org/0000-0002-7942-9226"
- family-names: "Balajti"
given-names: "Máté"
orcid: "https://orcid.org/0009-0000-3932-3964"
- family-names: "Pozzan"
given-names: "Noè"
- family-names: "Schlusser"
given-names: "Niels"
- family-names: "Moon"
given-names: "Youngbin"
orcid: "https://orcid.org/0009-0001-5728-3959"
- family-names: "Mironov"
given-names: "Aleksei"
- family-names: "Boersch"
given-names: "Anastasiya"
orcid: "https://orcid.org/0000-0003-3392-5272"
@@ -41,8 +54,8 @@ preferred-citation:
- family-names: "Kanitz"
given-names: "Alexander"
orcid: "https://orcid.org/0000-0002-3468-0652"
doi: "10.1101/2021.11.18.469017"
journal: "bioRxiv"
month: 11
title: "ZARP: An automated workflow for processing of RNA-seq data"
year: 2021
doi: "10.12688/f1000research.149237.1"
journal: "F1000Research"
month: 05
title: "ZARP: A user-friendly and versatile RNA-seq analysis workflow"
year: 2024
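
To sanity-check the updated metadata, the citation file can be validated and rendered with the cffconvert tool; this is a hedged suggestion, not part of the PR, and assumes cffconvert is installed (for example via pip).

```bash
# Validate CITATION.cff against the CFF 1.2.0 schema, then render the
# preferred citation as BibTeX for manuscript use.
cffconvert --validate -i CITATION.cff
cffconvert -f bibtex -i CITATION.cff
```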
37 changes: 37 additions & 0 deletions Dockerfile
@@ -0,0 +1,37 @@
FROM continuumio/miniconda3:24.7.1-0


COPY install/environment.yml /environment.yml
COPY workflow /workflow
COPY resources /resources
COPY tests/input_files/config.yaml /config.yaml
COPY tests/input_files/samples.tsv /samples.tsv
COPY tests/input_files/rule_config.yaml /rule_config.yaml
COPY tests/input_files/project1/synthetic.mate_1.fastq.gz /project1/synthetic.mate_1.fastq.gz
COPY tests/input_files/project1/synthetic.mate_2.fastq.gz /project1/synthetic.mate_2.fastq.gz
COPY tests/input_files/project2/synthetic.mate_1.fastq.gz /project2/synthetic.mate_1.fastq.gz
COPY tests/input_files/homo_sapiens/annotation.gtf /annotation.gtf
COPY tests/input_files/homo_sapiens/genome.fa /genome.fa

RUN sed -i 's# - conda-forge##' workflow/envs/STAR.yaml && \
sed -i 's#2.7.11#2.7.10#' workflow/envs/STAR.yaml && \
sed -i 's#../input_files/project1/#/project1/#g' /samples.tsv && \
sed -i 's#../input_files/project2/#/project2/#g' /samples.tsv && \
sed -i 's#../input_files/homo_sapiens/##g' /samples.tsv && \
sed -i 's#../input_files/##' /config.yaml

RUN conda install -c conda-forge mamba --yes && \
mamba env create -f /environment.yml && \
conda clean --all --yes

RUN echo "source activate zarp" > ~/.bashrc

ENV SNAKEMAKE_CONDA_PREFIX="/conda_envs"
ENV PATH=/opt/conda/envs/zarp/bin:$PATH

RUN snakemake -p --snakefile /workflow/Snakefile --configfile /config.yaml --cores 4 --use-conda --conda-create-envs-only --verbose && \
conda clean --all --yes

RUN rm /config.yaml /samples.tsv /rule_config.yaml /project1/synthetic.mate_1.fastq.gz /project1/synthetic.mate_2.fastq.gz /project2/synthetic.mate_1.fastq.gz

RUN mkdir -p /data
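
For context, one hypothetical way to build and exercise this image locally; the image tag, mounted directory, and config location below are illustrative assumptions, not commands taken from this PR.

```bash
# Build the image from the repository root (the tag name is arbitrary).
docker build -t zarp:local .

# Mount a host directory containing config.yaml, samples.tsv and the FASTQ files
# at /data, then run the workflow with the Snakefile baked into the image.
docker run --rm -v "$PWD/my_run:/data" -w /data zarp:local \
  snakemake -p \
    --snakefile /workflow/Snakefile \
    --configfile /data/config.yaml \
    --cores 4 \
    --use-conda
```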
156 changes: 1 addition & 155 deletions README.md
@@ -188,163 +188,9 @@ bash tests/test_integration_workflow_with_conda/test.slurm.sh
Head over to the [ZARP-cli](https://zavolanlab.github.io/zarp-cli/) to learn how to
start ZARP runs with very simple commands, like:

```sh
zarp SRR23590181
```

Review comment (Member):
I guess the previous statement doesn't make sense if you remove this...

## Running ZARP without ZARP-cli

1. Assuming that your current directory is the workflow repository's root directory,

Review comment (fgypas, Member Author):
Moved to the GitHub website.

create a directory for your workflow run and move into it with:

```bash
mkdir config/my_run
cd config/my_run
```

2. Create an empty sample table and a workflow configuration file:

```bash
touch samples.tsv
touch config.yaml
```

3. Use your editor of choice to populate these files with appropriate
values. Have a look at the examples in the `tests/` directory to see what the
files should look like, specifically:

- [samples.tsv](tests/input_files/samples.tsv)
- [config.yaml](tests/input_files/config.yaml)

- For more details and explanations, refer to the [pipeline-documentation]


4. Create a runner script. Pick one of the following choices for either local
or cluster execution. Before execution of the respective command, you need to
remember to update the argument of the `--singularity-args` option of a
respective profile (file: `profiles/{profile}/config.yaml`) so that
it contains a comma-separated list of _all_ directories
containing input data files (samples and any annotation files etc) required for
your run.

Runner script for _local execution_:

```bash
cat << "EOF" > run.sh
#!/bin/bash

snakemake \
--profile="../../profiles/local-singularity" \
--configfile="config.yaml"

EOF
```

**OR**

Runner script for _Slurm cluster execution_ (note that you may need
to modify the arguments to `--jobs` and `--cores` in the file:
`profiles/slurm-singularity/config.yaml` depending on your HPC
and workload manager configuration):

```bash
cat << "EOF" > run.sh
#!/bin/bash
mkdir -p logs/cluster_log
snakemake \
--profile="../profiles/slurm-singularity" \
--configfile="config.yaml"
EOF
```

> Note: When running the pipeline with *conda* you should use `local-conda` and
`slurm-conda` profiles instead.

> Note: The slurm profiles are adapted to a cluster that uses the quality-of-service (QOS) keyword. If QOS is not supported by your slurm instance, you have to remove all the lines with "qos" in `profiles/slurm-config.json`.

5. Start your workflow run:

```bash
bash run.sh
```

# Sample downloads from SRA

An independent Snakemake workflow `workflow/rules/sra_download.smk` is included
for the download of sequencing libraries from the Sequence Read Archive and
conversion into FASTQ.

The workflow expects the following parameters in the configuration file:
* `samples`, a sample table (tsv) with column *sample* containing *SRR*
identifiers (ERR and DRR are also supported), see
[example](tests/input_files/sra_samples.tsv).
* `outdir`, an output directory
* `samples_out`, a pointer to a modified sample table with the locations of
the corresponding FASTQ files
* `cluster_log_dir`, the cluster log directory.

For executing the example with Conda environments, one can use the following
command (from within the activated `zarp` Conda environment):

```bash
snakemake --snakefile="workflow/rules/sra_download.smk" \
--profile="profiles/local-conda" \
--config samples="tests/input_files/sra_samples.tsv" \
outdir="results/sra_downloads" \
samples_out="results/sra_downloads/sra_samples.out.tsv" \
log_dir="logs" \
cluster_log_dir="logs/cluster_log"
```

Alternatively, change the argument to `--profile` from `local-conda` to
`local-singularity` to execute the workflow steps within Singularity
containers.

After successful execution, `results/sra_downloads/sra_samples.out.tsv` should
contain:

```tsv
sample fq1 fq2
SRR18552868 results/sra_downloads/compress/SRR18552868/SRR18552868.fastq.gz
SRR18549672 results/sra_downloads/compress/SRR18549672/SRR18549672_1.fastq.gz results/sra_downloads/compress/SRR18549672/SRR18549672_2.fastq.gz
ERR2248142 results/sra_downloads/compress/ERR2248142/ERR2248142.fastq.gz
```


# Metadata completion with HTSinfer
An independent Snakemake workflow `workflow/rules/htsinfer.smk` that populates the `samples.tsv` required by ZARP with the sample specific parameters `seqmode`, `f1_3p`, `f2_3p`, `organism`, `libtype` and `index_size`. Those parameters are inferred from the provided `fastq.gz` files by [HTSinfer][hts-infer].

> Note: The workflow uses the implicit temporary directory
from snakemake, which is called with [resources.tmpdir].


The workflow expects the following config:
* `samples`, a sample table (tsv) with column *sample* containing sample identifiers, as well as columns *fq1* and *fq2* containing the paths to the input fastq files
see example [here](tests/input_files/sra_samples.tsv). If the table contains further ZARP compatible columns (see [pipeline documentation][sample-doc]), the values specified there by the user are given priority over htsinfer's results.
* `outdir`, an output directory
* `samples_out`, path to a modified sample table with inferred parameters
* `records`, set to 100000 per default

For executing the example one can use the following
(with activated *zarp* environment):
```bash
cd tests/test_htsinfer_workflow
snakemake \
--snakefile="../../workflow/rules/htsinfer.smk" \
--restart-times=0 \
--profile="../../profiles/local-singularity" \
--config outdir="results" \
samples="../input_files/htsinfer_samples.tsv" \
samples_out="samples_htsinfer.tsv" \
--notemp \
--keep-incomplete
```

However, this call will exit with an error, as not all parameters can be inferred from the example files. The argument `--keep-incomplete` makes sure the `samples_htsinfer.tsv` file can nevertheless be inspected.

After successful execution - if all parameters could be either inferred or were specified by the user - `[OUTDIR]/[SAMPLES_OUT]` should contain a populated table with parameters `seqmode`, `f1_3p`, `f2_3p`, `organism`, `libtype` and `index_size` for all input samples as described in the [pipeline documentation][sample-doc].


You can also trigger ZARP without ZARP-cli. This is convenient for users who have some experience with snakemake and don't want to use a CLI to trigger their runs. Please head over to the [ZARP](https://zavolanlab.github.io/zarp/) documentation to learn how to start ZARP.

Review comment (Member):
I think having to scroll down 193 lines to find a link to the ZARP docs is a bit strange. At this point readers might be convinced they have actually been reading the ZARP docs.
I would drastically shorten this document (just leave the absolute highlights section) and put a link to the docs in a much more prominent position.

Reply (fgypas, Member Author):
I made it similar to zarp-cli.

[conda]: <https://docs.conda.io/projects/conda/en/latest/index.html>
[hts-infer]: <https://github.com/zavolanlab/htsinfer>