Merge pull request #1 from EnvGen/devel
Merge changes from devel
johnne authored May 8, 2023
2 parents df9abab + 6d314d4 commit 8321b9e
Showing 19 changed files with 702 additions and 63 deletions.
13 changes: 13 additions & 0 deletions config/config.yaml
@@ -119,6 +119,7 @@ metaspades:
annotation:
hmm_dbs: []
splits: 0
assembly_splits: 0
# run tRNAscan-SE?
tRNAscan: False
# run infernal for rRNA identification?
@@ -131,6 +132,10 @@ annotation:
rgi: False
# run taxonomic annotation of assembled contigs (using contigtax + sourmash)?
taxonomy: False
# name of cmsearch database
# if this is set, there must be files with the pattern 'resources/<cmdb>/<cmdb>.cm.<suff>'
# for example if cmdb: thi-box then there must exist 'resources/thi-box/thi-box.cm.i1f' etc.
cmdb: ""
norm_models:
kos:
- K06942 # ychF; ribosome-binding ATPase
@@ -175,6 +180,12 @@ hmmsearch:

# params for taxonomic annotation of contigs/orfs
taxonomy:
# Run krakenuniq on assembled contigs?
krakenuniq_contigs: False
# Run kraken2 on assembled contigs?
kraken_contigs: False
# Run contigtax on contigs?
contigtax: True
# minimum length of contigs to use for taxonomic annotation
min_len: 300
# parameters for contigtax search
@@ -320,6 +331,8 @@ kraken:
# setting reduce_memory to True makes kraken2 run in "--memory-mapping" mode
# which avoids loading database into RAM and uses less memory
reduce_memory: False
# Kraken confidence threshold
confidence: 0.5

# centrifuge params
centrifuge:
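A note on the new `cmdb` option in config/config.yaml above: a minimal sketch of the file layout it implies, using the `thi-box` example from the comment. The `.cm.i1*` suffixes are the binary index files that Infernal's `cmpress` typically writes next to a covariance model, and the list mirrors the `expand()` call in the `cmsearch` rule added to workflow/rules/annotation.smk further down.

```python
# Illustrative only: files expected when config.yaml sets cmdb: "thi-box".
cmdb = "thi-box"
expected = [f"resources/{cmdb}/{cmdb}.cm"]                          # the covariance model itself
expected += [f"resources/{cmdb}/{cmdb}.cm.i1{s}" for s in "fimp"]   # cmpress index files
print("\n".join(expected))
# resources/thi-box/thi-box.cm
# resources/thi-box/thi-box.cm.i1f
# resources/thi-box/thi-box.cm.i1i
# resources/thi-box/thi-box.cm.i1m
# resources/thi-box/thi-box.cm.i1p
```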
5 changes: 5 additions & 0 deletions config/myconfig.yaml
@@ -117,6 +117,7 @@ metaspades:
extra_settings: "-k 21,31,41,51,61,71,81,91,101,111,121"

annotation:
hmm_dbs: ["pfam-hmmsearch"]
# run tRNAscan-SE?
tRNAscan: False
# run infernal for rRNA identification?
@@ -130,6 +131,10 @@ annotation:
# run taxonomic annotation of assembled contigs (using contigtax + sourmash)?
taxonomy: True

hmmsearch:
threads: 10
evalue: 0.001
extra_settings: "-Z 45638612"
# params for taxonomic annotation of contigs/orfs
taxonomy:
# minimum length of contigs to use for taxonomic annotation
34 changes: 26 additions & 8 deletions config/myconfig_R-samples.yaml
@@ -118,15 +118,16 @@ metaspades:

annotation:
hmm_dbs: ["protein-models", "pfam-hmmsearch"]
splits: 0
cmdb: "thi-box"
splits: 1000
# run tRNAscan-SE?
tRNAscan: False
# run infernal for rRNA identification?
infernal: True
# run eggnog-mapper to infer KEGG orthologs, pathways and modules?
eggnog: True
# run PFAM-scan to infer protein families from PFAM?
pfam: True
pfam: False
# run Resistance gene identifier?
rgi: False
# run taxonomic annotation of assembled contigs (using contigtax + sourmash)?
@@ -136,9 +137,22 @@ hmmsearch:
threads: 10
evalue: 0.001
extra_settings: "-Z 45638612"

scores:
PF05690.13: 48
PF02581.16: 40
TIGR00693: 40
PF02110.14: 50
TIGR00694: 50
thiY_custom: 100
thiV_custom: 100
TIGR01254: 65
TIGR01276: 65
# params for taxonomic annotation of contigs/orfs
taxonomy:
# Run kraken2 on assembled contigs?
kraken_contigs: False
# Run contigtax on contigs?
contigtax: True
# minimum length of contigs to use for taxonomic annotation
min_len: 300
# parameters for contigtax search
@@ -179,12 +193,12 @@ binning:
contig_lengths:
- 1500
# uncomment and/or add below to run binning at more lengths
- 2500
#- 2500
#- 5000
# run Metabat2 binner?
metabat: True
# run CONCOCT binner?
concoct: True
concoct: False
# run MaxBin2 binner?
maxbin: False
# maximum threads for binners
@@ -195,6 +209,7 @@ binning:
gtdbtk: True
# calculate average nucleotide identity for binned genomes with fastANI?
fastani: True
all-against-all: True

# parameters for MaxBin2
maxbin:
@@ -226,7 +241,7 @@ fastani:
# L_bac ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/209/385/GCF_000209385.2_Lach_bact_2_1_46_FAA_V2
# E_bac ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/302/695/GCF_000302695.1_LSJC7_1.0
# B_mal ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/346/085/GCF_002346085.1_ASM234608v1
ref_list: ""
ref_list: "resources/fastANI/reference_sequences_DK.tsv"
# fraction overlap of alignments between two genomes to evaluate them for
# clustering
fraction: 0.5
@@ -261,13 +276,16 @@ classification:
# run metaphlan profiler?
metaphlan: False

krakenuniq:
threads: 20
db_path: "resources/krakenuniq/standard/database.kdb"
# parameters for kraken
kraken:
# below are some options for creating the kraken database
# the default is to use the prebuilt and lightweight "minikraken" database but
# if you instead want to build the standard database you may do so, just keep
# in mind that it will use a lot of resources during the build step

confidence: 0
# generate the standard kraken database?
standard_db: False
# download a prebuilt kraken2 database from the CCB servers
@@ -276,7 +294,7 @@ kraken:
# if you already have access to a built kraken database you may specify the
# database path here (path must contain hash.k2d, opts.k2d and taxo.k2d files
prebuilt_url: "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20200919.tar.gz"
custom: "resources/kraken/nt"
custom: "resources/kraken/maxikraken2_1903_140GB"
# should kraken2 run with reduced memory requirements?
# setting reduce_memory to True makes kraken2 run in "--memory-mapping" mode
# which avoids loading database into RAM and uses less memory
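The per-profile `scores` mapping added to the `hmmsearch` section above feeds the new `scores` param of `parse_hmmsearch` (see workflow/rules/annotation.smk below). A rough sketch of how such cutoffs could be applied when filtering hits; the real logic lives in workflow/scripts/annotation_utils.py, and the field names here (`profile`, `score`, `evalue`) are assumptions, not taken from that script.

```python
def filter_hits(hits, scores, default_evalue=0.001):
    """Keep hits that clear the profile-specific bit-score cutoff,
    falling back to a generic e-value threshold for profiles without one."""
    kept = []
    for hit in hits:  # each hit: {"profile": ..., "score": ..., "evalue": ...}
        cutoff = scores.get(hit["profile"])
        if cutoff is not None:
            if hit["score"] >= cutoff:
                kept.append(hit)
        elif hit["evalue"] <= default_evalue:
            kept.append(hit)
    return kept

hits = [{"profile": "TIGR01254", "score": 70.2, "evalue": 1e-20},
        {"profile": "TIGR01254", "score": 40.0, "evalue": 1e-5}]
print(filter_hits(hits, {"TIGR01254": 65}))  # only the 70.2-bit hit survives
```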
2 changes: 1 addition & 1 deletion environment.yml
@@ -1,4 +1,4 @@
name: nbis-meta
name: B1-ocean

channels:
- conda-forge
5 changes: 4 additions & 1 deletion workflow/Snakefile
@@ -8,9 +8,12 @@ include: "rules/other.smk"
include: "rules/annotation.smk"
# if config says to run analysis on split protein file,
# this makes the necessary adjustments
if int(config["annotation"]["splits"]) > 1:
if int(config["annotation"]["splits"])>0:
include: "rules/annotation_split.smk"
ruleorder: pfam_scan_gather > pfam_scan
if int(config["annotation"]["assembly_splits"])>0:
include: "rules/taxonomy_split.smk"
ruleorder: contigtax_gather > contigtax_assign
include: "rules/assembly.smk"
include: "rules/binning.smk"
include: "rules/classification.smk"
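On the `splits` / `assembly_splits` gating above: when either value is greater than 0, the corresponding `*_split.smk` file is included and the `ruleorder` tells Snakemake to prefer the gather rule whenever both rules could produce the same output. A toy illustration of the underlying scatter step, assuming a simple round-robin split into numbered chunk files; the chunk naming here is invented for the example and is not the workflow's own.

```python
from itertools import cycle

def split_fasta(path, n_splits, out_prefix):
    """Distribute fasta records round-robin over n_splits chunk files."""
    chunks = [open(f"{out_prefix}.{i}.fa", "w") for i in range(1, n_splits + 1)]
    targets = cycle(chunks)
    current = None
    with open(path) as fh:
        for line in fh:
            if line.startswith(">"):   # start of a record: switch to the next chunk
                current = next(targets)
            current.write(line)
    for c in chunks:
        c.close()

# split_fasta("final_contigs.fa", 1000, "final_contigs")  # cf. splits: 1000 above
```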
7 changes: 7 additions & 0 deletions workflow/envs/krakenuniq.yml
@@ -0,0 +1,7 @@
name: krakenuniq
channels:
- bioconda
- conda-forge
- defaults
dependencies:
- krakenuniq=1.0.0
36 changes: 34 additions & 2 deletions workflow/rules/annotation.smk
@@ -9,12 +9,14 @@ localrules:
download_pfam_info,
press_hmms,
parse_pfam,
parse_cmsearch,
parse_hmmsearch,
download_eggnog,
get_kegg_info,
parse_emapper,
download_rgi_data,
parse_rgi
parse_rgi,
parse_hmmsearch

##### annotation master rule #####

@@ -136,6 +138,35 @@ rule infernal:
{input.fastafile} > /dev/null 2>{log}
"""

##### cmsearch on custom database #####
rule cmsearch:
input:
fa=results+"/assembly/{assembly}/final_contigs.fa",
cm=expand("resources/{cmdb}/{cmdb}.cm.i1{suff}", cmdb = config["annotation"]["cmdb"], suff = ["f","i","m","p"])
output:
cm=results+"/annotation/{assembly}/{assembly}.{cmdb}.tblout"
log:
results+"/annotation/{assembly}/{assembly}.{cmdb}.log"
conda:
"../envs/annotation.yml"
resources:
runtime = lambda wildcards: 60 * 24
params:
cm="resources/{cmdb}/{cmdb}.cm"
threads: 10
shell:
"""
cmsearch --cpu {threads} --tblout {output.cm} {params.cm} {input.fa} >/dev/null 2>{log}
"""

rule parse_cmsearch:
input:
rules.cmsearch.output.cm
output:
results+"/annotation/{assembly}/{assembly}.{cmdb}.parsed.tsv"
script:
"../scripts/annotation_utils.py"

##### protein families #####

rule make_pfamdb:
@@ -264,7 +295,8 @@ rule parse_hmmsearch:
output:
results+"/annotation/{assembly}/{hmm_db}.parsed.tsv"
params:
evalue = config["hmmsearch"]["evalue"]
evalue = config["hmmsearch"]["evalue"],
scores = config["hmmsearch"]["scores"]
script:
"../scripts/annotation_utils.py"

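For the new `cmsearch` / `parse_cmsearch` rules above, a minimal reader for `--tblout` output, assuming Infernal 1.1's default column order (target name first, query name third, bit score and E-value in columns 15–16). The actual parsing is done by workflow/scripts/annotation_utils.py; this is only a sketch.

```python
def read_cmsearch_tblout(path):
    """Collect hits from a cmsearch --tblout file, skipping comment lines."""
    hits = []
    with open(path) as fh:
        for line in fh:
            if line.startswith("#") or not line.strip():
                continue
            f = line.split()
            hits.append({
                "contig": f[0],            # target name (contig id)
                "model": f[2],             # query name (covariance model)
                "score": float(f[14]),     # bit score
                "evalue": float(f[15]),    # E-value
            })
    return hits
```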
53 changes: 49 additions & 4 deletions workflow/rules/classification.smk
@@ -57,7 +57,7 @@ rule kraken_build_standard:
conda:
"../envs/kraken.yml"
resources:
runtime=lambda wildcards, attempt: attempt**2*60*24
runtime=lambda wildcards, attempt: attempt**2 * 60 * 24
shell:
"""
kraken2-build --standard --db {params.dir} --threads {threads} > {log.build} 2>&1
@@ -80,18 +80,63 @@ rule kraken_contigs:
results_path=config["paths"]["results"])
params:
db=config["kraken"]["index_path"],
mem=config["kraken"]["mem"]
threads: 10
mem=config["kraken"]["mem"],
confidence=config["kraken"]["confidence"]
threads: 20
resources:
runtime= lambda wildcards, attempt: attempt**2 * 60 * 4
conda:
"../envs/kraken.yml"
shell:
"""
kraken2 {params.mem} --db {params.db} --output {output[0]} \
kraken2 {params.mem} --confidence {params.confidence} --db {params.db} --output {output[0]} \
--report {output[1]} --threads {threads} {input.fa} > {log} 2>&1
"""

rule krakenuniq_contigs:
input:
fa=results+"/assembly/{assembly}/final_contigs.fa",
db=config["krakenuniq"]["db_path"]
output:
out=results+"/annotation/{assembly}/taxonomy/final_contigs.krakenuniq.out",
report=results+"/annotation/{assembly}/taxonomy/final_contigs.krakenuniq.kreport"
log:
results+"/annotation/{assembly}/taxonomy/final_contigs.krakenuniq.log"
params:
db = lambda wildcards, input: os.path.dirname(input.db),
preload_size = f"{config['krakenuniq']['threads']*5}G",
out = "$TMPDIR/{assembly}.krakenuniq.out"
conda:
"../envs/krakenuniq.yml"
threads: config["krakenuniq"]["threads"]
resources:
runtime = 60 * 220
shell:
"""
krakenuniq --db {params.db} --report-file {output.report} \
--output {params.out} --threads {threads} --preload-size {params.preload_size} {input.fa} > {log} 2>&1
mv {params.out} {output.out}
"""

rule parse_kraken_contigs:
input:
results+"/annotation/{assembly}/taxonomy/final_contigs.{krakentool}.out",
"resources/taxonomy/taxonomy.sqlite"
output:
results+"/annotation/{assembly}/taxonomy/{krakentool}.taxonomy.tsv"
log:
results+"/annotation/{assembly}/taxonomy/parse_{krakentool}_contigs.log"
conda:
"../envs/taxonomy.yml"
params:
ranks = " ".join(config["taxonomy"]["ranks"]),
script = "workflow/scripts/parse_kraken_contigs.py",
taxdir = lambda wildcards, input: os.path.dirname(input[1])
shell:
"""
python {params.script} {input[0]} {output[0]} {params.taxdir} --ranks {params.ranks} > {log} 2>&1
"""

rule kraken_pe:
input:
R1=expand("{results_path}/intermediate/preprocess/{{sample}}_{{unit}}_R1{preprocess}.fastq.gz",
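The `kraken_contigs` / `krakenuniq_contigs` rules above write one classification per contig, which `parse_kraken_contigs` then translates into the configured ranks via the taxonomy.sqlite database. A sketch of that first step only, assuming the standard kraken-style output columns (classified flag, sequence id, taxid in the first three tab-separated fields); it is not parse_kraken_contigs.py itself.

```python
def read_kraken_output(path):
    """Map classified contig ids to their assigned NCBI taxids."""
    assignments = {}
    with open(path) as fh:
        for line in fh:
            flag, seqid, taxid = line.rstrip("\n").split("\t")[:3]
            if flag == "C":               # skip unclassified ('U') contigs
                assignments[seqid] = int(taxid)
    return assignments
```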
4 changes: 3 additions & 1 deletion workflow/rules/common.smk
@@ -40,7 +40,9 @@ wildcard_constraints:
group="\w+",
l="\d+",
counts_type="(counts|rpkm)",
norm_method="(TMM|RLE)"
norm_method="(TMM|RLE)",
krakentool="kraken|krakenuniq",
tool="contigtax|kraken|krakenuniq"

from scripts.common import check_uppmax, check_annotation, check_assembly, check_classifiers

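The added wildcard constraints above are plain regular expressions: they keep the new `krakentool` and `tool` wildcards from matching anything other than the intended classifiers, which is what disambiguates the `{krakentool}` / `{tool}` output paths from other taxonomy files. A quick check of the same patterns in plain Python:

```python
import re

assert re.fullmatch(r"kraken|krakenuniq", "krakenuniq")
assert re.fullmatch(r"contigtax|kraken|krakenuniq", "contigtax")
assert re.fullmatch(r"kraken|krakenuniq", "centrifuge") is None
```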
5 changes: 2 additions & 3 deletions workflow/rules/quantification.smk
@@ -184,11 +184,10 @@ rule sum_to_taxa:
Sums read counts and RPKM values for genes to assigned taxonomy
"""
input:
tax=expand(results+"/annotation/{{assembly}}/taxonomy/orfs.{db}.taxonomy.tsv",
db=config["taxonomy"]["database"]),
tax=results+"/annotation/{assembly}/taxonomy/orfs.{tool}.tsv",
abund=results+"/annotation/{assembly}/gene_{counts_type}.tsv"
output:
results+"/annotation/{assembly}/taxonomy/tax.{counts_type}.tsv"
results+"/annotation/{assembly}/taxonomy/tax.{tool}.{counts_type}.tsv"
script:
"../scripts/quantification_utils.py"

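The reworked `sum_to_taxa` rule above now keys its taxonomy input and output on a `{tool}` wildcard instead of the configured database. Conceptually it joins per-gene abundances with taxonomy assignments and sums per lineage; a rough pandas sketch under assumed column names (the real logic is in workflow/scripts/quantification_utils.py):

```python
import pandas as pd

def sum_to_taxa(tax_tsv, abund_tsv, ranks=("superkingdom", "phylum", "genus")):
    """Sum gene counts/RPKM per taxonomic lineage (column names assumed)."""
    tax = pd.read_csv(tax_tsv, sep="\t", index_col=0)       # gene/orf -> rank assignments
    abund = pd.read_csv(abund_tsv, sep="\t", index_col=0)   # gene/orf -> per-sample values
    merged = abund.merge(tax[list(ranks)], left_index=True, right_index=True)
    return merged.groupby(list(ranks)).sum(numeric_only=True)
```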