Merge pull request #1 from EnvGen/devel
Merge changes from devel
johnne authored May 8, 2023
2 parents df9abab + 6d314d4 commit 8321b9e
Showing 19 changed files with 702 additions and 63 deletions.
13 changes: 13 additions & 0 deletions config/config.yaml
@@ -119,6 +119,7 @@ metaspades:
annotation:
hmm_dbs: []
splits: 0
assembly_splits: 0
# run tRNAscan-SE?
tRNAscan: False
# run infernal for rRNA identification?
@@ -131,6 +132,10 @@ annotation:
rgi: False
# run taxonomic annotation of assembled contigs (using contigtax + sourmash)?
taxonomy: False
# name of cmsearch database
# if this is set, there must be files with the pattern 'resources/<cmdb>/<cmdb>.cm.<suff>'
# for example if cmdb: thi-box then there must exist 'resources/thi-box/thi-box.cm.i1f' etc.
cmdb: ""
norm_models:
kos:
- K06942 # ychF; ribosome-binding ATPase
@@ -175,6 +180,12 @@ hmmsearch:

# params for taxonomic annotation of contigs/orfs
taxonomy:
# Run krakenuniq on assembled contigs?
krakenuniq_contigs: False
# Run kraken2 on assembled contigs?
kraken_contigs: False
# Run contigtax on contigs?
contigtax: True
# minimum length of contigs to use for taxonomic annotation
min_len: 300
# parameters for contigtax search
@@ -320,6 +331,8 @@ kraken:
# setting reduce_memory to True makes kraken2 run in "--memory-mapping" mode
# which avoids loading database into RAM and uses less memory
reduce_memory: False
# Kraken confidence threshold
confidence: 0.5

# centrifuge params
centrifuge:
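A note on the new `cmdb` option in config/config.yaml above: a minimal sketch of the file layout it implies, using the `thi-box` example from the comment. The `.cm.i1*` suffixes are the binary index files that Infernal's `cmpress` typically writes next to a covariance model, and the list mirrors the `expand()` call in the `cmsearch` rule added to workflow/rules/annotation.smk further down.

```python
# Illustrative only: files expected when config.yaml sets cmdb: "thi-box".
cmdb = "thi-box"
expected = [f"resources/{cmdb}/{cmdb}.cm"]                          # the covariance model itself
expected += [f"resources/{cmdb}/{cmdb}.cm.i1{s}" for s in "fimp"]   # cmpress index files
print("\n".join(expected))
# resources/thi-box/thi-box.cm
# resources/thi-box/thi-box.cm.i1f
# resources/thi-box/thi-box.cm.i1i
# resources/thi-box/thi-box.cm.i1m
# resources/thi-box/thi-box.cm.i1p
```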
5 changes: 5 additions & 0 deletions config/myconfig.yaml
@@ -117,6 +117,7 @@ metaspades:
extra_settings: "-k 21,31,41,51,61,71,81,91,101,111,121"

annotation:
hmm_dbs: ["pfam-hmmsearch"]
# run tRNAscan-SE?
tRNAscan: False
# run infernal for rRNA identification?
@@ -130,6 +131,10 @@ annotation:
# run taxonomic annotation of assembled contigs (using contigtax + sourmash)?
taxonomy: True

hmmsearch:
threads: 10
evalue: 0.001
extra_settings: "-Z 45638612"
# params for taxonomic annotation of contigs/orfs
taxonomy:
# minimum length of contigs to use for taxonomic annotation
34 changes: 26 additions & 8 deletions config/myconfig_R-samples.yaml
@@ -118,15 +118,16 @@ metaspades:

annotation:
hmm_dbs: ["protein-models", "pfam-hmmsearch"]
splits: 0
cmdb: "thi-box"
splits: 1000
# run tRNAscan-SE?
tRNAscan: False
# run infernal for rRNA identification?
infernal: True
# run eggnog-mapper to infer KEGG orthologs, pathways and modules?
eggnog: True
# run PFAM-scan to infer protein families from PFAM?
pfam: True
pfam: False
# run Resistance gene identifier?
rgi: False
# run taxonomic annotation of assembled contigs (using contigtax + sourmash)?
@@ -136,9 +137,22 @@ hmmsearch:
threads: 10
evalue: 0.001
extra_settings: "-Z 45638612"

scores:
PF05690.13: 48
PF02581.16: 40
TIGR00693: 40
PF02110.14: 50
TIGR00694: 50
thiY_custom: 100
thiV_custom: 100
TIGR01254: 65
TIGR01276: 65
# params for taxonomic annotation of contigs/orfs
taxonomy:
# Run kraken2 on assembled contigs?
kraken_contigs: False
# Run contigtax on contigs?
contigtax: True
# minimum length of contigs to use for taxonomic annotation
min_len: 300
# parameters for contigtax search
@@ -179,12 +193,12 @@ binning:
contig_lengths:
- 1500
# uncomment and/or add below to run binning at more lengths
- 2500
#- 2500
#- 5000
# run Metabat2 binner?
metabat: True
# run CONCOCT binner?
concoct: True
concoct: False
# run MaxBin2 binner?
maxbin: False
# maximum threads for binners
@@ -195,6 +209,7 @@ binning:
gtdbtk: True
# calculate average nucleotide identity for binned genomes with fastANI?
fastani: True
all-against-all: True

# parameters for MaxBin2
maxbin:
@@ -226,7 +241,7 @@ fastani:
# L_bac ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/209/385/GCF_000209385.2_Lach_bact_2_1_46_FAA_V2
# E_bac ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/302/695/GCF_000302695.1_LSJC7_1.0
# B_mal ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/346/085/GCF_002346085.1_ASM234608v1
ref_list: ""
ref_list: "resources/fastANI/reference_sequences_DK.tsv"
# fraction overlap of alignments between two genomes to evaluate them for
# clustering
fraction: 0.5
@@ -261,13 +276,16 @@ classification:
# run metaphlan profiler?
metaphlan: False

krakenuniq:
threads: 20
db_path: "resources/krakenuniq/standard/database.kdb"
# parameters for kraken
kraken:
# below are some options for creating the kraken database
# the default is to use the prebuilt and lightweight "minikraken" database but
# if you instead want to build the standard database you may do so, just keep
# in mind that it will use a lot of resources during the build step

confidence: 0
# generate the standard kraken database?
standard_db: False
# download a prebuilt kraken2 database from the CCB servers
@@ -276,7 +294,7 @@ kraken:
# if you already have access to a built kraken database you may specify the
# database path here (path must contain hash.k2d, opts.k2d and taxo.k2d files
prebuilt_url: "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20200919.tar.gz"
custom: "resources/kraken/nt"
custom: "resources/kraken/maxikraken2_1903_140GB"
# should kraken2 run with reduced memory requirements?
# setting reduce_memory to True makes kraken2 run in "--memory-mapping" mode
# which avoids loading database into RAM and uses less memory
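The per-profile `scores` mapping added to the `hmmsearch` section above feeds the new `scores` param of `parse_hmmsearch` (see workflow/rules/annotation.smk below). A rough sketch of how such cutoffs could be applied when filtering hits; the real logic lives in workflow/scripts/annotation_utils.py, and the field names here (`profile`, `score`, `evalue`) are assumptions, not taken from that script.

```python
def filter_hits(hits, scores, default_evalue=0.001):
    """Keep hits that clear the profile-specific bit-score cutoff,
    falling back to a generic e-value threshold for profiles without one."""
    kept = []
    for hit in hits:  # each hit: {"profile": ..., "score": ..., "evalue": ...}
        cutoff = scores.get(hit["profile"])
        if cutoff is not None:
            if hit["score"] >= cutoff:
                kept.append(hit)
        elif hit["evalue"] <= default_evalue:
            kept.append(hit)
    return kept

hits = [{"profile": "TIGR01254", "score": 70.2, "evalue": 1e-20},
        {"profile": "TIGR01254", "score": 40.0, "evalue": 1e-5}]
print(filter_hits(hits, {"TIGR01254": 65}))  # only the 70.2-bit hit survives
```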
2 changes: 1 addition & 1 deletion environment.yml
@@ -1,4 +1,4 @@
name: nbis-meta
name: B1-ocean

channels:
- conda-forge
5 changes: 4 additions & 1 deletion workflow/Snakefile
@@ -8,9 +8,12 @@ include: "rules/other.smk"
include: "rules/annotation.smk"
# if config says to run analysis on split protein file,
# this makes the necessary adjustments
if int(config["annotation"]["splits"]) > 1:
if int(config["annotation"]["splits"])>0:
include: "rules/annotation_split.smk"
ruleorder: pfam_scan_gather > pfam_scan
if int(config["annotation"]["assembly_splits"])>0:
include: "rules/taxonomy_split.smk"
ruleorder: contigtax_gather > contigtax_assign
include: "rules/assembly.smk"
include: "rules/binning.smk"
include: "rules/classification.smk"
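On the `splits` / `assembly_splits` gating above: when either value is greater than 0, the corresponding `*_split.smk` file is included and the `ruleorder` tells Snakemake to prefer the gather rule whenever both rules could produce the same output. A toy illustration of the underlying scatter step, assuming a simple round-robin split into numbered chunk files; the chunk naming here is invented for the example and is not the workflow's own.

```python
from itertools import cycle

def split_fasta(path, n_splits, out_prefix):
    """Distribute fasta records round-robin over n_splits chunk files."""
    chunks = [open(f"{out_prefix}.{i}.fa", "w") for i in range(1, n_splits + 1)]
    targets = cycle(chunks)
    current = None
    with open(path) as fh:
        for line in fh:
            if line.startswith(">"):   # start of a record: switch to the next chunk
                current = next(targets)
            current.write(line)
    for c in chunks:
        c.close()

# split_fasta("final_contigs.fa", 1000, "final_contigs")  # cf. splits: 1000 above
```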
7 changes: 7 additions & 0 deletions workflow/envs/krakenuniq.yml
@@ -0,0 +1,7 @@
name: krakenuniq
channels:
- bioconda
- conda-forge
- defaults
dependencies:
- krakenuniq=1.0.0
36 changes: 34 additions & 2 deletions workflow/rules/annotation.smk
@@ -9,12 +9,14 @@ localrules:
download_pfam_info,
press_hmms,
parse_pfam,
parse_cmsearch,
parse_hmmsearch,
download_eggnog,
get_kegg_info,
parse_emapper,
download_rgi_data,
parse_rgi
parse_rgi,
parse_hmmsearch

##### annotation master rule #####

@@ -136,6 +138,35 @@ rule infernal:
{input.fastafile} > /dev/null 2>{log}
"""

##### cmsearch on custom database #####
rule cmsearch:
input:
fa=results+"/assembly/{assembly}/final_contigs.fa",
cm=expand("resources/{cmdb}/{cmdb}.cm.i1{suff}", cmdb = config["annotation"]["cmdb"], suff = ["f","i","m","p"])
output:
cm=results+"/annotation/{assembly}/{assembly}.{cmdb}.tblout"
log:
results+"/annotation/{assembly}/{assembly}.{cmdb}.log"
conda:
"../envs/annotation.yml"
resources:
runtime = lambda wildcards: 60 * 24
params:
cm="resources/{cmdb}/{cmdb}.cm"
threads: 10
shell:
"""
cmsearch --cpu {threads} --tblout {output.cm} {params.cm} {input.fa} >/dev/null 2>{log}
"""

rule parse_cmsearch:
input:
rules.cmsearch.output.cm
output:
results+"/annotation/{assembly}/{assembly}.{cmdb}.parsed.tsv"
script:
"../scripts/annotation_utils.py"

##### protein families #####

rule make_pfamdb:
@@ -264,7 +295,8 @@ rule parse_hmmsearch:
output:
results+"/annotation/{assembly}/{hmm_db}.parsed.tsv"
params:
evalue = config["hmmsearch"]["evalue"]
evalue = config["hmmsearch"]["evalue"],
scores = config["hmmsearch"]["scores"]
script:
"../scripts/annotation_utils.py"

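For the new `cmsearch` / `parse_cmsearch` rules above, a minimal reader for `--tblout` output, assuming Infernal 1.1's default column order (target name first, query name third, bit score and E-value in columns 15–16). The actual parsing is done by workflow/scripts/annotation_utils.py; this is only a sketch.

```python
def read_cmsearch_tblout(path):
    """Collect hits from a cmsearch --tblout file, skipping comment lines."""
    hits = []
    with open(path) as fh:
        for line in fh:
            if line.startswith("#") or not line.strip():
                continue
            f = line.split()
            hits.append({
                "contig": f[0],            # target name (contig id)
                "model": f[2],             # query name (covariance model)
                "score": float(f[14]),     # bit score
                "evalue": float(f[15]),    # E-value
            })
    return hits
```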
53 changes: 49 additions & 4 deletions workflow/rules/classification.smk
@@ -57,7 +57,7 @@ rule kraken_build_standard:
conda:
"../envs/kraken.yml"
resources:
runtime=lambda wildcards, attempt: attempt**2*60*24
runtime=lambda wildcards, attempt: attempt**2 * 60 * 24
shell:
"""
kraken2-build --standard --db {params.dir} --threads {threads} > {log.build} 2>&1
@@ -80,18 +80,63 @@ rule kraken_contigs:
results_path=config["paths"]["results"])
params:
db=config["kraken"]["index_path"],
mem=config["kraken"]["mem"]
threads: 10
mem=config["kraken"]["mem"],
confidence=config["kraken"]["confidence"]
threads: 20
resources:
runtime= lambda wildcards, attempt: attempt**2 * 60 * 4
conda:
"../envs/kraken.yml"
shell:
"""
kraken2 {params.mem} --db {params.db} --output {output[0]} \
kraken2 {params.mem} --confidence {params.confidence} --db {params.db} --output {output[0]} \
--report {output[1]} --threads {threads} {input.fa} > {log} 2>&1
"""

rule krakenuniq_contigs:
input:
fa=results+"/assembly/{assembly}/final_contigs.fa",
db=config["krakenuniq"]["db_path"]
output:
out=results+"/annotation/{assembly}/taxonomy/final_contigs.krakenuniq.out",
report=results+"/annotation/{assembly}/taxonomy/final_contigs.krakenuniq.kreport"
log:
results+"/annotation/{assembly}/taxonomy/final_contigs.krakenuniq.log"
params:
db = lambda wildcards, input: os.path.dirname(input.db),
preload_size = f"{config['krakenuniq']['threads']*5}G",
out = "$TMPDIR/{assembly}.krakenuniq.out"
conda:
"../envs/krakenuniq.yml"
threads: config["krakenuniq"]["threads"]
resources:
runtime = 60 * 220
shell:
"""
krakenuniq --db {params.db} --report-file {output.report} \
--output {params.out} --threads {threads} --preload-size {params.preload_size} {input.fa} > {log} 2>&1
mv {params.out} {output.out}
"""

rule parse_kraken_contigs:
input:
results+"/annotation/{assembly}/taxonomy/final_contigs.{krakentool}.out",
"resources/taxonomy/taxonomy.sqlite"
output:
results+"/annotation/{assembly}/taxonomy/{krakentool}.taxonomy.tsv"
log:
results+"/annotation/{assembly}/taxonomy/parse_{krakentool}_contigs.log"
conda:
"../envs/taxonomy.yml"
params:
ranks = " ".join(config["taxonomy"]["ranks"]),
script = "workflow/scripts/parse_kraken_contigs.py",
taxdir = lambda wildcards, input: os.path.dirname(input[1])
shell:
"""
python {params.script} {input[0]} {output[0]} {params.taxdir} --ranks {params.ranks} > {log} 2>&1
"""

rule kraken_pe:
input:
R1=expand("{results_path}/intermediate/preprocess/{{sample}}_{{unit}}_R1{preprocess}.fastq.gz",
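The `kraken_contigs` / `krakenuniq_contigs` rules above write one classification per contig, which `parse_kraken_contigs` then translates into the configured ranks via the taxonomy.sqlite database. A sketch of that first step only, assuming the standard kraken-style output columns (classified flag, sequence id, taxid in the first three tab-separated fields); it is not parse_kraken_contigs.py itself.

```python
def read_kraken_output(path):
    """Map classified contig ids to their assigned NCBI taxids."""
    assignments = {}
    with open(path) as fh:
        for line in fh:
            flag, seqid, taxid = line.rstrip("\n").split("\t")[:3]
            if flag == "C":               # skip unclassified ('U') contigs
                assignments[seqid] = int(taxid)
    return assignments
```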
4 changes: 3 additions & 1 deletion workflow/rules/common.smk
@@ -40,7 +40,9 @@ wildcard_constraints:
group="\w+",
l="\d+",
counts_type="(counts|rpkm)",
norm_method="(TMM|RLE)"
norm_method="(TMM|RLE)",
krakentool="kraken|krakenuniq",
tool="contigtax|kraken|krakenuniq"

from scripts.common import check_uppmax, check_annotation, check_assembly, check_classifiers

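The added wildcard constraints above are plain regular expressions: they keep the new `krakentool` and `tool` wildcards from matching anything other than the intended classifiers, which is what disambiguates the `{krakentool}` / `{tool}` output paths from other taxonomy files. A quick check of the same patterns in plain Python:

```python
import re

assert re.fullmatch(r"kraken|krakenuniq", "krakenuniq")
assert re.fullmatch(r"contigtax|kraken|krakenuniq", "contigtax")
assert re.fullmatch(r"kraken|krakenuniq", "centrifuge") is None
```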
5 changes: 2 additions & 3 deletions workflow/rules/quantification.smk
@@ -184,11 +184,10 @@ rule sum_to_taxa:
Sums read counts and RPKM values for genes to assigned taxonomy
"""
input:
tax=expand(results+"/annotation/{{assembly}}/taxonomy/orfs.{db}.taxonomy.tsv",
db=config["taxonomy"]["database"]),
tax=results+"/annotation/{assembly}/taxonomy/orfs.{tool}.tsv",
abund=results+"/annotation/{assembly}/gene_{counts_type}.tsv"
output:
results+"/annotation/{assembly}/taxonomy/tax.{counts_type}.tsv"
results+"/annotation/{assembly}/taxonomy/tax.{tool}.{counts_type}.tsv"
script:
"../scripts/quantification_utils.py"

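The reworked `sum_to_taxa` rule above now keys its taxonomy input and output on a `{tool}` wildcard instead of the configured database. Conceptually it joins per-gene abundances with taxonomy assignments and sums per lineage; a rough pandas sketch under assumed column names (the real logic is in workflow/scripts/quantification_utils.py):

```python
import pandas as pd

def sum_to_taxa(tax_tsv, abund_tsv, ranks=("superkingdom", "phylum", "genus")):
    """Sum gene counts/RPKM per taxonomic lineage (column names assumed)."""
    tax = pd.read_csv(tax_tsv, sep="\t", index_col=0)       # gene/orf -> rank assignments
    abund = pd.read_csv(abund_tsv, sep="\t", index_col=0)   # gene/orf -> per-sample values
    merged = abund.merge(tax[list(ranks)], left_index=True, right_index=True)
    return merged.groupby(list(ranks)).sum(numeric_only=True)
```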