# project_parameters.Config.yaml
# (The file name is kept as a comment: a bare scalar on the first line followed by the
# mapping below does not form a valid YAML document.)

# The following parameters are the same across the project and might be needed in more than one module.
root_dir: "./sc-rna-seq-snap" # path to the main dir of the project where the GitHub repo lives
data_dir: "./sc-rna-seq-snap/analyses/cellranger-analysis/results/02_cellranger_count/ForcedCells8000Parameters" # path to the data dir of the project with CellRanger output results; Options for the last folder: "DefaultParameters", "ForcedCells8000Parameters", or else
metadata_dir: "./mouse-test-dataset" # path to metadata dir of the project
genome_name: "GRCm39" # define genome reference and versioning
# Project descriptors (all quoted strings; "NA" is used where no value is given).
PROJECT_NAME: "mouse-test-dataset"
PI_NAME: "Stanislav Zakharenko"
TASK_ID: "NA"
PROJECT_LEAD_NAME: "NA"
DEPARTMENT: "Developmental Neurobiology"
LEAD_ANALYSTS: "Antonia Chroni, PhD"
GROUP_LEAD: "Cody A. Ramirez, PhD"
CONTACT_EMAIL: "antonia.chroni@stjude.org"
PIPELINE: "Standard sc-/sn-RNA-Seq Analysis in 10X Genomics data"
# NOTE(review): date appears to be MM/DD/YYYY — confirm the format expected by the reports.
START_DATE: "10/15/2024"
COMPLETION_DATE: "ONGOING"


# The following parameters are set up as default values and/or are specific to the modules named below.
# `./analyses/fastqc-analysis`
# Path(s) to the fastqc files for the `fastqc-analysis` module, given as a list (one `- <path>` entry per directory).
# NOTE(review): `/path1` looks like a placeholder — presumably to be replaced with the real location(s).
fastqc_dir: 
  - /path1
  #- /path2

# `./analyses/cellranger-analysis`
# NOTE(review): `cellranger_parameters` currently equals the last folder of `data_dir`; presumably the two should be kept in sync — confirm.
genome_reference_path: "./" # path to genome reference to be used for the `cellranger-analysis` module
cellranger_parameters: "ForcedCells8000Parameters" # Options: "DefaultParameters", "ForcedCells8000Parameters", or else
genome_name_cellranger: "GRCm39" # define the genome of preference for dual genomes. For single genomes, use the same value as `genome_name`.

# `./analyses/upstream-analysis`
print_pdf_seurat_multiple_samples: "YES" # Options: "YES" (by default ALWAYS), for `02B_run_seurat_qc_multiple_samples.R`
use_condition_split_seurat_multiple_samples: "NO" # Options: "NO" (by default ALWAYS), for `02B_run_seurat_qc_multiple_samples.R`
grouping: "orig.ident" # define grouping to use
Regress_Cell_Cycle_value: "NO" # Options: "YES", "NO" or "DIFF". Indicates whether or not to regress for cell cycle and, if so, which method to use when scaling the data.
assay_seurat_qc: "RNA" # Options: "RNA" (by default ALWAYS)
use_SoupX_filtering_seurat_qc: "YES" # Options: "YES" or "NO", for `02A_run_seurat_qc.Rmd`
min_genes: 300 # define minimum number of genes for filtering
min_count: 500 # define minimum number of UMIs for filtering
# NOTE(review): the comment below says "minimum"; an mtDNA percentage cutoff is presumably an upper bound — confirm against the filtering step.
mtDNA_pct_default: 10 # define minimum percentage of mtDNA for filtering
normalize_method: "log_norm" # define method for normalization of counts
num_pcs: 30 # define number of principal components
nfeatures_value: 3000 # define number of variable features
prefix: "lognorm" # create label based on the normalization method used
use_miQC: "NO" # Options: "YES" or "NO". Use of miQC R package or not; see `README.md` file for more information.
use_only_step1: "NO" # Options: "YES" or "NO". Use of both or only first step for filtering low quality cells; see `README.md` file for more information.
condition_value: "Genotype" # define main condition of the project; this can be used for visualization purposes on the UMAPs; value must be a column name in the `project_metadata.tsv` file
num_dim_seurat_qc: [20, 25] # number of PCs to use in UMAP (list of values)
num_neighbors_seurat_qc: [10, 20, 30] # number of neighbors to use in UMAP (list of values)
soup_fraction_value_default: 0.05 # set rho default value to use if estimated rho is > 20%
assay_filter_object: "RNA"  # Options: "RNA" (by default) or "RNA_SoupX"
num_dim_filter_object: 30 # set one value for `04_run_filter_object.Rmd`
num_neighbors_filter_object: 30 # set one value for `04_run_filter_object.Rmd`
use_condition_split_filter_object: "YES" # Options: "YES" (by default) or "NO", for `04_run_filter_object.Rmd`
print_pdf_filter_object: "NO" # Options: "NO" (by default ALWAYS), for `04_run_filter_object.Rmd`
use_SoupX_filtering_filter_object: "NO" # Options: "YES" or "NO" (by default), for `04_run_filter_object.Rmd`.
use_scDblFinder_filtering_filter_object: "NO" # Options: "YES" or "NO" (by default), for `04_run_filter_object.Rmd`.
# NOTE: `PCA_Feature_List_value` is also set under `./analyses/integrative-analysis` in this file;
# enabling the line below without removing that entry would create a duplicate key.
#PCA_Feature_List_value: transcription.factor.gene.list # set for 04_run_filter_object.Rmd if necessary
use_SoupX_filtering_summary_report: "NO" # Options: "YES" or "NO" (by default), for `05_run_summary_report.Rmd`.
use_scDblFinder_filtering_summary_report: "NO" # Options: "YES" or "NO" (by default), for `05_run_summary_report.Rmd`.

# `./analyses/integrative-analysis`
# NOTE(review): only the harmony flag is "YES" and `integration_method` is "harmony"; presumably the
# three `use_*_integration` flags and `integration_method` must agree — confirm.
use_seurat_integration: "NO" # Options: "YES" or "NO"
use_harmony_integration: "YES" # Options: "YES" or "NO"
use_liger_integration: "NO" # Options: "YES" or "NO"
integration_method: "harmony" # Options: "seurat", "harmony", "inmf"
# NOTE(review): the keys below without an inline comment are undocumented in this file; their meaning
# presumably follows from their names — confirm against the module before changing them.
num_dim_seurat: 30
num_dim_seurat_integration: 50
# The value below is unquoted, so a YAML parser reads it as a boolean, not as the string "FALSE".
# NOTE(review): the options are shown quoted — confirm whether the module expects a boolean or a string.
big_data_value: FALSE # Options: "TRUE" or "FALSE"
num_dim_harmony: 30
n_neighbors_value: 20
variable_value: "ID"
# Unquoted NULL is read by a YAML parser as a null value (not the string "NULL").
reference_list_value: NULL
PCA_Feature_List_value: NULL

# `./analyses/cluster-cell-calling`
resolution_clustering_module: "default_multiple" # Options: "custom_multiple", "default_multiple" or define value of resolution, e.g., "0.5"
integration_method_clustering_module: "harmony" # Options: "seurat", "harmony", "inmf"
num_dim_clustering_module: 30 
reduction_value_clustering_module: "harmony" # Options: For seurat: "pca"; For harmony: "harmony"; For liger: "inmf"
assay_clustering_module: "RNA"  # Options: "RNA" (by default) or "RNA_SoupX"

# Initially, we use a list of multiple resolutions; then we run the module again with the single resolution that fits the data best.
# If no list is provided, the clusters are calculated with the default list.
# We recommend running with the default list first and then exploring a customized list of resolutions (if these are not already in the default list).
# In the latter case, the user needs to comment in/out `resolution_list_default_clustering_module` accordingly.
resolution_list_clustering_module: [0.1, 0.5, 1] # This can be a single or multiple resolutions or NULL, e.g., NULL; [0.5]; [0.1, 0.5, 1]
# Default list of resolutions referred to in the notes above.
resolution_list_default_clustering_module: [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
algorithm_value_clustering_module: 4 # Leiden algorithm (by default)

resolution_find_markers: "default_multiple" # Options: "custom_multiple", "default_multiple" or define value of resolution, e.g., 0.5
resolution_list_find_markers: 0.1 # this will be the single resolution that fits the data the best
n_value_find_markers: 10 # number of top genes to explore

# `./analyses/cell-contamination-removal-analysis`
# NOTE(review): presumably the cluster IDs to retain after contamination removal (6, 11 and 14 are
# absent from the list) — confirm; the values are specific to this dataset and clustering run.
keep_clusters_contamination_module: [1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 15, 16]
assay_contamination_module: "RNA" # Options: "RNA" (by default) or "RNA_SoupX"

# `./analyses/cell-types-annotation`
module_with_input_data: "cluster-cell-calling" # Options: "cluster-cell-calling" or "cell-contamination-removal-analysis"
input_data_folder_name: "01_cluster_cell_calling" # Options: "01_cluster_cell_calling", or "03_cluster_cell_calling" if the data are coming from the `cell-contamination-removal-analysis` module
# NOTE(review): the key below is spelled `redution_...` (sic); it is left unchanged because consumers
# presumably look it up by this exact name — rename only together with the code that reads it.
redution_value_annotation_module: "umap" # Options: For seurat and harmony use: "umap"; For Liger use: `glue::glue("{integration_method}")`
min.diff.med_value_annotation_module: 0.1 # Higher thresholds for pruning labels correspond to greater assignment certainty
use_min.diff.med_annotation_module: "NO" # Options: "YES" or "NO" (by default)
assay_annotation_module: "RNA"  # Options: "RNA" (by default) or "RNA_SoupX"