-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVignette.Rmd
325 lines (234 loc) · 9.39 KB
/
Vignette.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
---
title: "SignalCell"
geometry: margin=0.5cm
author: "Kyle Gellatly"
output:
html_document:
toc: true
toc_float : yes
toc_depth : 3
theme : cerulean
---
```{r setup, include=FALSE, warning=FALSE, error=FALSE}
knitr::opts_chunk$set(echo = TRUE, comment = "")
# https://github.com/kgellatl/SignalCell
library(SignalCell)
```
## Load Data
```{r}
# load("~/Dropbox (UMass Medical School)/UMASS/GitHub/R/SignalCell/data/tst_dat.Rdata")
# sparse_dat <- as(tst_dat, "dgCMatrix")
load("~/Dropbox (UMass Medical School)/Lab_inDrop/mDC_vignette_data/mDC_dat.Rdata")
sparse_dat <- as(mDC_dat, "dgRMatrix")
object.size(mDC_dat)
object.size(sparse_dat)
rm(mDC_dat)
```
# SCE Class
## Construct SingleCellExperiment
```{r}
### http://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html
sce <- construct_sce(sparse_dat)
rm(sparse_dat)
class(sce)
class(assay(sce, "counts"))
sce
```
### Enhanced exprs Like functionality
```{r}
counts <- assay(sce, "counts")
libsizes <- colSums(counts)
size.factors <- libsizes/mean(libsizes)
logcounts(sce) <- log2(t(t(counts)/size.factors) + 1)
assayNames(sce)
get_def_assay(sce)
sce <- set_def_assay(sce, "logcounts")
get_def_assay(sce)
```
### pData Like functionality
```{r}
colData(sce)$Timpoint <- "0hr"
sce$Timpoint[grep("1", colnames(sce))] <- "1hr"
sce$Timpoint[grep("4", colnames(sce))] <- "4hr"
sce <- calc_libsize(sce, assay = "counts", label = "UMI_sum")
colData(sce)
sce
```
### fData Like functionality
```{r}
counts <- apply(assay(sce, "counts"), 1, function(c)sum(c!=0))
rowData(sce)$gene_counts <- counts
rowData(sce) # use for vector annotations
sce
```
# Basic Analysis
## Finding variable genes
```{r}
sce <- calc_var_genes(sce, method = "Malhanobis")
sce <- calc_var_genes(sce, method = "CV")
sce <- calc_var_genes(sce, method = "Gini")
rowData(sce)
sce
```
## Bare Minimum Example
```{r}
gene_subset <- get_var_genes(sce, method = "Malhanobis", cutoff = 0.85)
gene_subset[100:110]
sce <- dim_reduce(input = sce, genelist = gene_subset, method = "iPCA")
sce <- embed_cells(input = sce, lem = "iPCA", method = "tSNE")
colData(sce)
```
### Dimension reduction Detailed
```{r}
###### Example 1 : Linear Embedding Matrix (LEM) Class, Single Embedding #####
# First we run dimension reduction
# When NULL, the name of the lem defaults to the method, in this case iPCA
sce <- dim_reduce(input = sce, genelist = gene_subset, method = "iPCA", reducedDim_key = NULL)
# We can see what LEM results exist with reducedDimNames
reducedDimNames(sce)
# We can access the LEM with reducedDim and providing a LEM name
LEM_iPCA <- reducedDim(sce, "iPCA")
# Structure of the LEM
# SVD (like) Matrices
head(sampleFactors(LEM_iPCA)) # cells = rows
head(featureLoadings(LEM_iPCA)) # gene = rows
head(factorData(LEM_iPCA)) # diagonal
# There is also a metadata field. This stores the parameters of the LEM.
# As of now, the embedding field is empty
metadata_iPCA <- metadata(LEM_iPCA)
str(metadata_iPCA)
# Next we embed the cells. Right now tSNE is supported, the default method. We can add UMAP support
# This embedding is stored internally to the input LEM, so a lem must be provided as an argument
# As above, when no embedding_key is provided, the name of the embedding defaults to the method, in this case tSNE
# When TRUE, write_colData will write the x & y coordinates to colData()
# When NULL the colData_labs default to x and y pasted with the method
sce <- embed_cells(input = sce, lem = "iPCA", embedding_key = NULL, write_colData = T, method = "tSNE", seed = 10)
colData(sce)
# Now within the metadata of the LEM, we have an embedding stored, along with its parameters
# To simplify usage, there is a function, embeddings(), to list available LEMs, and their embedding_key
# There is another function, embedding(), to retrieve a specific embedding with its embedding_key
# This requires a LEM argument, and the embedding_key
embeddings(sce)
head(embedding(sce, lem = "iPCA", embedding = "tSNE"))
# Parameters for the embedding are nested within the LEM metadata, and then the embedding
LEM_iPCA <- reducedDim(sce, "iPCA")
metadata_iPCA <- metadata(LEM_iPCA)
metadata_iPCA$embeddings$tSNE$parameters
###### Example 2 : Second LEM, Customization, multiple embeddings ######
# You can store as many LEM objects you want.
# In this case we use reducedDim_key to name it specifically.
# We run PCA for 20 components
sce <- dim_reduce(input = sce, reducedDim_key = "PCA_20comp", genelist = gene_subset, method = "PCA", nComp = 20)
# Now we have 2 LEMs stored
reducedDimNames(sce)
# Now when we call embed_cells, we call the new reducedDim() object, "PCA_20comp"
# However we only select 15 components
# When embedding we can also specify the column names written to colData()
# This way as many x + y embeddings can be stored as you desire within colData (as long as they are uniquely named)
sce <- embed_cells(input = sce, lem = "PCA_20comp", embedding_key = "PCA_20_15", colData_labs = c("PCA_20_15_x", "PCA_20_15_y"), comp_select = 1:15, iterations = 500, tSNE_perp = 50)
colData(sce)
# On a single LEM object we can try as many embeddings as desired
# To prevent them overwriting, we can provide a new embedding_key
# With our second embedding, we tweak parameters
sce <- embed_cells(input = sce, lem = "PCA_20comp", embedding_key = "PCA_embed2", colData_labs = c("PCA_embed2_x", "PCA_embed2_y"), comp_select = 1:10, tSNE_perp = 50)
colData(sce)
# Now our object has 2 sets of LEMS, and within the second LEM are 2 different embeddings
embeddings(sce)
# If write_colData is FALSE when you run embed_cells(), than the colData will not get flooded with all embeddings you run
# You can always write a single embedding result to colData, the one you prefer
int_embedding <- embedding(input = sce, lem = "iPCA", embedding = "tSNE")
colnames(int_embedding) <- c("x", "y")
colData(sce) <- cbind(colData(sce), int_embedding)
colData(sce)
```
## Clustering
```{r}
reducedDims(sce)
sce <- cluster_cells(sce, dims = "Comp", lem = "iPCA", method = "spectral", k = 6)
plot_metadata(sce, color_by = "Cluster")
# 2D clustering can be done on colData()
sce <- cluster_cells(sce, dims = "2d", method = "density", xy = c("tSNE_x", "tSNE_y"))
colData(sce)
# 2D clustering can also be done on an embedding not stored in colData, but requires more arguments because the embedding is stored with the LEM
# We also provide a name argument for this new colData() column to prevent overwriting the default "Cluster"
# Notice the end result clustering on the colData() or lem are the same, because the underlying data is the same
embeddings(sce)
sce <- cluster_cells(sce, dims = "2d", method = "density", lem = "iPCA", embedding = "tSNE", name = "PCA_20_embed2_cluster")
colData(sce)[,c("Cluster", "PCA_20_embed2_cluster")]
# A New argument allows calculation of clustering statistics (cluster_stats()). These are stored in the internal metadata of the sce
ks <- 3:13
reps <- 1:7
for(i in 1:length(ks)) {
for (j in reps) {
sce <- cluster_cells(sce, dims = "Comp", lem = "iPCA", method = "spectral", k = ks[i], cluster_stats = T)
}
}
c_metrics <- int_metadata(sce)$cluster_metrics
c_metrics <- pivot_longer(c_metrics, cols = 1:4, values_to = "value", names_to = "metric")
c_metrics$metric <- as.factor(c_metrics$metric)
c_metrics$k <- as.factor(as.numeric(c_metrics$k))
g <- ggplot(c_metrics)
g <- g + geom_violin(aes(x = k, y = value), scale = "width")
g <- g + geom_jitter(aes(x = k, y = value))
g <- g + facet_wrap(~metric, scales = "free")
plot(g)
# average.between = average distance between clusters.
# average.within = average distance within clusters.
# within.cluster.ss = a generalisation of the within clusters sum of squares
# avg.silwidth = average silhouette width.
#pearsongamma = correlation between distances and a 0-1-vector where 0 means same cluster, 1means different clusters
# dunn = minimum separation / maximum diameter.
# dunn2 = minimum average dissimilarity between two cluster / maximum average withincluster dissimilarity
# entropy = entropy of the distribution of cluster memberships
# wb.ratio = average.within/average.between
# ch = Calinski and Harabasz inde
# https://medium.com/@haataa/how-to-measure-clustering-performances-when-there-are-no-ground-truth-db027e9a871c
# https://cran.r-project.org/web/packages/fpc/fpc.pdf
```
# Basic Plots
## Setting default coordinates
```{r}
###### NEEDS TO BE CHANGED
#### EITHER EMBEDDING OR LEM!
# sce <- set_xy(sce, "iPCA")
#
# sce <- set_xy(sce, "iPCA")
# head(colData(sce))[,c("x", "y")]
```
## tSNE metadata Plot
```{r}
###### NEEDS TO BE CHANGED
#### EITHER EMBEDDING OR LEM!
plot_metadata(sce, color_by = "Cluster")
#
# plot_metadata(sce, color_by = "Cluster", facet_by = "Timepoint", coords = c("iPCA", 1, 2))
```
## tSNE gene Plot
```{r}
###### NEEDS TO BE CHANGED
#### EITHER EMBEDDING OR LEM!
# plot_tsne_gene(sce, gene = "Ccr7")
#
# plot_tsne_gene(sce, gene = c("Ccr7", "Lcn2"))
#
# plot_tsne_gene(sce, gene = c("Tnf"), facet_by = "Timepoint")
```
## Violin Plot
```{r}
# plot_violin(sce, gene = "Ccr7", color_by = "Cluster")
#
# plot_violin(sce, gene = "Ccr7", color_by = "Cluster", assay = "counts")
#
# plot_violin(sce, gene = c("Ccr7", "Lcn2"), color_by = "Cluster") # NEW
#
# plot_violin(sce, gene = c("Tnf"), facet_by = "Timepoint", color_by = "Cluster")
```
# Bulk Analysis
```{r}
# sce@elementMetadata # use for bulk values??
```
# To Do
# Change the dim reduce
### Quantitative clustering
### Test file Sizes on LARGE Data!
### dgC vs dgR matrix performance test