-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathutilities.R
118 lines (100 loc) · 3.59 KB
/
utilities.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
library(stringr)
sanitize <- function(string) {
result <- gsub("&", "\\&", string, fixed = TRUE)
result <- gsub("_", "\\_", result, fixed = TRUE)
result
}
#' Create a word bubble for genes weighted by abundance.
#' @param gene_abundance, a data frame of two cloumns
#' gene_name: name of the genes, chr
#' gene_abundance: weight, integer
#' @param max_num_words
#' @note it plots a word bubble
#' @note this function plots the top max_num_words words sorted by gene_abundance
#' @examples
#' abstracts <- getAbstracts("22238265")
#' counts <- cleanAbstracts(abstracts)
#' generate_word_bubble(counts) ## error to be expected
#' genecount=data.frame(gene_name=counts$word, gene_abundance=counts$freq)
#' generate_word_bubble(genecount)
#' generate_word_bubble(genecount, max_num_words=10)
generate_word_bubble <- function(gene_abundance, max_num_words=500) {
if(! require('PubMedWordcloud')) stop("Need PubMedWordcloud package")
## check if gene_abundance has the correct columns
if ( !all( c("gene_name", "gene_abundance")
%in% colnames(gene_abundance) ) ) stop(
"Expecting two columns:
gene_name as character
gene_abundance as integer")
counts <- data.frame(word=gene_abundance$gene_name,
freq=gene_abundance$gene_abundance )
plotWordCloud(counts,
scale=c(3,0.5),
min.freq=1, max.words=max_num_words,
rot.per = 0,
colors=c(colSets("Set1")[-6],colSets("Paired")))
}
#' convert date format to number of days.
#'
#' the format is "^[mdy]\d+\.\d*$" for example: m10, y7, d3.5, m3., m3.0.
#' @param vector of dates to convert
#' @return vector of days
mdy_to_day <- function(dates) {
stopifnot(check_date_format(dates))
mdy_letter <- get_mdy_letter(dates)
mdy_value <- get_mdy_value(dates)
letter_value <- data.frame(letter=mdy_letter, value=mdy_value)
convert_to_day(letter_value)
}
#' the format is "^[mdy]\d+\.?\d*$"
#' @return TRUE if format is correct else FALSE
check_date_format <- function(dates) {
pattern <- "^[mdy]\\d+\\.?\\d*$"
all(str_detect(dates, pattern))
}
# only get m, d or y
get_mdy_letter <- function(dates) {
date_letter <- "^[mdy]"
str_extract(dates, date_letter)
}
# only get number(ignore m, d, y)
get_mdy_value <- function(dates) {
date_number <- "\\d+\\.?\\d*$"
mdy_value <- str_extract(dates, date_number)
as.numeric(as.character(mdy_value))
}
#' lookup table used in convert_to_day f
mdy_to_day_lookup <- data.frame(
letter=c('d', 'm', 'y'),
days=c(1, 30.5, 366)
)
#' convert to days
#' @param datafram with 2 cols: letter(d, m, y) and value(numeric).
#' @return days
convert_to_day <- function(letter_value) {
#preserve m/d/y of letter_value
lookupTableTranslation <- match(letter_value$letter, mdy_to_day_lookup$letter)
# days here per unit
let_val_days <- cbind(letter_value,
"days"=mdy_to_day_lookup[lookupTableTranslation,"days"])
let_val_days$value * let_val_days$days
}
gg_color_hue <- function(n) {
hues = seq(15, 375, length=n+1)
hcl(h=hues, l=65, c=100)[1:n]
}
sortFactorTimepoints <- function(timepoints){
tps <- mdy_to_day(timepoints)
names(tps) <- timepoints
factor(timepoints, levels=unique(names(sort(tps))))
}
as.sortedFactor <- function(unsortedFactor){
factor(unsortedFactor, sort(unsortedFactor))
}
prepSiteList <- function(sites){
sites <- unname(unlist(GRangesList(sites)))
mcols(sites) <- merge(as.data.frame(mcols(sites)),
sets[,c("GTSP", "Timepoint", "CellType")])
sites$Timepoint <- sortFactorTimepoints(sites$Timepoint)
sites
}