-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdrug_resistance.R
118 lines (89 loc) · 5.78 KB
/
drug_resistance.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
### Stanford resistance information ###
# Load the CSV file with resistance information for the 3CLpro gene from Stanford. Downloaded from here: https://covdb.stanford.edu/drms/3clpro/
data <- read.csv("N:/Virologi/JonBrate/Drug resistance SARSCOV2/data/2024.01.12_3CLpro_inhibitors.csv")
# Rename columns
colnames(data)[1] <- "stanford_res"
colnames(data)[2] <- "NTV_fold"
# Replace "del" with "-" in the "stanford_res" column
data$stanford_res <- gsub("del", "-", data$stanford_res)
# Create "stanford_position" by extracting the numbers from the mutation list
data$stanford_position <- as.numeric(gsub("[^0-9]", "", data$stanford_res))
# Create "stanford_reference" by extracting the first character of the mutation list
data$stanford_reference <- substr(data$stanford_res, 1, 1)
# Create "stanford_mutated" by extracting the last character of the mutation list
data$stanford_mutated <- substr(data$stanford_res, nchar(data$stanford_res), nchar(data$stanford_res))
# Create "nextclade_position" by adding 3263 to "stanford_position".
# 3263 is the shift in genome position to get the start of the nsp5 gene
data$nextclade_position <- data$stanford_position + 3263
### Nextclade output ###
# Read the nextclade output to get the different mutations
nextclade <- read.table("C:/Users/jonr/Downloads/OppsettNGSSEQ20240125-01A_NextcladeAndPangolin.csv",
header = TRUE,
sep = "\t",
stringsAsFactors = FALSE)
# Identify the different NSP5 mutations and convert the nomenclature to match the Stanford nomenclature.
# The nsp5 gene is in ORF1a
# Select necessary columns
data_selected <- nextclade[c("name", "ORF1a")]
# Split the "ORF1a" column into lists of the different mutations based on the ";" character
data_selected$tmp <- strsplit(as.character(data_selected$ORF1a), ";")
# Expand the lists into rows
# Adjust the lapply function to handle empty or NULL lists
data_expanded <- do.call(rbind, lapply(seq_along(data_selected$tmp), function(i) {
if (length(data_selected$tmp[[i]]) == 0) { # If the list is empty
return(data.frame(name = data_selected$name[i], ORF1a = NA, stringsAsFactors = FALSE))
} else {
return(data.frame(name = data_selected$name[i], ORF1a = unlist(data_selected$tmp[[i]]), stringsAsFactors = FALSE))
}
}))
# Ensure that row names are not problematic by resetting them
rownames(data_expanded) <- NULL
# Tease out the position, reference, and mutated amino acid
data_expanded$nextclade_position <- as.numeric(substring(data_expanded$ORF1a, 2, nchar(data_expanded$ORF1a) - 1))
data_expanded$nextclade_reference <- substring(data_expanded$ORF1a, 1, 1)
data_expanded$nextclade_mutated <- substring(data_expanded$ORF1a, nchar(data_expanded$ORF1a), nchar(data_expanded$ORF1a))
# Filter out only the nsp5 gene (aa 3264 to 3569)
data_filtered <- subset(data_expanded, nextclade_position >= 3264 & nextclade_position <= 3569)
# Convert to stanford nomenclature and create column for nsp5 mutations
data_filtered$stanford_position <- data_filtered$nextclade_position - 3263
data_filtered$NSP5 <- with(data_filtered, paste(nextclade_reference, stanford_position, nextclade_mutated, sep = ""))
# Remove unnecessary columns
data_final <- data_filtered[c("name", "ORF1a", "nextclade_position", "NSP5")]
### Merge Stanford positions with our data ###
# Join 'data_final' with 'data' on the NSP5 and stanford_res columns. Only keeping rows in the data_final object
joined_data <- merge(data_final, data, by.x = "NSP5", by.y = "stanford_res", all.x = TRUE)
# Create a column "type" based on different NTV fold values
joined_data$type <- ifelse(joined_data$NTV_fold >= 10, "pax_high_res",
ifelse(joined_data$NTV_fold >= 5, "pax_mid_res",
ifelse(joined_data$NTV_fold >= 2.5, "pax_low_res",
"pax_very_low_res")))
# Collect all NSP5 mutations in a column named "NSP5"
aggregated_data <- aggregate(NSP5 ~ name, data = joined_data, FUN = function(x) paste(x, collapse = ";"))
# Create function that aggregates data based on the resistance levels in the "type" column
# type_name specifies the resistance levels. E.g. "pax_high_res"
# The dataset is first filtered on these categories. Then any mutations present are aggregated on the same sample (name)
aggregate_nsp5_by_type <- function(data, type_name) {
filtered_data <- subset(data, type == type_name)
# Check if there are any rows after filtering
if (nrow(filtered_data) == 0) {
# If no rows, create a placeholder data frame with names from the original dataset and empty values for the type
aggregated <- data.frame(name = unique(data$name), stringsAsFactors = FALSE)
aggregated[type_name] <- "" # Create an empty column for the type
} else {
# If rows exist, proceed with aggregation
aggregated <- aggregate(NSP5 ~ name, data = filtered_data, FUN = function(x) paste(unique(x), collapse = ";"))
colnames(aggregated)[2] <- type_name
}
return(aggregated)
}
# Re-run the aggregation for each resistance category (level) and merge back into aggregated_data as before
pax_high_res_data <- aggregate_nsp5_by_type(joined_data, "pax_high_res")
pax_mid_res_data <- aggregate_nsp5_by_type(joined_data, "pax_mid_res")
pax_low_res_data <- aggregate_nsp5_by_type(joined_data, "pax_low_res")
# Merge the aggregated data back into aggregated_data on the sample names
aggregated_data <- merge(aggregated_data, pax_high_res_data, by = "name", all.x = TRUE)
aggregated_data <- merge(aggregated_data, pax_mid_res_data, by = "name", all.x = TRUE)
aggregated_data <- merge(aggregated_data, pax_low_res_data, by = "name", all.x = TRUE)
### Write final data ###
# Write the final data as a tsv file
write.table(aggregated_data, file = "stanford_3clpro_resistance.tsv", sep = "\t", row.names = FALSE, quote = FALSE)