-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatch_mutations.R
81 lines (64 loc) · 3.2 KB
/
match_mutations.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
library(tidyverse)
# RdRP --------------------------------------------------------------------
stanf_rdrp <- read_csv("data/2024.01.12_RdRP_inhibitors.csv") %>%
select("stanford_mutation" = Mutation) %>%
mutate(stanford_position = as.numeric(str_sub(stanford_mutation, 2, -2)),
stanford_reference = str_sub(stanford_mutation, 1, 1),
stanford_mutated = str_sub(stanford_mutation, -1, -1)) %>%
mutate(nextclade_position = stanford_position - 9) %>%
add_column("Gene" = "RdRP")
# Read nextclade mutations
# Nanopore 2023
mutatations_raw <- read_tsv("N:/Virologi/NGS/1-NGS-Analyser/1-Rutine/2-Resultater/SARS-CoV-2/1-Nanopore/2023/OppsettNGSSEQ20231219-01B_summaries/OppsettNGSSEQ20231219-01B_summaries_and_Pangolin.csv")
nextclade_ORF1b_mutations <- mutatations_raw %>%
# Get only ORF1b
select(name, ORF1b) %>%
mutate("tmp" = str_split(ORF1b, ";")) %>%
# Unnest the list column (which is basically another pivot_longer)
unnest(tmp) %>%
# Remove tmp
select(-ORF1b) %>%
rename("ORF1b" = tmp) %>%
mutate(nextclade_position = as.numeric(str_sub(ORF1b, 2, -2)),
nextclade_reference = str_sub(ORF1b, 1, 1),
nextclade_mutated = str_sub(ORF1b, -1, -1))
# Add RdRP resistance mutations to our sequences analyzed by nextclade.
# NA means that the resistance mutation is not present
# NB! Check that it's correct
tmp <- left_join(nextclade_ORF1b_mutations, stanf_rdrp, by = "nextclade_position") %>%
# Rename NA in Gene column to "RdRP"
replace_na(list(Gene = "RdRP"))
# 3CLpro ------------------------------------------------------------------
stanf_3clpro <- read_csv("data/2024.01.12_3CLpro_inhibitors.csv") %>%
select("stanford_mutation" = Mutation) %>%
# Change del to "-"
mutate(stanford_mutation = str_replace(stanford_mutation, "del", "-")) %>%
mutate(stanford_position = as.numeric(str_sub(stanford_mutation, 2, -2)),
stanford_reference = str_sub(stanford_mutation, 1, 1),
stanford_mutated = str_sub(stanford_mutation, -1, -1)) %>%
mutate(nextclade_position = stanford_position + 3263) %>%
add_column("Gene" = "3CLpro")
nextclade_ORF1a_mutations <- mutatations_raw %>%
# Get only ORF1a
select(name, ORF1a) %>%
mutate("tmp" = str_split(ORF1a, ";")) %>%
# Unnest the list column (which is basically another pivot_longer)
unnest(tmp) %>%
# Remove tmp
select(-ORF1a) %>%
rename("ORF1a" = tmp) %>%
mutate(nextclade_position = as.numeric(str_sub(ORF1a, 2, -2)),
nextclade_reference = str_sub(ORF1a, 1, 1),
nextclade_mutated = str_sub(ORF1a, -1, -1))
# Add 3CLpro resistance mutations to our sequences analyzed by nextclade.
# NA means that the resistance mutation is not present
# NB! Check that it's correct
tmp2 <- left_join(nextclade_ORF1a_mutations, stanf_3clpro, by = "nextclade_position") %>%
# Rename NA in Gene column to "3CLpro"
replace_na(list(Gene = "3CLpro"))
# Join RdRP and 3CLpro together -------------------------------------------
df <- bind_rows(tmp, tmp2)
# For pretty sharing:
# pivot_wider on gene - values are resistance mutations. Might need to do separately for RdRP and 3CLpro.
# Read the 3CLpro inhibitors
CLPro_inhib <- read_csv("data/2023.01.05_3CLpro_inhibitors.csv")