Last update: 24-10-15
Last markdown compiled: 24-10-15
ToDo: - finished
knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE)
# dirs
## Sebastian @Linux
dir_moa <- "/home/isar/Dropbox/scn_moa/"
# data, revised 2023.11.29
# exp
load(file = paste0(dir_moa, "/data/final/", "gnp.pro.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.mic.RData"))
# results
load(file = paste0(dir_moa, "/data/final/", "dex.all.RData"))
load(file = paste0(dir_moa, "/data/final/", "timeomics.singles.RData"))
load(file = paste0(dir_moa, "/data/final/", "res.wgcna.RData"))
## packages
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))
# load packagea
sapply(packages_general, require, character.only = TRUE)
sapply(packages_pca, require, character.only = TRUE)
sapply(packages_cluster, require, character.only = TRUE)
sapply(packages_tables, require, character.only = TRUE)
# prep result df
res.cluster.mrn <- gnp.mrn$san_mrn %>% dplyr::select(id) # is idL
res.cluster.pro <- gnp.pro$san_pro %>% dplyr::select(id)
res.cluster.mic <- gnp.mic$san_mic %>% dplyr::select(id)
# rownames= samples, colnames = SYMBOLS
## exp_pre
exp_4clust_pro_pre <- gnp.pro$exp_pro_vsn #
exp_4clust_mrn_pre <- gnp.mrn$exp_mrn_tpm_vsn_NIPALS #
exp_4clust_mic_pre <- gnp.mic$exp_mic_vsn_NIPALS
# colnames = SYMBOLS
colnames(exp_4clust_pro_pre) <- gnp.mrn$fan$SYMBOL[match(colnames(exp_4clust_pro_pre), gnp.mrn$fan$UNIPROT)]
colnames(exp_4clust_mrn_pre) <- gnp.mrn$fan$SYMBOL[match(colnames(exp_4clust_pro_pre), gnp.mrn$fan$ENSG)]
colnames(exp_4clust_mic_pre) # hsa
# rownames = sample label
rownames(exp_4clust_pro_pre) # lab
rownames(exp_4clust_mrn_pre) # lab
rownames(exp_4clust_mic_pre) # lab
# CENTER AND SCALE
exp_4clust_pro <- as.data.frame(scale(exp_4clust_pro_pre, center = TRUE, scale = TRUE))
exp_4clust_mrn <- as.data.frame(scale(exp_4clust_mrn_pre, center = TRUE, scale = TRUE))
exp_4clust_mic <- as.data.frame(scale(exp_4clust_mic_pre, center = TRUE, scale = TRUE))
Ward’s minimum variance method is a total sums of squares based clustering method. It minimizes the variance in each cluster. It tends to produce clusters of relatively equal sizes and shapes, and it is robust to outliers but is computationally intensive.
plot(res.wardD_mrn, cex = 0.6, hang = -1, main = "Transcriptome: WardD k=5 and k=7")
rect.hclust(res.wardD_mrn, k = 5, border = 2:5)
rect.hclust(res.wardD_mrn, k = 7, border = 2:5)
cut7.wardD_mrn = cutree(res.wardD_mrn, 7)
cut7.wardD_mrn_4ext <- as.data.frame(cut7.wardD_mrn)
cut7.wardD_mrn_4ext$id <- rownames(cut7.wardD_mrn_4ext)
# extract
res.cluster.mrn$euk.7 <- cut7.wardD_mrn_4ext$cut7.wardD_mrn[match(res.cluster.mrn$id, cut7.wardD_mrn_4ext$id)]
### dendrogram
plot(res.wardD_mic, cex = 0.6, hang = -1, main = "microRNA: WardD k=4 and k=5")
rect.hclust(res.wardD_mic, k = 4, border = 2:5)
rect.hclust(res.wardD_mic, k = 5, border = 2:5)
# extract
cut5.wardD_mic = cutree(res.wardD_mic, 5)
cut5.wardD_mic <- as.data.frame(cut5.wardD_mic)
cut5.wardD_mic$id <- rownames(cut5.wardD_mic)
res.cluster.mic$euk.5 <- cut5.wardD_mic$cut5.wardD_mic[match(res.cluster.mic$id, cut5.wardD_mic$id)]
### dendrogram
plot(res.wardD_pro, cex = 0.6, hang = -1, main = "Proteome: WardD k=3,4,5,6")
rect.hclust(res.wardD_pro, k = 3, border = 2:5)
rect.hclust(res.wardD_pro, k = 4, border = 2:5)
rect.hclust(res.wardD_pro, k = 5, border = 2:5)
rect.hclust(res.wardD_pro, k = 6, border = 2:5)
# euclidean doent work well for pro!
cut3.wardD_pro = cutree(res.wardD_pro, 3)
cut3.wardD_pro <- as.data.frame(cut3.wardD_pro)
cut3.wardD_pro$id <- rownames(cut3.wardD_pro)
cut4.wardD_pro = cutree(res.wardD_pro, 4)
cut4.wardD_pro <- as.data.frame(cut4.wardD_pro)
cut4.wardD_pro$id <- rownames(cut4.wardD_pro)
res.cluster.pro$euk.4 <- cut4.wardD_pro$cut4.wardD_pro[match(res.cluster.pro$id, cut4.wardD_pro$id)]
res.cluster.pro$euk.3 <- cut3.wardD_pro$cut3.wardD_pro[match(res.cluster.pro$id, cut4.wardD_pro$id)]
set.seed(1306)
# function to asses
asses_custerSize <- function(exp) {
# WSS
plot_wss <- fviz_nbclust(exp, FUN = hcut, method = "wss") +
labs(title= "Sums of squares")
# average silhouette width
plot_sil <- fviz_nbclust(exp, FUN = hcut, method = "silhouette") +
labs(title= "Silhouette width")
# gap statistics
gap_stat <- clusGap(exp, FUN = hcut, nstart = 25, K.max = 10, B = 50)
plot_gap <- fviz_gap_stat(gap_stat) + labs(title= "Gap statistic")
# distance matrix
dist <- dist(exp, method = "euclidean")
plot_DistMat <- fviz_dist(dist,
lab_size = 5,
gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
plot_DistMat <- plot_DistMat + ggtitle("Distance matrix")
# collect plots
plot_list <- list(
wss = plot_wss,
sil = plot_sil,
gap = plot_gap,
DistMat = plot_DistMat)
return(plot_list)
}
# calc kmeans
calc_kmeans <- function(df, centers_nr, nstart_nr){
set.seed(123)
k <- kmeans(df, centers = centers_nr, nstart = nstart_nr)
return(k)
}
# kMeans and plotting
plot_kmeans <- function(exp, k, title) {
set.seed(123)
# calculate
kmeans <- calc_kmeans(exp, k, 50)
# plot
plot <- fviz_cluster(
kmeans,
data = exp,
labelsize = 8,
repel = T ) +
ggtitle(title)
return(plot)
}
fviz_dist(dist_mrn, lab_size = 5, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) +
ggtitle("Distance matrix transcriptome") +
theme(text = element_text(size=12),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12))
fviz_dist(dist_pro, lab_size = 5, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) + ggtitle("Distance matrix proteome")+
theme(text = element_text(size=12),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12))
fviz_dist(dist_mic, lab_size = 5, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) + ggtitle("Distance matrix microRNA")+
theme(text = element_text(size=12),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12))
## exp_pre
exp_4clust_pro_pre <- gnp.pro$exp_pro_vsn #
exp_4clust_mrn_pre <- gnp.mrn$exp_mrn_tpm_vsn_NIPALS #
exp_4clust_mic_pre <- gnp.mic$exp_mic_vsn_NIPALS
# rownames = sample label
rownames(exp_4clust_pro_pre) # lab
rownames(exp_4clust_mrn_pre) # lab
rownames(exp_4clust_mic_pre) # lab
# center data
exp_4clust_pro <- as.data.frame(scale(exp_4clust_pro_pre, center = T, scale = F))
exp_4clust_mrn <- as.data.frame(scale(exp_4clust_mrn_pre, center = T, scale = F))
exp_4clust_mic <- as.data.frame(scale(exp_4clust_mic_pre, center = T, scale = F))
exp_4clust_pro.sc <- as.data.frame(scale(exp_4clust_pro_pre, center = T, scale = T))
exp_4clust_mrn.sc <- as.data.frame(scale(exp_4clust_mrn_pre, center = T, scale = T))
exp_4clust_mic.sc <- as.data.frame(scale(exp_4clust_mic_pre, center = T, scale = T))
# calc pca
res.pca.mrn <- prcomp(exp_4clust_mrn)
res.pca.mic <- prcomp(exp_4clust_mic)
res.pca.pro <- prcomp(exp_4clust_pro)
res.pca.mrn.sc <- prcomp(exp_4clust_mrn.sc)
res.pca.mic <- prcomp(exp_4clust_mic)
res.pca.pro <- prcomp(exp_4clust_pro)
# scree
fviz_eig(res.pca.mrn, main = 'unscaled')
fviz_eig(res.pca.mrn.sc, main = 'scaled')
fviz_eig(res.pca.mic)
fviz_eig(res.pca.pro)
# check PCA
fviz_pca_ind(res.pca.mrn, geom = "text")
fviz_pca_ind(res.pca.mic, geom = "text")
fviz_pca_ind(res.pca.pro, geom = "text")
# extract data
extract_PCs <- function(res.pca){
dat.pca <- res.pca$x %>% as.data.frame() %>%
rownames_to_column('fid') %>%
dplyr::select(fid, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8) %>%
column_to_rownames('fid')
dat.pca$id <- rownames(dat.pca)
return(dat.pca)
}
dat.pca.mrn <- extract_PCs(res.pca.mrn)
dat.pca.mic <- extract_PCs(res.pca.mic)
dat.pca.pro <- extract_PCs(res.pca.pro)
## add data of interest
# stage
dat.pca.mrn <- add_stage(dat.pca.mrn, gnp.mrn$san_mrn, gnp.mrn$levels_mrn)
dat.pca.mic <- add_stage(dat.pca.mic, gnp.mic$san_mic, gnp.mic$levels_mic)
dat.pca.pro <- add_stage(dat.pca.pro, gnp.pro$san_pro, gnp.pro$levels_pro)
## cluster results
dat.pca.mrn$clu <- res.cluster.mrn$euk.7[match(dat.pca.mrn$id, res.cluster.mrn$id)]
dat.pca.mic$clu <- res.cluster.mic$euk.5[match(dat.pca.mic$id, res.cluster.mic$id)]
dat.pca.pro$clu <- res.cluster.pro$km.k3[match(dat.pca.pro$id, res.cluster.pro$id)]
dat.pca.pro$euk.4 <- res.cluster.pro$euk.4[match(dat.pca.pro$id, res.cluster.pro$id)]
dat.pca.pro$km.3 <- res.cluster.pro$km.k3[match(dat.pca.pro$id, res.cluster.pro$id)]
dat.pca.pro$km.5 <- res.cluster.pro$km.k5[match(dat.pca.pro$id, res.cluster.pro$id)]
## PCA dotplots with kMeans
# fct
plot_pca.gnp <- function(res.pca, dat.pca, title){
# get vals
eigen.vals <- get_eig(res.pca)
x_val <- round(eigen.vals$variance.percent[1],0)
y_val <- round(eigen.vals$variance.percent[2],0)
# plot
dat.pca %>%
ggplot(aes(x=PC1, y=PC2)) +
#geom_circle(aes(group = clu), alpha=0.8, show.legend = F) +
geom_point(aes(colour = stage), size = 8) +
#geom_encircle(aes(group = clu), alpha = 0.5, show.legend = F, linetype = "dotted", size = 3) +
scale_color_manual(values = colors_stage) +
#ggtitle(title) +
xlab(paste0("Variance PC1: ", x_val, "%")) +
ylab(paste0("Variance PC2: ", y_val, "%"))# +
#theme(legend.position="none")
}
# mrn
plot_pca.gnp(res.pca.mrn, dat.pca.mrn, 'Transcriptome') +
scale_x_reverse()
# mic
plot_pca.gnp(res.pca.mic, dat.pca.mic, 'microRNA') +
scale_x_reverse() # + scale_y_reverse()
# pro
# km5
dat.pca.pro$clu <- dat.pca.pro$km.3 # is best
#dat.pca.pro$clu <- dat.pca.pro$euk.3
plot_pca.gnp(res.pca.pro, dat.pca.pro, 'Proteome') +
scale_x_reverse() + scale_y_reverse()
## PCA dotplots with kMeans
# fct
plot_myPCA_km <- function(res.pca, dat.pca, title){
# get vals
eigen.vals <- get_eig(res.pca)
x_val <- round(eigen.vals$variance.percent[1],0)
y_val <- round(eigen.vals$variance.percent[2],0)
# plot
dat.pca %>%
ggplot(aes(x=PC1, y=PC2)) +
geom_point(aes(colour = stage), size = 9) +
scale_color_manual(values = colors_stage) +
geom_encircle(aes(group = clu), alpha=0.4, show.legend = F, linetype = 'dashed', size = 2.5) +
#ggtitle(title) +
xlab(paste0("Variance PC1: ", x_val, "%")) +
ylab(paste0("Variance PC2: ", y_val, "%")) +
theme_light(base_size = 20) +
theme(legend.position="none")
}
# mrn
p.pca.mrn <- plot_myPCA_km(res.pca.mrn, dat.pca.mrn, 'Transcriptome') +
scale_x_reverse()
p.pca.mrn
ggsave(
filename = paste0(dir_moa, "/figures/fig1/", "p.pca.mrn", ".png"),
plot = p.pca.mrn,
device = png,
width = 6,
height = 6)
ggsave(
filename = paste0(dir_moa, "/figures/fig1/", "p.pca.mrn", ".svg"),
plot = p.pca.mrn,
device = svg,
width = 6,
height = 6)
# mic
p.pca.mic <- plot_myPCA_km(res.pca.mic, dat.pca.mic, 'microRNA') +
scale_x_reverse()
p.pca.mic
ggsave(
filename = paste0(dir_moa, "/figures/fig1/", "p.pca.mic", ".png"),
plot = p.pca.mic,
device = png,
width = 6,
height = 6)
ggsave(
filename = paste0(dir_moa, "/figures/fig1/", "p.pca.mic", ".svg"),
plot = p.pca.mic,
device = svg,
width = 6,
height = 6)
# pro
p.pca.pro <- plot_myPCA_km(res.pca.pro, dat.pca.pro, 'Proteome') +
scale_x_reverse() + scale_y_reverse()
p.pca.pro
ggsave(
filename = paste0(dir_moa, "/figures/fig1/", "p.pca.pro", ".png"),
plot = p.pca.pro,
device = png,
width = 6,
height = 6)
ggsave(
filename = paste0(dir_moa, "/figures/fig1/", "p.pca.pro", ".svg"),
plot = p.pca.pro,
device = svg,
width = 6,
height = 6)