Last update: 24-10-15

Last markdown compiled: 24-10-15

ToDo: - finished

knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE) 

# dirs
## Sebastian @Linux
dir_moa <- "/home/isar/Dropbox/scn_moa/"

# data, revised 2023.11.29
# exp
load(file = paste0(dir_moa, "/data/final/", "gnp.pro.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.mic.RData"))
# results
load(file = paste0(dir_moa, "/data/final/", "dex.all.RData"))
load(file = paste0(dir_moa, "/data/final/", "timeomics.singles.RData"))
load(file = paste0(dir_moa, "/data/final/", "res.wgcna.RData"))

## packages
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))


# load packagea
sapply(packages_general, require, character.only = TRUE)
sapply(packages_pca, require, character.only = TRUE)
sapply(packages_cluster, require, character.only = TRUE)
sapply(packages_tables, require, character.only = TRUE)

CLUSTER

prep

# prep result df
res.cluster.mrn <- gnp.mrn$san_mrn %>% dplyr::select(id) # is idL
res.cluster.pro <- gnp.pro$san_pro %>% dplyr::select(id)
res.cluster.mic <- gnp.mic$san_mic %>% dplyr::select(id)


# rownames=  samples, colnames = SYMBOLS

## exp_pre
exp_4clust_pro_pre <- gnp.pro$exp_pro_vsn #
exp_4clust_mrn_pre <- gnp.mrn$exp_mrn_tpm_vsn_NIPALS # 
exp_4clust_mic_pre <- gnp.mic$exp_mic_vsn_NIPALS

# colnames = SYMBOLS
colnames(exp_4clust_pro_pre) <- gnp.mrn$fan$SYMBOL[match(colnames(exp_4clust_pro_pre), gnp.mrn$fan$UNIPROT)]
colnames(exp_4clust_mrn_pre) <- gnp.mrn$fan$SYMBOL[match(colnames(exp_4clust_pro_pre), gnp.mrn$fan$ENSG)]
colnames(exp_4clust_mic_pre) # hsa

# rownames = sample label
rownames(exp_4clust_pro_pre) # lab
rownames(exp_4clust_mrn_pre) # lab
rownames(exp_4clust_mic_pre) # lab

# CENTER AND SCALE
exp_4clust_pro <- as.data.frame(scale(exp_4clust_pro_pre, center = TRUE, scale = TRUE))
exp_4clust_mrn <- as.data.frame(scale(exp_4clust_mrn_pre, center = TRUE, scale = TRUE))
exp_4clust_mic <- as.data.frame(scale(exp_4clust_mic_pre, center = TRUE, scale = TRUE))

euklidean

select best method

Ward’s minimum variance method is a total sums of squares based clustering method. It minimizes the variance in each cluster. It tends to produce clusters of relatively equal sizes and shapes, and it is robust to outliers but is computationally intensive.

transcriptome

all

plot(res.wardD_mrn, cex = 0.6, hang = -1, main = "Transcriptome: WardD k=5 and k=7")
rect.hclust(res.wardD_mrn, k = 5, border = 2:5)
rect.hclust(res.wardD_mrn, k = 7, border = 2:5)

cut7.wardD_mrn = cutree(res.wardD_mrn, 7)

cut7.wardD_mrn_4ext <- as.data.frame(cut7.wardD_mrn)
cut7.wardD_mrn_4ext$id <- rownames(cut7.wardD_mrn_4ext)

# extract
res.cluster.mrn$euk.7 <- cut7.wardD_mrn_4ext$cut7.wardD_mrn[match(res.cluster.mrn$id, cut7.wardD_mrn_4ext$id)]

mRNA

lncRNA

mic

### dendrogram
plot(res.wardD_mic, cex = 0.6, hang = -1, main = "microRNA: WardD k=4 and k=5")
rect.hclust(res.wardD_mic, k = 4, border = 2:5)
rect.hclust(res.wardD_mic, k = 5, border = 2:5)

# extract
cut5.wardD_mic = cutree(res.wardD_mic, 5)
cut5.wardD_mic <- as.data.frame(cut5.wardD_mic)
cut5.wardD_mic$id <- rownames(cut5.wardD_mic)

res.cluster.mic$euk.5 <- cut5.wardD_mic$cut5.wardD_mic[match(res.cluster.mic$id, cut5.wardD_mic$id)]

pro

### dendrogram
plot(res.wardD_pro, cex = 0.6, hang = -1, main = "Proteome: WardD k=3,4,5,6")
rect.hclust(res.wardD_pro, k = 3, border = 2:5)
rect.hclust(res.wardD_pro, k = 4, border = 2:5)
rect.hclust(res.wardD_pro, k = 5, border = 2:5)
rect.hclust(res.wardD_pro, k = 6, border = 2:5)

# euclidean doent work well for pro!

cut3.wardD_pro = cutree(res.wardD_pro, 3)
cut3.wardD_pro <- as.data.frame(cut3.wardD_pro)
cut3.wardD_pro$id <- rownames(cut3.wardD_pro)

cut4.wardD_pro = cutree(res.wardD_pro, 4)
cut4.wardD_pro <- as.data.frame(cut4.wardD_pro)
cut4.wardD_pro$id <- rownames(cut4.wardD_pro)

res.cluster.pro$euk.4 <- cut4.wardD_pro$cut4.wardD_pro[match(res.cluster.pro$id, cut4.wardD_pro$id)]
res.cluster.pro$euk.3 <- cut3.wardD_pro$cut3.wardD_pro[match(res.cluster.pro$id, cut4.wardD_pro$id)]

kMeans

fct

set.seed(1306)

# function to asses
asses_custerSize <- function(exp) {
  # WSS
  plot_wss <- fviz_nbclust(exp, FUN = hcut, method = "wss") +
    labs(title= "Sums of squares") 
  # average silhouette width
  plot_sil <- fviz_nbclust(exp, FUN = hcut, method = "silhouette") +
    labs(title= "Silhouette width")
  # gap statistics
  gap_stat <- clusGap(exp, FUN = hcut, nstart = 25, K.max = 10, B = 50)
  plot_gap <- fviz_gap_stat(gap_stat) + labs(title= "Gap statistic")
  # distance matrix
  dist <- dist(exp, method = "euclidean")
  plot_DistMat <- fviz_dist(dist, 
                            lab_size = 5,
                            gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
  plot_DistMat <- plot_DistMat + ggtitle("Distance matrix")
  # collect plots
  plot_list <- list(
    wss = plot_wss,
    sil = plot_sil,
    gap = plot_gap,
    DistMat = plot_DistMat)
  return(plot_list)
}


# calc kmeans
calc_kmeans <- function(df, centers_nr, nstart_nr){
  set.seed(123)
  k <- kmeans(df, centers = centers_nr, nstart = nstart_nr)
  return(k)
}

# kMeans and plotting
plot_kmeans <- function(exp, k, title) {
  set.seed(123)
  # calculate
  kmeans <- calc_kmeans(exp, k, 50)
  # plot
  plot <- fviz_cluster(
    kmeans, 
    data = exp, 
    labelsize = 8,
    repel = T ) +
    ggtitle(title)
  return(plot)
}

Cluster size

Clustering

pro

Distance matrix

fviz_dist(dist_mrn, lab_size = 5, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) +
  ggtitle("Distance matrix transcriptome") +
  theme(text = element_text(size=12),
        axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))

fviz_dist(dist_pro, lab_size = 5, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) + ggtitle("Distance matrix proteome")+
  theme(text = element_text(size=12),
        axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))

fviz_dist(dist_mic, lab_size = 5, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) + ggtitle("Distance matrix microRNA")+
  theme(text = element_text(size=12),
        axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))

PCA

without clusters

prep

## exp_pre
exp_4clust_pro_pre <- gnp.pro$exp_pro_vsn #
exp_4clust_mrn_pre <- gnp.mrn$exp_mrn_tpm_vsn_NIPALS # 
exp_4clust_mic_pre <- gnp.mic$exp_mic_vsn_NIPALS

# rownames = sample label
rownames(exp_4clust_pro_pre) # lab
rownames(exp_4clust_mrn_pre) # lab
rownames(exp_4clust_mic_pre) # lab

# center data
exp_4clust_pro <- as.data.frame(scale(exp_4clust_pro_pre, center = T, scale = F))
exp_4clust_mrn <- as.data.frame(scale(exp_4clust_mrn_pre, center = T, scale = F))
exp_4clust_mic <- as.data.frame(scale(exp_4clust_mic_pre, center = T, scale = F))

exp_4clust_pro.sc <- as.data.frame(scale(exp_4clust_pro_pre, center = T, scale = T))
exp_4clust_mrn.sc <- as.data.frame(scale(exp_4clust_mrn_pre, center = T, scale = T))
exp_4clust_mic.sc <- as.data.frame(scale(exp_4clust_mic_pre, center = T, scale = T))

# calc pca
res.pca.mrn <- prcomp(exp_4clust_mrn)
res.pca.mic <- prcomp(exp_4clust_mic)
res.pca.pro <- prcomp(exp_4clust_pro)

res.pca.mrn.sc <- prcomp(exp_4clust_mrn.sc)
res.pca.mic <- prcomp(exp_4clust_mic)
res.pca.pro <- prcomp(exp_4clust_pro)

# scree
fviz_eig(res.pca.mrn, main = 'unscaled')

fviz_eig(res.pca.mrn.sc, main = 'scaled')

fviz_eig(res.pca.mic)

fviz_eig(res.pca.pro)

# check PCA
fviz_pca_ind(res.pca.mrn, geom = "text")

fviz_pca_ind(res.pca.mic, geom = "text")

fviz_pca_ind(res.pca.pro, geom = "text")

# extract data
extract_PCs <- function(res.pca){
  dat.pca <- res.pca$x %>% as.data.frame() %>%
    rownames_to_column('fid') %>% 
    dplyr::select(fid, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8) %>% 
    column_to_rownames('fid')
  dat.pca$id <- rownames(dat.pca)
  return(dat.pca)
}

dat.pca.mrn <- extract_PCs(res.pca.mrn)
dat.pca.mic <- extract_PCs(res.pca.mic)
dat.pca.pro <- extract_PCs(res.pca.pro)


## add data of interest

# stage
dat.pca.mrn <- add_stage(dat.pca.mrn, gnp.mrn$san_mrn, gnp.mrn$levels_mrn)
dat.pca.mic <- add_stage(dat.pca.mic, gnp.mic$san_mic, gnp.mic$levels_mic)
dat.pca.pro <- add_stage(dat.pca.pro, gnp.pro$san_pro, gnp.pro$levels_pro)

## cluster results
dat.pca.mrn$clu <- res.cluster.mrn$euk.7[match(dat.pca.mrn$id, res.cluster.mrn$id)]
dat.pca.mic$clu <- res.cluster.mic$euk.5[match(dat.pca.mic$id, res.cluster.mic$id)]
dat.pca.pro$clu <- res.cluster.pro$km.k3[match(dat.pca.pro$id, res.cluster.pro$id)]

dat.pca.pro$euk.4 <- res.cluster.pro$euk.4[match(dat.pca.pro$id, res.cluster.pro$id)]
dat.pca.pro$km.3 <- res.cluster.pro$km.k3[match(dat.pca.pro$id, res.cluster.pro$id)]
dat.pca.pro$km.5 <- res.cluster.pro$km.k5[match(dat.pca.pro$id, res.cluster.pro$id)]

plot

## PCA dotplots with kMeans

# fct
plot_pca.gnp <- function(res.pca, dat.pca, title){
  # get vals
  eigen.vals <- get_eig(res.pca)
  x_val <- round(eigen.vals$variance.percent[1],0)
  y_val <- round(eigen.vals$variance.percent[2],0)
  # plot
  dat.pca %>%
    ggplot(aes(x=PC1, y=PC2)) +
    #geom_circle(aes(group = clu), alpha=0.8, show.legend = F) +
    geom_point(aes(colour = stage), size = 8) +
    #geom_encircle(aes(group = clu), alpha = 0.5, show.legend = F, linetype =  "dotted", size = 3) +
    scale_color_manual(values = colors_stage) +
    #ggtitle(title) +
    xlab(paste0("Variance PC1: ", x_val, "%")) +
    ylab(paste0("Variance PC2: ", y_val, "%"))# +
    #theme(legend.position="none")
}

# mrn
plot_pca.gnp(res.pca.mrn, dat.pca.mrn, 'Transcriptome') +
  scale_x_reverse()

# mic
plot_pca.gnp(res.pca.mic, dat.pca.mic, 'microRNA') +
  scale_x_reverse() # + scale_y_reverse() 

# pro

# km5
dat.pca.pro$clu <- dat.pca.pro$km.3 # is best
#dat.pca.pro$clu <- dat.pca.pro$euk.3

plot_pca.gnp(res.pca.pro, dat.pca.pro, 'Proteome') +
  scale_x_reverse() + scale_y_reverse() 

with clusters

## PCA dotplots with kMeans

# fct
plot_myPCA_km <- function(res.pca, dat.pca, title){
  # get vals
  eigen.vals <- get_eig(res.pca)
  x_val <- round(eigen.vals$variance.percent[1],0)
  y_val <- round(eigen.vals$variance.percent[2],0)
  # plot
  dat.pca %>%
    ggplot(aes(x=PC1, y=PC2)) +
    geom_point(aes(colour = stage), size = 9) +
    scale_color_manual(values = colors_stage) +
    geom_encircle(aes(group = clu), alpha=0.4, show.legend = F, linetype = 'dashed', size = 2.5) +
    #ggtitle(title) +
    xlab(paste0("Variance PC1: ", x_val, "%")) +
    ylab(paste0("Variance PC2: ", y_val, "%")) +
    theme_light(base_size = 20) +
    theme(legend.position="none")
}

# mrn
p.pca.mrn <- plot_myPCA_km(res.pca.mrn, dat.pca.mrn, 'Transcriptome') +
  scale_x_reverse()

p.pca.mrn

ggsave(
  filename = paste0(dir_moa, "/figures/fig1/", "p.pca.mrn", ".png"),
  plot = p.pca.mrn,
  device = png,
  width = 6,
  height = 6)

ggsave(
  filename = paste0(dir_moa, "/figures/fig1/", "p.pca.mrn", ".svg"),
  plot = p.pca.mrn,
  device = svg,
  width = 6,
  height = 6)


# mic
p.pca.mic <- plot_myPCA_km(res.pca.mic, dat.pca.mic, 'microRNA') +
  scale_x_reverse()

p.pca.mic

ggsave(
  filename = paste0(dir_moa, "/figures/fig1/", "p.pca.mic", ".png"),
  plot = p.pca.mic,
  device = png,
  width = 6,
  height = 6)

ggsave(
  filename = paste0(dir_moa, "/figures/fig1/", "p.pca.mic", ".svg"),
  plot = p.pca.mic,
  device = svg,
  width = 6,
  height = 6)



# pro
p.pca.pro <- plot_myPCA_km(res.pca.pro, dat.pca.pro, 'Proteome') +
  scale_x_reverse() + scale_y_reverse() 

p.pca.pro

ggsave(
  filename = paste0(dir_moa, "/figures/fig1/", "p.pca.pro", ".png"),
  plot = p.pca.pro,
  device = png,
  width = 6,
  height = 6)

ggsave(
  filename = paste0(dir_moa, "/figures/fig1/", "p.pca.pro", ".svg"),
  plot = p.pca.pro,
  device = svg,
  width = 6,
  height = 6)