Last update: 24-10-16
Last markdown compiled: 24-10-16
knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE)
# dirs
## Sebastian @Linux
dir_moa <- "/home/isar/Dropbox/scn_moa/"
# data, revised 2023.10.18
# exp
load(file = paste0(dir_moa, "/data/final/", "gnp.pro.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.mic.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.Isets.RData"))
# results
load(file = paste0(dir_moa, "/data/final/", "dex.all.RData"))
## packages
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))
# load packages
sapply(packages_general, require, character.only = TRUE)
#sapply(packages_pca, require, character.only = TRUE)
#sapply(packages_cluster, require, character.only = TRUE)
#sapply(packages_heatmap, require, character.only = TRUE)
library(propr)
library(timeOmics)
library(lmms)
# library(TCseq) # cMeans. discarded
library(gt)
set.seed(1409) # unfortunatly this does not seem to increase reproducibility
# save(
# pca.ncomp.mrn,
# tune.spca.mrn,
# pca.ncomp.pro,
# tune.spca.pro,
# pca.ncomp.mic,
# tune.spca.mic,
# pls.ncomp2,
# file = paste0(dir_moa, "/data/final/preps/timeomics/", "time.data.RData"))
## data
exp.mrn_4to <- gnp.mrn$exp_mrn_tpm_vsn_NIPALS # NIPALS imputed data
# glimpse(exp.mrn)
# filter out non difEx features
dex.mrn_4to <- dex.all$dex.mrn$dex_stage_mrn_vsn
dex.mrn.sigFID <- dex.mrn_4to %>% filter(adj.P.Val < 0.05 & abs(logFC) >= 1) %>% pull(fid)
length(unique(dex.mrn.sigFID)) # 7790
exp.mrn.sdx <- exp.mrn_4to[, colnames(exp.mrn_4to) %in% dex.mrn.sigFID]
# check
rownames(exp.mrn.sdx) # samples: ok
colnames(exp.mrn.sdx) # features (ENSG). Never changed or switched. ok!
# san
san.mrn_4to <- gnp.mrn$san_mrn
san.mrn_4to <- as.data.frame(san.mrn_4to)
rownames(san.mrn_4to) <- san.mrn_4to$id
identical(rownames(exp.mrn.sdx), rownames(san.mrn_4to)) # ok
# numeric vector containing the sample time point information
time_mrn <- san.mrn_4to$stage_nr # this is a vector that is already in the correct order. CAVE, only in mrn, NOT in pro
unique(sort(time_mrn)) # 1:7, no missing number
# create models
lmms.mrn <- lmms::lmmSpline(data = exp.mrn.sdx, # features filtered for dex
time = time_mrn,
sampleID = rownames(exp.mrn.sdx),
deri = FALSE,
basis = "cubic",
numCores = 10,
timePredict = 1:7, # max points
keepModels = TRUE)
# here its ok not to define the knots? In proteome analysis it wont accept it.
# remove noise
filter.mrn <- lmms.filter.lines(data = exp.mrn.sdx,
lmms.obj = lmms.mrn,
time = time_mrn)
exp.mrn.fin <- filter.mrn$filtered
dim(exp.mrn.sdx) # 34, 7789
dim(exp.mrn.fin) # 7, 7347
ncol(exp.mrn.sdx) - ncol(exp.mrn.fin)
# cubic: 428 removed (used in final)
# p-splines: 442 removed
# cubic p-splines: 875 removed
Numbers for features to check based on WGCNA
# check
nrow(exp.mrn.fin) # 7 stages
ncol(exp.mrn.fin) # 7361 features
### numbers from WGCNA
# cont up: 3266 ft
# cont down: 3467 ft
# up down: 706 ft
# down up: 277 ft
keepX_mrn_1d <- 5000
keepX_mrn_2d <- 1000
keepX_mrn_3d <- 200
keepX_mrn_4d <- 100
keepX_mrn_wgcna_4d <- c(keepX_mrn_1d, keepX_mrn_2d, keepX_mrn_3d, keepX_mrn_4d)
spca.mrn_wgcna <- spca(X = exp.mrn.fin,
ncomp = 4,
keepX = keepX_mrn_wgcna_4d,
scale = T, center = T)
plotLong(spca.mrn_wgcna, title = "mRNA trajectory, wgcna oriented numbers") #
# too many features seem not to adhere. Reduce!
# manual curation of WGCNA numbers
keepX_mrn_1d_c <- 4000
keepX_mrn_2d_c <- 750
keepX_mrn_3d_c <- 120
keepX_mrn_4d_c <- 50
keepX_mrn_wgcna_4d_c <- c(keepX_mrn_1d_c, keepX_mrn_2d_c, keepX_mrn_3d_c, keepX_mrn_4d_c)
spca.mrn <- spca(X = exp.mrn.fin,
ncomp = 4,
keepX = keepX_mrn_wgcna_4d_c,
scale = T, center = T)
plotLong(spca.mrn, title = "mRNA trajectory, manually curated")
# cleaner trajectories. Use these ones
# plot indiv
plotIndiv(spca.mrn) # not sure for what useful
## features of indiv
plotVar(spca.mrn) # not sure for what useful
## loadings:
plotLoadings(spca.mrn) # not sure for what useful
## get the features per cluster
trajec.mrn <- getCluster(spca.mrn)
# check some features
trajec.mrn %>% group_by(cluster) %>% top_n(contrib.max, n = 3) %>% arrange(cluster) %>% print(n = nrow(.))
# # cluster 1
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000166579') # up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000168591') # up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000197622') # up
#
# # cluster -1
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000100567') # down
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000147535') # down up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000150433') # down upish
#
# # cluster 2
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000066777') # up dn
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000157379') # up dn
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000169762') # up dn
#
# # cluster -2
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000068878') # dn up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000162650') # dn up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000227500') # dn up
#
# # cluster 3
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000010278') # up dn up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000135211') # up dn up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000187808') # up dn up
#
# # cluster -3
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000142528') # dn up dn
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000170004') # dn up dn
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000198231') # dn up dn
# medians of features in clusters are consistent!
## data
exp.pro_4to <- gnp.pro$exp_pro_vsn
# filter dex
dex.pro <- dex.all$dex.pro$dex_stage_pro_vsn
dex.pro.sigFID <- dex.pro %>% filter(adj.P.Val < 0.05 & abs(logFC) >= 1) %>% pull(fid)
length(unique(dex.pro.sigFID)) # 2026
exp.pro.sdx <- exp.pro_4to[, colnames(exp.pro_4to) %in% dex.pro.sigFID]
# prep
rownames(exp.pro.sdx) # samples
colnames(exp.pro.sdx) # features: UNIPROT
san.pro_4to <- gnp.pro$san_pro
san.pro_4to <- as.data.frame(san.pro_4to)
rownames(san.pro_4to) <- san.pro_4to$id
identical(rownames(exp.pro.sdx), rownames(san.pro_4to)) # ok!
# define fin in san
san.pro_4to %<>% mutate(stage.c = recode(stage,
'MB' = 'MB',
'PM' = 'PM',
'MC' = 'MC',
'MM' = 'fin',
'B' = 'fin',
'PMN' = 'fin'))
san.pro_4to$stage.c <- factor(san.pro_4to$stage.c, levels = c('MB', 'PM', 'MC', 'fin'))
identical(rownames(exp.pro.sdx), rownames(san.pro_4to)) # ok!
# timepoints by stages
time_pro <- as.numeric(san.pro_4to$stage.c)
# list of models
lmms.pro <- lmms::lmmSpline(data = exp.pro.sdx,
time = time_pro,
sampleID = rownames(exp.pro.sdx),
deri = FALSE,
basis = "cubic", # cubic does not need definition of knots = preferred
numCores = 10,
timePredict = 1:4, # max points
keepModels = TRUE) #
# remove noise
filter.pro <- lmms.filter.lines(data = exp.pro.sdx,
lmms.obj = lmms.pro,
time = time_pro)
exp.pro.fin <- filter.pro$filtered
ncol(exp.pro.sdx) - ncol(exp.pro.fin) # 187 removed.
# check
nrow(exp.pro.fin) # 4 stages
ncol(exp.pro.fin) # 1839 features
### integrating numbers from WGCNA: way too big becuase we filtered our features! DOES NOT MAKE SENSE!
# keepX_pro_1d <- 1500
# keepX_pro_2d <- 1000
# keepX_pro_3d <- 300
# keepX_mrn_4d <- 100
# keepX_pro_wgcna_3d <- c(keepX_pro_1d, keepX_pro_2d, keepX_pro_3d)
#
# spca.pro_wgcna.cFin <- spca(X = exp.pro.cFin.fin, # exp.pro.sdx vs exp.pro.fin
# ncomp = 3,
# keepX = keepX_pro_wgcna_3d,
# scale = T, center = T)
# plotLong(spca.pro_wgcna.cFin, title = "Proteome trajectory, wgcna oriented numbers") #
# manual curation of WGCNA oriented numbers
keepX_pro_1d_c <- 1010
keepX_pro_2d_c <- 710
keepX_pro_3d_c <- 110
keepX_pro_wgcna_3d_c <- c(keepX_pro_1d_c, keepX_pro_2d_c, keepX_pro_3d_c)
spca.pro <- spca(X = exp.pro.fin,
ncomp = 3,
keepX = keepX_pro_wgcna_3d_c,
scale = T, center = T) # would they be better if not scaled and centered?
plotLong(spca.pro, title = "Proteome trajectory, manually curated")
# plot indiv
plotIndiv(spca.pro) # which one is this refereing to? Is this a pca or a trajectory?
## features of indiv
plotVar(spca.pro)
## loadings: doesnt work
plotLoadings(spca.pro) # how to interpret this?
## get the features per cluster
trajec.pro <- getCluster(spca.pro)
# check some features
trajec.pro %>% group_by(cluster) %>% top_n(contrib.max, n = 3) %>% arrange(cluster) %>% print(n = nrow(.))
# # cluster 1
# plotter.expL.UNIPROT(exp_proL, 'P04839') # up
# plotter.expL.UNIPROT(exp_proL, 'P46940') # up
# plotter.expL.UNIPROT(exp_proL, 'Q96CW6') # dn up dn... ? this one is a bit strange
# # plotter.expL.UNIPROT(exp_proL, 'Q9BXS4') # up
#
# # cluster -1
# plotter.expL.UNIPROT(exp_proL, 'A6NIH7') # down
# plotter.expL.UNIPROT(exp_proL, 'O00330') # down
# plotter.expL.UNIPROT(exp_proL, 'O14737') # down
#
# # cluster 2
# plotter.expL.UNIPROT(exp_proL, 'P11172') # up dn
# plotter.expL.UNIPROT(exp_proL, 'Q15084') # up dn
# plotter.expL.UNIPROT(exp_proL, 'Q16836') # up dn
#
# # cluster - 2
# plotter.expL.UNIPROT(exp_proL, 'P35579') # up ?
# plotter.expL.UNIPROT(exp_proL, 'P62993') # up ?
# plotter.expL.UNIPROT(exp_proL, 'Q14019') # up ?
## data
exp.mic_4to <- gnp.mic$exp_mic_vsn_NIPALS # imputed values
# filter out non difEx features
dex.mic <- dex.all$dex.mic$dex_stage_mic_vsn
dex.mic.sigFID <- dex.mic %>% filter(adj.P.Val < 0.05 & abs(logFC) >= 1) %>% pull(fid)
length(unique(dex.mic.sigFID)) # 209
exp.mic.sdx <- exp.mic_4to[, colnames(exp.mic_4to) %in% dex.mic.sigFID]
dim(exp.mic.sdx) # 24, 209
# prep
rownames(exp.mic.sdx) # samples
colnames(exp.mic.sdx) # features: UNImicT
san.mic <- gnp.mic$san_mic
san.mic <- as.data.frame(san.mic)
rownames(san.mic) <- san.mic$id
identical(rownames(exp.mic.sdx), rownames(san.mic)) # NO
san.mic <- san.mic[order(match(san.mic$id, rownames(exp.mic.sdx))),]
# time points
san.mic$stage <- factor(san.mic$stage, levels = c('MB', 'MC', 'MM', 'B', 'S', 'PMN'))
time_mic <- as.numeric(san.mic$stage)
# check
san.mic$stage
time_mic
identical(rownames(exp.mic.sdx), rownames(san.mic))
# ok!
lmms.mic <- lmms::lmmSpline(data = exp.mic.sdx,
time = time_mic,
sampleID = rownames(exp.mic.sdx),
deri = FALSE,
basis = "cubic", # dbasis oesnt matter much because its just used for filtering
numCores = 10,
timePredict = 1:6, # max points
keepModels = TRUE)
# remove noise
filter.mic <- lmms.filter.lines(data = exp.mic.sdx,
lmms.obj = lmms.mic,
time = time_mic)
exp.mic.fin <- filter.mic$filtered
ncol(exp.mic.sdx) - ncol(exp.mic.fin) # 12 removed
# check
nrow(exp.mic.fin) # 6 stages
ncol(exp.mic.fin) # 197 features
# manual curation of WGCNA oriented numbers
keepX_mic_1d_c <- 140
keepX_mic_2d_c <- 40
keepX_mic_3d_c <- 17
keepX_mic_wgcna_3d_c <- c(keepX_mic_1d_c, keepX_mic_2d_c, keepX_mic_3d_c)
spca.mic <- spca(X = exp.mic.fin,
ncomp = 3,
keepX = keepX_mic_wgcna_3d_c,
scale = T, center = T)
plotLong(spca.mic, title = "microRNA trajectory, manually curated") #
# plot indiv
plotIndiv(spca.mic)
## features of indiv
plotVar(spca.mic)
## loadings
plotLoadings(spca.mic) # how to interpret this?
## get the features per cluster
trajec.mic <- getCluster(spca.mic)
save(
spca.mrn,
spca.pro,
spca.mic,
file = paste0(dir_moa, "/data/final/preps/timeomics/", "spca.timeOmics.RData"))
knitr::knit_exit()