singles

load saved

# save(
#   pca.ncomp.mrn,
#   tune.spca.mrn,
#   pca.ncomp.pro,
#   tune.spca.pro,
#   pca.ncomp.mic,
#   tune.spca.mic,
#   pls.ncomp2,
#   file = paste0(dir_moa, "/data/final/preps/timeomics/", "time.data.RData"))

mrn

prep

## data 
exp.mrn_4to <- gnp.mrn$exp_mrn_tpm_vsn_NIPALS # NIPALS imputed data
# glimpse(exp.mrn)

# filter out non difEx features
dex.mrn_4to <- dex.all$dex.mrn$dex_stage_mrn_vsn
dex.mrn.sigFID <- dex.mrn_4to %>% filter(adj.P.Val < 0.05 & abs(logFC) >= 1) %>% pull(fid)
length(unique(dex.mrn.sigFID)) # 7790
exp.mrn.sdx <- exp.mrn_4to[, colnames(exp.mrn_4to) %in% dex.mrn.sigFID]

# check
rownames(exp.mrn.sdx) # samples: ok
colnames(exp.mrn.sdx) # features (ENSG). Never changed or switched. ok!

# san
san.mrn_4to <- gnp.mrn$san_mrn
san.mrn_4to <- as.data.frame(san.mrn_4to)
rownames(san.mrn_4to) <- san.mrn_4to$id
identical(rownames(exp.mrn.sdx), rownames(san.mrn_4to)) # ok

# numeric vector containing the sample time point information
time_mrn <- san.mrn_4to$stage_nr # this is a vector that is already in the correct order. CAVE, only in mrn, NOT in pro
unique(sort(time_mrn)) # 1:7, no missing number

filter

# create models
lmms.mrn <- lmms::lmmSpline(data = exp.mrn.sdx, # features filtered for dex 
                               time = time_mrn,
                               sampleID = rownames(exp.mrn.sdx), 
                               deri = FALSE,
                               basis = "cubic", 
                               numCores = 10, 
                               timePredict = 1:7, # max points
                               keepModels = TRUE) 
# here its ok not to define the knots? In proteome analysis it wont accept it.

# remove noise
filter.mrn <- lmms.filter.lines(data = exp.mrn.sdx, 
                                lmms.obj = lmms.mrn, 
                                time = time_mrn)
exp.mrn.fin <- filter.mrn$filtered

dim(exp.mrn.sdx) # 34, 7789
dim(exp.mrn.fin) # 7, 7347

ncol(exp.mrn.sdx) - ncol(exp.mrn.fin) 
# cubic: 428 removed (used in final)
# p-splines: 442 removed
# cubic p-splines: 875 removed

sPCA

Numbers for features to check based on WGCNA

# check
nrow(exp.mrn.fin) # 7 stages
ncol(exp.mrn.fin) # 7361 features

### numbers from WGCNA

# cont up: 3266 ft
# cont down: 3467 ft
# up down: 706 ft
# down up: 277 ft

keepX_mrn_1d <- 5000
keepX_mrn_2d <- 1000
keepX_mrn_3d <- 200
keepX_mrn_4d <- 100

keepX_mrn_wgcna_4d <- c(keepX_mrn_1d, keepX_mrn_2d, keepX_mrn_3d, keepX_mrn_4d)

spca.mrn_wgcna <- spca(X = exp.mrn.fin, 
                 ncomp = 4, 
                 keepX = keepX_mrn_wgcna_4d, 
                 scale = T, center = T)
plotLong(spca.mrn_wgcna, title = "mRNA trajectory, wgcna oriented numbers") #

# too many features seem not to adhere. Reduce!


# manual curation of WGCNA numbers
keepX_mrn_1d_c <- 4000
keepX_mrn_2d_c <- 750
keepX_mrn_3d_c <- 120
keepX_mrn_4d_c <- 50

keepX_mrn_wgcna_4d_c <- c(keepX_mrn_1d_c, keepX_mrn_2d_c, keepX_mrn_3d_c, keepX_mrn_4d_c)

spca.mrn <- spca(X = exp.mrn.fin, 
                 ncomp = 4, 
                 keepX = keepX_mrn_wgcna_4d_c, 
                 scale = T, center = T)
plotLong(spca.mrn, title = "mRNA trajectory, manually curated")

# cleaner trajectories. Use these ones


# plot indiv
plotIndiv(spca.mrn) # not sure for what useful

## features of indiv
plotVar(spca.mrn) # not sure for what useful

## loadings:
plotLoadings(spca.mrn) # not sure for what useful

## get the features per cluster
trajec.mrn <- getCluster(spca.mrn)


# check some features
trajec.mrn %>% group_by(cluster) %>% top_n(contrib.max, n = 3) %>% arrange(cluster) %>% print(n = nrow(.))

# # cluster 1
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000166579') # up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000168591') # up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000197622') # up
# 
# # cluster -1
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000100567') # down
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000147535') # down up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000150433') # down upish
# 
# # cluster 2
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000066777') # up dn
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000157379') # up dn
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000169762') # up dn
# 
# # cluster -2
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000068878') # dn up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000162650') # dn up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000227500') # dn up
# 
# # cluster 3
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000010278') # up dn up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000135211') # up dn up
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000187808') # up dn up
# 
# # cluster -3
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000142528') # dn up dn
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000170004') # dn up dn
# plotter.expL.ENSG(exp_mrnL, 'ENSG00000198231') # dn up dn


# medians of features in clusters are consistent!

pro cFin

prep

## data 
exp.pro_4to <- gnp.pro$exp_pro_vsn
# filter dex
dex.pro <- dex.all$dex.pro$dex_stage_pro_vsn
dex.pro.sigFID <- dex.pro %>% filter(adj.P.Val < 0.05 & abs(logFC) >= 1) %>% pull(fid)
length(unique(dex.pro.sigFID)) # 2026
exp.pro.sdx <- exp.pro_4to[, colnames(exp.pro_4to) %in% dex.pro.sigFID]

# prep
rownames(exp.pro.sdx) # samples
colnames(exp.pro.sdx) # features: UNIPROT

san.pro_4to <- gnp.pro$san_pro
san.pro_4to <- as.data.frame(san.pro_4to)
rownames(san.pro_4to) <- san.pro_4to$id

identical(rownames(exp.pro.sdx), rownames(san.pro_4to)) # ok!

# define fin in san
san.pro_4to %<>% mutate(stage.c = recode(stage,
                    'MB' = 'MB',
                    'PM' = 'PM',
                    'MC' = 'MC',
                    'MM' = 'fin',
                    'B' = 'fin',
                    'PMN' = 'fin'))

san.pro_4to$stage.c <- factor(san.pro_4to$stage.c, levels = c('MB', 'PM', 'MC', 'fin'))

identical(rownames(exp.pro.sdx), rownames(san.pro_4to)) # ok!

# timepoints by stages
time_pro <- as.numeric(san.pro_4to$stage.c)

filter

# list of models
lmms.pro <- lmms::lmmSpline(data = exp.pro.sdx, 
                               time = time_pro,
                               sampleID = rownames(exp.pro.sdx), 
                               deri = FALSE,
                               basis = "cubic",  # cubic does not need definition of knots = preferred
                               numCores = 10, 
                               timePredict = 1:4, # max points
                               keepModels = TRUE) # 

# remove noise
filter.pro <- lmms.filter.lines(data = exp.pro.sdx, 
                                lmms.obj = lmms.pro, 
                                time = time_pro)

exp.pro.fin <- filter.pro$filtered

ncol(exp.pro.sdx) - ncol(exp.pro.fin) # 187 removed.

sPCA

# check
nrow(exp.pro.fin) # 4 stages
ncol(exp.pro.fin) # 1839 features


### integrating numbers from WGCNA: way too big becuase we filtered our features! DOES NOT MAKE SENSE!
# keepX_pro_1d <- 1500
# keepX_pro_2d <- 1000
# keepX_pro_3d <- 300
# keepX_mrn_4d <- 100

# keepX_pro_wgcna_3d <- c(keepX_pro_1d, keepX_pro_2d, keepX_pro_3d)
# 
# spca.pro_wgcna.cFin <- spca(X = exp.pro.cFin.fin, # exp.pro.sdx vs exp.pro.fin
#                  ncomp = 3, 
#                  keepX = keepX_pro_wgcna_3d, 
#                  scale = T, center = T)
# plotLong(spca.pro_wgcna.cFin, title = "Proteome trajectory, wgcna oriented numbers") #


# manual curation of WGCNA oriented numbers
keepX_pro_1d_c <- 1010
keepX_pro_2d_c <- 710
keepX_pro_3d_c <- 110

keepX_pro_wgcna_3d_c <- c(keepX_pro_1d_c, keepX_pro_2d_c, keepX_pro_3d_c)

spca.pro <- spca(X = exp.pro.fin,
                 ncomp = 3,
                 keepX = keepX_pro_wgcna_3d_c,
                 scale = T, center = T) # would they be better if not scaled and centered?
plotLong(spca.pro, title = "Proteome trajectory, manually curated")

# plot indiv
plotIndiv(spca.pro) # which one is this refereing to? Is this a pca or a trajectory?

## features of indiv
plotVar(spca.pro)

## loadings: doesnt work
plotLoadings(spca.pro) # how to interpret this?

## get the features per cluster
trajec.pro <- getCluster(spca.pro)



# check some features
trajec.pro %>% group_by(cluster) %>% top_n(contrib.max, n = 3) %>% arrange(cluster) %>% print(n = nrow(.))

# # cluster 1
# plotter.expL.UNIPROT(exp_proL, 'P04839') # up
# plotter.expL.UNIPROT(exp_proL, 'P46940') # up
# plotter.expL.UNIPROT(exp_proL, 'Q96CW6') # dn up dn... ? this one is a bit strange
# # plotter.expL.UNIPROT(exp_proL, 'Q9BXS4') # up
# 
# # cluster -1
# plotter.expL.UNIPROT(exp_proL, 'A6NIH7') # down
# plotter.expL.UNIPROT(exp_proL, 'O00330') # down
# plotter.expL.UNIPROT(exp_proL, 'O14737') # down
# 
# # cluster 2
# plotter.expL.UNIPROT(exp_proL, 'P11172') # up dn
# plotter.expL.UNIPROT(exp_proL, 'Q15084') # up dn
# plotter.expL.UNIPROT(exp_proL, 'Q16836') # up dn
# 
# # cluster - 2
# plotter.expL.UNIPROT(exp_proL, 'P35579') # up ?
# plotter.expL.UNIPROT(exp_proL, 'P62993') # up ?
# plotter.expL.UNIPROT(exp_proL, 'Q14019') # up ?

mic

prep

## data 
exp.mic_4to <- gnp.mic$exp_mic_vsn_NIPALS # imputed values

# filter out non difEx features
dex.mic <- dex.all$dex.mic$dex_stage_mic_vsn
dex.mic.sigFID <- dex.mic %>% filter(adj.P.Val < 0.05 & abs(logFC) >= 1) %>% pull(fid)
length(unique(dex.mic.sigFID)) # 209
exp.mic.sdx <- exp.mic_4to[, colnames(exp.mic_4to) %in% dex.mic.sigFID]
dim(exp.mic.sdx) # 24, 209

# prep
rownames(exp.mic.sdx) # samples
colnames(exp.mic.sdx) # features: UNImicT

san.mic <- gnp.mic$san_mic
san.mic <- as.data.frame(san.mic)
rownames(san.mic) <- san.mic$id

identical(rownames(exp.mic.sdx), rownames(san.mic)) # NO
san.mic <- san.mic[order(match(san.mic$id, rownames(exp.mic.sdx))),]


# time points
san.mic$stage <- factor(san.mic$stage, levels = c('MB', 'MC', 'MM', 'B', 'S', 'PMN'))
time_mic <- as.numeric(san.mic$stage)

# check
san.mic$stage
time_mic
identical(rownames(exp.mic.sdx), rownames(san.mic)) 

# ok!

filter

lmms.mic <- lmms::lmmSpline(data = exp.mic.sdx, 
                               time = time_mic,
                               sampleID = rownames(exp.mic.sdx), 
                               deri = FALSE,
                               basis = "cubic",  # dbasis oesnt matter much because its just used for filtering
                               numCores = 10, 
                               timePredict = 1:6, # max points
                               keepModels = TRUE)

# remove noise
filter.mic <- lmms.filter.lines(data = exp.mic.sdx, 
                                lmms.obj = lmms.mic, 
                                time = time_mic)

exp.mic.fin <- filter.mic$filtered

ncol(exp.mic.sdx) - ncol(exp.mic.fin) # 12 removed

sPCA

# check
nrow(exp.mic.fin) # 6 stages
ncol(exp.mic.fin) # 197 features


# manual curation of WGCNA oriented numbers
keepX_mic_1d_c <- 140
keepX_mic_2d_c <- 40
keepX_mic_3d_c <- 17

keepX_mic_wgcna_3d_c <- c(keepX_mic_1d_c, keepX_mic_2d_c, keepX_mic_3d_c)

spca.mic <- spca(X = exp.mic.fin,
                 ncomp = 3,
                 keepX = keepX_mic_wgcna_3d_c,
                 scale = T, center = T)
plotLong(spca.mic, title = "microRNA trajectory, manually curated") #

# plot indiv
plotIndiv(spca.mic)

## features of indiv
plotVar(spca.mic)

## loadings
plotLoadings(spca.mic) # how to interpret this?

## get the features per cluster
trajec.mic <- getCluster(spca.mic)

timeOmics

Sebastian Hesse, contact: dr.sebastian.hesse@gmail.com

2023-10-23

singles

load saved

mrn

prep

filter

sPCA

pro cFin

prep

filter

sPCA

mic

prep

filter

sPCA

export