Last update: 23-10-05 Last markdown compiled: 23-10-09 ToDo: - all done
knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE)
# dirs
## Sebastian @Linux
dir_moa <- "/home/isar/Dropbox/scn_moa/"
# data: only raw data, loaded below
## packages
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))
# load required package collections
sapply(packages_general, require, character.only = TRUE)
sapply(packages_annotation, require, character.only = TRUE)
sapply(packages_missingValues, require, character.only = TRUE)
sapply(packages_normalisation, require, character.only = TRUE)
# import: raw counts and tpn
exp_mrn_counts_raw <- read_delim(paste0(dir_moa,"/data/gnp/mrn/original/", "unstranded_raw_counts.tsv"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)
exp_mrn_tpn_raw <- read_delim(paste0(dir_moa,"/data/gnp/mrn/original/", "gene_count_unstranded_tpm_norm.tsv"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)
# format ids
names(exp_mrn_counts_raw) <- gsub("-", "_", names(exp_mrn_counts_raw))
names(exp_mrn_tpn_raw) <- gsub("-", "_", names(exp_mrn_tpn_raw))
# make numeric
exp_mrn_counts_raw[,2:50] <- sapply(exp_mrn_counts_raw[,2:50], as.numeric)
exp_mrn_tpn_raw[,2:50] <- sapply(exp_mrn_tpn_raw[,2:50], as.numeric)
# remove and export length info from TPB
gene_length_gnp_mrn <- exp_mrn_tpn_raw %>% select(fid = gene_id, gene_length)
#exp_mrn_tpn_raw$gene_length <- NULL
#### Density check of raw data: not in use
## counts
plot_hist_counts <- exp_mrn_counts_raw %>%
ggplot(aes(x=G13)) +
geom_freqpoly(binwidth = 1) +
scale_x_continuous(trans='log2') +
ggtitle("Counts")
plot_hist_tpn <- exp_mrn_tpn_raw %>%
ggplot(aes(x=G13)) +
geom_freqpoly(binwidth = 1) +
scale_x_continuous(trans='log2') +
ggtitle("TPN")
# fig FC level comparison
fig_hist_pre <- ggarrange(
plot_hist_counts,
plot_hist_tpn
)
fig_hist <- annotate_figure(fig_hist_pre,
top = text_grob("Histogam of raw mRNA data: counts vs tpn ",
color = "black", face = "bold", size = 18),
#bottom = text_grob("FDR <= 0.05, abs(log FC) >= 1]",
# color = "black", size = 12 )
)
fig_hist
# copy as pre and final format
exp_tpn_pre <- exp_mrn_tpn_raw
exp_tpn_pre$gene_length <- NULL
exp_tpn_pre %<>% rename(fid = gene_id)
exp_counts_pre <- exp_mrn_counts_raw
exp_counts_pre %<>% rename(fid = gene_id)
identical(names(exp_tpn_pre), names(exp_counts_pre)) # ok!
identical(exp_tpn_pre$fid, exp_counts_pre$fid) # ok!
nrow(exp_tpn_pre) # 61552
nrow(exp_counts_pre) # 61552
# import
san_mrn_raw <- read_csv(paste0(dir_moa,"/data/gnp/mrn/original/", "sequenced_RNA_samples_granulopoiesis - RNA_samples_granulopoiesis.csv"))
# create IDs
san_mrn_raw$id <- san_mrn_raw$name
san_mrn_raw$id <- gsub("-", "_", san_mrn_raw$id)
# san for all exp?
setdiff(san_mrn_raw$id, names(exp_counts_pre)) # way more samples in here bc of micro RNA!
setdiff(names(exp_counts_pre), san_mrn_raw$id) # info for all exp is present -> Reduce san
setdiff(names(exp_tpn_pre), san_mrn_raw$id) # info for all exp is present -> Reduce san
# reduce to samples we have
san_mrn_temp <- san_mrn_raw[san_mrn_raw$id %in% c(names(exp_tpn_pre), "fid"),]
# check
setdiff(san_mrn_temp$id, names(exp_tpn_pre)) #ok!
setdiff(names(exp_tpn_pre), san_mrn_temp$id) #ok!
# add donor details
donor_details <- read_csv(paste0(dir_moa, "/data/gnp/pro/original/", "donor_details - Donors.csv"))
donor_details$donor <- toupper(donor_details$donor)
san_mrn_temp <- left_join(san_mrn_temp, donor_details, by = c("donor"))
exclude low counts and <3 values/group
exp_counts_unfiltered <- exp_counts_pre
# copy
exp_counts_4filt <- exp_counts_pre # copy raw counts
# set all values below threshold (10) NA
exp_counts_4filt[exp_counts_4filt< 10] = NA
# check NAs per feature and group
exp_counts_4filt_long <- exp_counts_4filt %>% pivot_longer(!fid, names_to = 'id', values_to = "value")
exp_counts_4filt_long$pop_gen <- san_mrn_temp$population_general[match(exp_counts_4filt_long$id, san_mrn_temp$id)]
# not NA/group
notNAperGeneANDgroup_cou <- exp_counts_4filt_long %>%
dplyr::group_by(fid, pop_gen) %>%
dplyr::summarise(sum_notNA = sum(!is.na(value)))
# create filter
summary_notNA_counts <- notNAperGeneANDgroup_cou %>% group_by(fid) %>%
filter(sum(sum_notNA>1)>1, sum_notNA > 1) %>%
dplyr::summarise(tot_valid = n(), valid_groups = str_c(pop_gen, collapse = ';'))
table(summary_notNA_counts$tot_valid)
# min2 in all 7 groups = good for DEX
fid_min2INallGroups_counts <- summary_notNA_counts %>%
filter(tot_valid == 7) %>% pull(fid)
length(fid_min2INallGroups_counts) # 9663 valis
Publication on tpm filtering: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4937821/
## Copy the NAs from counts to TPM for consistency
# copy data
exp_tpm_4filt <- exp_tpn_pre
# check consistency
identical(exp_tpm_4filt$fid, exp_counts_pre$fid)
identical(names(exp_tpm_4filt),names(exp_counts_pre))
###### TPM below threshold filtering (abandoned! proceed below)
# median library size = 37 Million
# 10/37 = 0.28 = minimum to filter for in TPN data (check if reads would get the same)
exp_tpm_4filt[exp_tpm_4filt <= 0.28] = NA
nrow(exp_tpm_4filt)
# check NAs per feature and group
exp_tpm_4filt_long <- exp_tpm_4filt %>% pivot_longer(!fid, names_to = 'id', values_to = "value")
exp_tpm_4filt_long$pop_gen <- san_mrn_temp$population_general[match(exp_tpm_4filt_long$id, san_mrn_temp$id)]
# not NA/group
notNAperGeneANDgroup_tpm <- exp_tpm_4filt_long %>%
group_by(fid, pop_gen) %>%
summarise(sum_notNA = sum(!is.na(value)))
length(unique(notNAperGeneANDgroup_tpm$fid)) # 61552
# create filter
summary_notNA_tpm <- notNAperGeneANDgroup_tpm %>% group_by(fid) %>%
filter(sum(sum_notNA>1)>1, sum_notNA > 1) %>%
summarise(tot_valid = n(), valid_groups = str_c(pop_gen, collapse = ';'))
table(summary_notNA_tpm$tot_valid) # 10009 in 7
table(summary_notNA_counts$tot_valid) # 9663 in 7
# min2 in all 7 groups = good for DEX
fid_min2INallGroups_tpm <- summary_notNA_tpm %>%
filter(tot_valid == 7) %>% pull(fid)
length(fid_min2INallGroups_tpm)
# compare invlaid in TPM and counts
# fid_min2INallGroups_counts
# fid_min2INallGroups_tpm
length(intersect(fid_min2INallGroups_counts, fid_min2INallGroups_tpm))
# 8897 intersecting: keep only these
fid_min2inALL_CountsANDtpm <- intersect(fid_min2INallGroups_counts, fid_min2INallGroups_tpm)
length(fid_min2inALL_CountsANDtpm) # not final yet becuase I will summarize groups below
#### copy usable data at this point
# COUNTS
exp_counts_RAW <- exp_counts_pre # all features with count threshold = 10 (61.552)
exp_counts_filtered <- exp_counts_pre %>% filter(fid %in% fid_min2inALL_CountsANDtpm)
nrow(exp_counts_RAW) # 61552
nrow(exp_counts_filtered) # 8897
# set threshold to NA in filtered
exp_counts_filtered[exp_counts_filtered < 10] = NA
NAinCounts <- is.na(exp_counts_filtered)
# TPM
exp_tpm_RAW <- exp_tpn_pre
exp_tpm_filtered <- exp_tpn_pre %>% filter(fid %in% fid_min2inALL_CountsANDtpm)
nrow(exp_tpm_RAW) # 61552
nrow(exp_tpm_filtered) # 8897
exp_tpm_filtered[exp_tpm_filtered <= 0.28] = NA
NAinTPM <- is.na(exp_tpm_filtered)
# copy NAs between the two DFs
exp_tpm_filtered[NAinCounts] <- NA
exp_counts_filtered[NAinTPM] <- NA
# creare non data
exp_tpm_non <- exp_tpm_filtered
exp_cou_non <- exp_counts_filtered
# colors by sample
colors_ids <- san_mrn_temp %>% select(id, population_general)
colors_ids$color <- colors_stage[match(colors_ids$population_general, names(colors_stage))]
colors_ids$population_general <- NULL
col_ids <- as.vector(colors_ids$color)
names(col_ids) <- colors_ids$id
## density histogram plotter
# with log x
plot_sample_density_log <- function(exp, title) {
exp_long <- exp %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
exp_long$stage <- san_mrn_temp$population_general[match(exp_long$id, san_mrn_temp$id)]
exp_long$stage <- factor(exp_long$stage, levels = levels_mrn)
exp_long$donor <- san_mrn_temp$donor[match(exp_long$id, san_mrn_temp$id)]
exp_long %>%
#filter(donor == 'B') %>%
ggplot(aes(x = value, color = id)) +
geom_density(na.rm = T, bw = 'SJ') +
scale_x_continuous(trans='log2') +
scale_color_manual(values = col_ids) +
theme(legend.position = "none") +
ggtitle(title) +
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
}
# without log x
plot_sample_density_noLog <- function(exp, title) {
exp_long <- exp %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
exp_long$stage <- san_mrn_temp$population_general[match(exp_long$id, san_mrn_temp$id)]
exp_long$stage <- factor(exp_long$stage, levels = levels_mrn)
exp_long$donor <- san_mrn_temp$donor[match(exp_long$id, san_mrn_temp$id)]
exp_long %>%
#filter(donor == 'B') %>%
ggplot(aes(x = value, color = id)) +
geom_density(na.rm = T, bw = 'SJ') +
#scale_x_continuous(trans='log2') +
scale_color_manual(values = col_ids) +
theme(legend.position = "none") +
ggtitle(title)
}
# apply
p.hist.tpm_unfiltered <- plot_sample_density_log(exp_tpm_RAW, "TPM unfiltered")
p.hist.tpm_filtered <- plot_sample_density_log(exp_tpm_filtered, "TPM filtered")
p.hist.cou_unfiltered <- plot_sample_density_log(exp_counts_RAW, "Counts unfiltered")
p.hist.cou_filtered <- plot_sample_density_log(exp_counts_filtered, "Counts filtered")
fig_hist_mrn_fitereffect_pre <- ggarrange(
p.hist.cou_unfiltered,
p.hist.cou_filtered,
p.hist.tpm_unfiltered,
p.hist.tpm_filtered,
ncol = 2,
nrow = 2)
fig_hist_mrn_fitereffect_pre
names(san_mrn_temp)
dat.qc.mrn <- san_mrn_temp %>% dplyr::select(id, stage = population_general, reads, mbases_yield, mean_quality, bases_over_30)
dat.qc.mrn$stage <- factor(dat.qc.mrn$stage, levels = levels_mrn)
dat.qc.mrn$stageNUM <- as.numeric(dat.qc.mrn$stage)
# dat.qc.mrn$id <- reorder(dat.qc.mrn$id, dat.qc.mrn$stage)
# reads
dat.qc.mrn %>%
ggplot(aes(x = reorder(id, stageNUM), y = reads, fill = stage)) +
geom_bar(stat = 'identity') +
scale_fill_manual(values = colors_stage) +
theme(axis.text.x = element_text(angle=30, hjust=1)) +
ggtitle("Total reads per sample in mRNA")
# MB yield
dat.qc.mrn %>%
ggplot(aes(x = reorder(id, stageNUM), y = mbases_yield, fill = stage)) +
geom_bar(stat = 'identity') +
scale_fill_manual(values = colors_stage) +
theme(axis.text.x = element_text(angle=30, hjust=1)) +
ggtitle("MByields per sample in mRNA")
# Mean quality
dat.qc.mrn %>%
ggplot(aes(x = reorder(id, stageNUM), y = mean_quality, fill = stage)) +
geom_bar(stat = 'identity') +
scale_fill_manual(values = colors_stage) +
theme(axis.text.x = element_text(angle=30, hjust=1)) +
ggtitle("Mean quality per sample in mRNA")
# Bases over 30
dat.qc.mrn %>%
ggplot(aes(x = reorder(id, stageNUM), y = bases_over_30, fill = stage)) +
geom_bar(stat = 'identity') +
scale_fill_manual(values = colors_stage) +
theme(axis.text.x = element_text(angle=30, hjust=1)) +
ggtitle("Mean quality per sample in mRNA")
exp_cou_4log2 <- FIDtoRow(exp_cou_non)
mx_cou_4log2 <- df2m(exp_cou_4log2)
min(mx_cou_4log2)
min(mx_cou_4log2, na.rm = T) # OK
# exp_cou_log2 <- exp_cou_log2 +1 # all +1 to remove negative values!
exp_cou_log2 <- log2(mx_cou_4log2)
exp_cou_log2 <- as.data.frame(exp_cou_log2)
exp_cou_log2$fid <- rownames(exp_cou_log2) # has -Inf!
exp_tpm_4log2 <- FIDtoRow(exp_tpm_non)
mx_tpm_4log2 <- df2m(exp_tpm_4log2)
min(mx_tpm_4log2) # NA
min(mx_tpm_4log2, na.rm = T) # 0.28 -> all 0.x will give negative values
#mx_tpm_4log2_plus1 <- mx_tpm_4log2 +1 # all +1 to remove negative values!
#min(mx_tpm_4log2_plus1, na.rm = T) # 1.28. perfect
# ACTUALLY< THE +1 is not worth it because VSN keeps putting out negative values.
# BETTER TO KEEP IT AS IT IS!
exp_tpm_log2 <- log2(mx_tpm_4log2) # has some negative values
exp_tpm_log2 <- as.data.frame(exp_tpm_log2)
exp_tpm_log2$fid <- rownames(exp_tpm_log2) #
Comparison of normalisation methods in proteomics: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2680204/
Vignette on VSN: https://www.bioconductor.org/packages/devel/bioc/vignettes/vsn/inst/doc/A-vsn.html
VSN application in expression data: https://academic.oup.com/bioinformatics/article/18/suppl_1/S96/231881
First author on VSN paper: https://www.embl.org/groups/huber/
Why VSN need non log data: https://support.bioconductor.org/p/13221/
# VSN needs counts, not log2!
# copy non log2 exp mx from above
mx_4eSet_cou <- mx_cou_4log2 # min = 10
mx_4eSet_tpm <- mx_tpm_4log2 # avoid negative values with +1
min(mx_4eSet_tpm, na.rm = T) # 0.28... alright
# match san order for pheno creation
san_4eSet <- san_mrn_temp[match(colnames(mx_4eSet_cou), san_mrn_temp$id),]
san_4eSet <- as.data.frame(san_4eSet)
rownames(san_4eSet) <- san_4eSet$id
pheno_eSet <- new("AnnotatedDataFrame", data = san_4eSet)
# compile
eSet_tpm_non <- ExpressionSet (
assayData = mx_4eSet_tpm, #exprs
phenoData = pheno_eSet) #pData
eSet_cou_non <- ExpressionSet (
assayData = mx_4eSet_cou, #exprs
phenoData = pheno_eSet) #pData
# normalize (uses eSet with raw data)
eSet_tpm_vsn = justvsn(eSet_tpm_non)
eSet_cou_vsn = justvsn(eSet_cou_non)
## QC of VSN: needs to be discussed
# # vector of generalized log ratios
# M_cou_non = exprs(eSet_cou_non)[,1] - exprs(eSet_cou_non)[,2]
# M_cou_vsn = exprs(eSet_cou_vsn)[,1] - exprs(eSet_cou_vsn)[,2]
#
# M_tpm_non = exprs(eSet_tpm_non)[,1] - exprs(eSet_tpm_non)[,2]
# M_tpm_vsn = exprs(eSet_tpm_vsn)[,1] - exprs(eSet_tpm_vsn)[,2]
#
# # histogram of the generalized log-ratios
# hist(M_cou_non, breaks = 100, col = "#d95f0e") # data not well normalized
# hist(M_cou_vsn, breaks = 100, col = "#d95f0e") # very nice
#
# hist(M_tpm_non, breaks = 100, col = "#d95f0e")
# hist(M_tpm_vsn, breaks = 100, col = "#d95f0e")
#
# # graphs
# meanSdPlot(eSet_cou_non)
# meanSdPlot(eSet_cou_vsn)
#
# meanSdPlot(eSet_tpm_non)
# meanSdPlot(eSet_tpm_vsn)
# extract
exp_tpm_vsn <- exprs(eSet_tpm_vsn)
min(exp_tpm_vsn, na.rm = T) # has negative values. has NAs
exp_tpm_vsn <- as.data.frame(exp_tpm_vsn)
exp_tpm_vsn$fid <- rownames(exp_tpm_vsn)
exp_cou_vsn <- exprs(eSet_cou_vsn)
min(exp_cou_vsn, na.rm = T) # no - values. has NAs
exp_cou_vsn <- as.data.frame(exp_cou_vsn)
exp_cou_vsn$fid <- rownames(exp_cou_vsn)
DISCARDED! NO NEED FOR THIS!
https://davetang.org/muse/2014/07/07/quantile-normalisation-in-r/
using log2 values bc of the examples given
#
# # counts
# exp_cou_qn <- normalize.quantiles(exp_4eSet_cou)
# exp_cou_qn <- as.data.frame(exp_cou_qn)
# rownames(exp_cou_qn) <- rownames(exp_4eSet_cou)
# colnames(exp_cou_qn) <- colnames(exp_4eSet_cou)
# exp_cou_qn$fid <- rownames(exp_cou_qn)
#
# # tpm
# exp_tpm_qn <- normalize.quantiles(exp_4eSet_tpm)
# exp_tpm_qn <- as.data.frame(exp_tpm_qn)
# rownames(exp_tpm_qn) <- rownames(exp_4eSet_tpm)
# colnames(exp_tpm_qn) <- colnames(exp_4eSet_tpm)
# exp_tpm_qn$fid <- rownames(exp_tpm_qn)
DISCARDED! NO NEED FOR THIS!
Recommended by Gergely, uses minimal assumptions
#
# exp_cou_mn <- normalizeMedianValues(exp_4eSet_cou) # limma
# exp_cou_mn <- as.data.frame(exp_cou_mn)
# exp_cou_mn$fid <- rownames(exp_cou_mn)
# colnames(exp_cou_mn)
#
# exp_tpm_mn <- normalizeMedianValues(exp_4eSet_tpm) # limma
# exp_tpm_mn <- as.data.frame(exp_4eSet_tpm)
# exp_tpm_mn$fid <- rownames(exp_4eSet_tpm)
# colnames(exp_tpm_mn)
#p.hist.cou_non <- plot_sample_density_noLog(exp_cou_non, "Counts non")
p.hist.cou_log2 <- plot_sample_density_noLog(exp_cou_log2, "Counts log2")
p.hist.cou_vsn <- plot_sample_density_noLog(exp_cou_vsn, "Counts vsn")
#p.hist.cou_qn <- plot_sample_density(exp_cou_qn, "Counts qn")
#p.hist.cou_mn <- plot_sample_density_log(exp_cou_mn, "Counts mn")
p.hist.tpm_log2 <- plot_sample_density_noLog(exp_tpm_log2, "TPM log2")
p.hist.tpm_vsn <- plot_sample_density_noLog(exp_tpm_vsn, "TPM vsn")
#p.hist.tpm_qn <- plot_sample_density(exp_tpm_qn, "TPM qn")
#p.hist.tpm_mn <- plot_sample_density_log(exp_tpm_mn, "TPM mn")
fig_hist_norm_pre <- ggarrange(
p.hist.cou_log2,
p.hist.cou_vsn,
#p.hist.cou_mn,
p.hist.tpm_log2,
p.hist.tpm_vsn,
#p.hist.tpm_mn,
ncol = 2,
nrow = 2)
fig_hist_norm_pre
fig_hist_norm <- annotate_figure(fig_hist_norm_pre,
top = text_grob("Normalisation effects on transcriptome counts and TPM", color = "black", face = "bold", size = 18))
fig_hist_norm
### fct in base script
# pepExp_2long_fidCol <- function(exp, san, LEVELS){
# exp_long <- exp %>% pivot_longer(!fid, names_to = 'id', values_to = "value")
# exp_long$stage <- san$population_general[match(exp_long$id, san$id)]
# exp_long$stage <- factor(exp_long$stage , levels = LEVELS)
# return(exp_long)
# }
exp_tpm_log2_long <- pepExp_2long_fidCol(exp_tpm_log2, san_mrn_temp, levels_mrn)
exp_tpm_vsn_long <- pepExp_2long_fidCol(exp_tpm_vsn, san_mrn_temp, levels_mrn)
# exp_tpm_mn_long <- pepExp_2long_fidCol(exp_tpm_mn, san_mrn_temp, levels_mrn)
exp_cou_log2_long <- pepExp_2long_fidCol(exp_cou_log2, san_mrn_temp, levels_mrn)
exp_cou_vsn_long <- pepExp_2long_fidCol(exp_cou_vsn, san_mrn_temp, levels_mrn)
# exp_cou_mn_long <- pepExp_2long_fidCol(exp_cou_mn, san_mrn_temp, levels_mrn)
plot_box_4normCheck <- function(exp_long, title){
exp_long %>%
arrange(stage) %>%
ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
geom_boxplot() +
#theme(axis.text.x=element_text(angle=45, hjust=1)) +
#coord_flip() +
ggtitle(title) +
xlab('Sample')+
ylab("Expression")+
theme(axis.text.x=element_text(angle=45, hjust=1),
axis.text=element_text(size=7)) +
scale_fill_manual(values = colors_stage)
}
plot_box_4normCheck_logY <- function(exp_long, title){
exp_long %>%
arrange(stage) %>%
ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
geom_boxplot() +
#theme(axis.text.x=element_text(angle=45, hjust=1)) +
#coord_flip() +
ggtitle(title) +
xlab('Sample')+
ylab("Expression")+
theme(axis.text.x=element_text(angle=45, hjust=1),
axis.text=element_text(size=7)) +
scale_fill_manual(values = colors_stage) +
scale_y_continuous(trans='log2')
}
p.box.cou_log2 <- plot_box_4normCheck(exp_cou_log2_long, "cou log2")
p.box.cou_vsn <- plot_box_4normCheck(exp_cou_vsn_long, "cou vsn")
p.box.tpm_log2 <- plot_box_4normCheck(exp_tpm_log2_long, "TPM log2")
p.box.tpm_vsn <- plot_box_4normCheck(exp_tpm_vsn_long, "TPM vsn")
# fig
fig_boxes_pre <- ggarrange(
p.box.cou_log2,
p.box.cou_vsn,
p.box.tpm_log2,
p.box.tpm_vsn,
legend = 'none')
fig_boxes_pre
fig_boxes <- annotate_figure(fig_boxes_pre,
top = text_grob("Total expression per sample", color = "black", face = "bold", size = 20))
fig_boxes
How do the technical repeat samples behave? Samples were distributed in aliquots at the date of collection.
# isolate techReps
san_mrn_techResp <- san_mrn_temp %>% filter(has_tech_rep == T)
# all tech Reps are the same subpopulation!
# fct to get IDs by Yidi
get_Rep_ids_mrn <- function(DONOR, STAGE){
san_mrn_techResp %>%
dplyr::filter(donor == DONOR) %>%
dplyr::filter(population_general == STAGE) %>%
pull(id)
}
donors <- unique(san_mrn_techResp$donor)
stages <- unique(san_mrn_techResp$population_general)
ids <- list()
for(i in donors){
for(j in stages){
id <- paste(i,j,sep = '_')
ids[[id]] <- get_Rep_ids_mrn(i,j)
}
}
rep_ids_mrn <- purrr::compact(ids)
# plots
plot_corr_TechReps <- function(samples, data){
data %>% dplyr::select(all_of(samples)) %>%
rename(Rep1 = 1, Rep2 = 2) %>%
ggplot(aes(x = Rep1, y = Rep2)) +
geom_point() +
stat_cor(method = "spearman", size = 3, na.rm = T)
}
pl.corr_mrn <- purrr::map(rep_ids_mrn, plot_corr_TechReps, exp_cou_log2)
# COUNTS
fig_RepCor_mrn_pre <- ggarrange(
pl.corr_mrn$A_MB + ggtitle("Donor A, stage MB"),
pl.corr_mrn$A_MC + ggtitle("Donor A, stage MC"),
pl.corr_mrn$A_MM + ggtitle("Donor A, stage MM"),
pl.corr_mrn$A_B + ggtitle("Donor A, stage B"),
pl.corr_mrn$C_MC + ggtitle("Donor C, stage MC"),
pl.corr_mrn$C_B + ggtitle("Donor C, stage B"),
pl.corr_mrn$C_S + ggtitle("Donor C, stage S"),
pl.corr_mrn$D_MM + ggtitle("Donor D, stage MM"),
pl.corr_mrn$D_B + ggtitle("Donor D, stage B"),
pl.corr_mrn$D_PMN + ggtitle("Donor D, stage PMN"),
pl.corr_mrn$E_MC + ggtitle("Donor E, stage MC"),
legend = 'none')
fig_RepCor_mrn_pre
fig_RepCor_mrn <- annotate_figure(fig_RepCor_mrn_pre,
top = text_grob("Correlation of technical repeat samples mRNA", color = "black", face = "bold", size = 20))
fig_RepCor_mrn
# one sample with 3 reps
# "G23" "G24" "G25"
triplet_A_MM <- list(
A_MM_G23.G24 = c('G23', 'G24'),
A_MM_G23.G25 = c('G23', 'G25'),
A_MM_G24.G25 = c('G24', 'G25'))
pl.corr_mrn_triplet <- purrr::map(triplet_A_MM, plot_corr_TechReps, exp_cou_log2)
fig_RepCor_mrn_trip_pre <- ggarrange(
pl.corr_mrn_triplet$A_MM_G23.G24 + ggtitle("Donor A, MM, G23 G24"),
pl.corr_mrn_triplet$A_MM_G23.G25 + ggtitle("Donor A, MM, G23 G25"),
pl.corr_mrn_triplet$A_MM_G24.G25 + ggtitle("Donor A, MM, G24 G25"),
legend = 'none')
fig_RepCor_mrn_trip_pre
# define sample to remove
# toRemove_lowQ <- c('G2b') # included in PCA based outliers!
## more general function save later
# extract data
extract_PC1to4 <- function(res.pca){
dat.pc12 <- res.pca$x %>% as.data.frame() %>%
rownames_to_column('id') %>%
dplyr::select(id, PC1, PC2, PC3, PC4)
}
#
prep4_pca <- function(exp, san) {
exp <- exp[complete.cases(exp),]
fid <- exp$fid
exp$fid <- NULL
sid <- colnames(exp)
expT <- t(exp)
expT <- matrix(as.numeric(expT), # Convert to numeric matrix
ncol = ncol(expT))
expT <- scale(expT, center = T, scale = T)
colnames(expT) <- fid
rownames(expT) <- sid
res.pca <- prcomp(expT)
dat.pca <- extract_PC1to4(res.pca)
dat.pca$stage <- san$population_general[match(dat.pca$id, san$id)]
return(dat.pca)
}
dat.pca.cou_log2 <- prep4_pca(exp_cou_log2, san_mrn_temp)
dat.pca.tpm_log2 <- prep4_pca(exp_tpm_log2, san_mrn_temp)
dat.pca.cou_vsn <- prep4_pca(exp_cou_vsn, san_mrn_temp)
dat.pca.tpm_vsn <- prep4_pca(exp_tpm_vsn, san_mrn_temp)
plot_myPCA <- function(dat.pca){
dat.pca %>%
ggplot(aes(x= PC1, y = PC2, color = stage))+
geom_point(size = 3)+
scale_color_manual(values = colors_stage)
}
plot_myPCA_labeled <- function(dat.pca){
dat.pca %>%
ggplot(aes(x= PC1, y = PC2, color = stage, label = id))+
geom_point(size = 3)+
geom_text_repel() +
scale_color_manual(values = colors_stage)
}
p.pca.cou_lg2 <- plot_myPCA(dat.pca.cou_log2)
p.pca.tpm_lg2 <- plot_myPCA(dat.pca.tpm_log2)
p.pca.cou_vsn <- plot_myPCA(dat.pca.cou_vsn)
p.pca.tpm_vsn <- plot_myPCA(dat.pca.tpm_vsn)
fig_PCA_mrn_pre <- ggarrange(
p.pca.cou_lg2 + ggtitle("Counts log2"),
p.pca.cou_vsn + ggtitle("Counts vsn"),
p.pca.tpm_lg2 + ggtitle("TPM log2"),
p.pca.tpm_vsn + ggtitle("TPM vsn"),
legend = 'none')
fig_PCA_mrn_pre
fig_PCA_mrn <- annotate_figure(fig_PCA_mrn_pre,
top = text_grob("PCA of mRNA nromalisations", color = "black", face = "bold", size = 20))
fig_PCA_mrn
plot_myPCA_labeled(dat.pca.cou_vsn) +
ggtitle("PCA of mRNA counts, vsn")
toRemove_mrn_outliers <- c('G2b', 'G7', 'G6', 'G13', 'R1_MM_C')
toRemMrn <- c(toRemove_mrn_outliers)
length(toRemMrn) # 5
# san
san_mrn_clean <- san_mrn_temp %>% filter(! id %in% toRemMrn)
nrow(san_mrn_clean) # 44 samples
## exp
# the other normalisations are not needed!
# counts
exp_cou_non_clean <- exp_cou_non[! colnames(exp_cou_non) %in% toRemMrn]
exp_cou_log2_clean <- exp_cou_log2[! colnames(exp_cou_log2) %in% toRemMrn]
exp_cou_vsn_clean <- exp_cou_vsn[! colnames(exp_cou_vsn) %in% toRemMrn]
# tpm
exp_tpm_non_clean <- exp_tpm_non[! colnames(exp_tpm_non) %in% toRemMrn]
exp_tpm_log2_clean <- exp_tpm_log2[! colnames(exp_tpm_log2) %in% toRemMrn]
exp_tpm_vsn_clean <- exp_tpm_vsn[! colnames(exp_tpm_vsn) %in% toRemMrn]
# combine tech reps and subpopulations (identified by hand)
# fct to get IDs by Yidi
#
# get_Rep_ids_mrn_reps <- function(DONOR, STAGE){
# san_to_combine %>%
# dplyr::filter(donor == DONOR) %>%
# dplyr::filter(population_general == STAGE) %>%
# pull(id)
# }
#
# donors2comb <- unique(san_to_combine$donor)
# stages2comb <- unique(san_to_combine$population_general)
# ids <- list()
# for(i in donors2comb){
# for(j in stages2comb){
# id <- paste(i,j,sep = '_')
# ids[[id]] <- get_Rep_ids_mrn_reps(i,j)
# }
# }
#
# rep_ids2comb <- purrr::compact(ids)
# rep_ids2comb # not perfect bc contains one singleton. But fair enough to work with
# create labels (copy into google sheets. prep there. download and integrate below)
label_generator_mrn_pre <- san_mrn_clean %>% select(id, donor, population_general, population_specific)
# write_clip(label_generator_mrn_pre)
# form medians via function
combine_samples_mrn <- function(exp){
exp_c <- exp
## A
# B reps in A
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(B_a1 = median(c(G16, G17), na.rm = T)) %>%
select(- c(G16, G17))
# MM triplet
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(MM_a1 = median(c(G23, G24, G25), na.rm = T)) %>%
select(- c(G23, G24, G25))
## B: none
## C
# B
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(eB_c = median(c(R1_eB_C, R2_eB_C), na.rm = T)) %>%
select(- c(R1_eB_C, R2_eB_C))
# MC
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(eMC_c = median(c(R2_eMC_C, R1_eMC_C), na.rm = T)) %>%
select(- c(R2_eMC_C, R1_eMC_C))
# S
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(S_c = median(c(R1_S_C, R2_S_C), na.rm = T)) %>%
select(- c(R1_S_C, R2_S_C))
## D
# B
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(lB_d = median(c(R1_lB_D, R2_lB_D), na.rm = T)) %>%
select(- c(R1_lB_D, R2_lB_D))
# MM
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(MM_d = median(c(R2_MM_D, R1_MM_D), na.rm = T)) %>%
select(- c(R2_MM_D, R1_MM_D))
# PMN
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(PMN_d = median(c(R2_PMN_D, R1_PMN_D), na.rm = T)) %>%
select(- c(R2_PMN_D, R1_PMN_D))
## E
# MC
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(eMC_e = median(c(R1_eMC_E, R2_eMC_E), na.rm = T)) %>%
select(- c(R1_eMC_E, R2_eMC_E))
return(exp_c)
}
rename_cols_mrn <- function(exp){
exp_c <- exp
exp_c %<>% rename(eMC_b = R1_eMC_B)
exp_c %<>% rename(lMC_b = R1_lMC_B)
exp_c %<>% rename(eMC_a = R1_eMC_A)
exp_c %<>% rename(lMC_a = R1_lMC_A)
exp_c %<>% rename(B_a2 = R1_B_A)
exp_c %<>% rename(MB_a = R1_MB_A)
exp_c %<>% rename(MC_a = G2a)
exp_c %<>% rename(MM_a2 = R1_MM_A)
exp_c %<>% rename(PM_a = R1_PM_A)
exp_c %<>% rename(PMN_a = R1_PMN_A)
exp_c %<>% rename(S_a = R1_S_A)
exp_c %<>% rename(B_b = R1_B_B)
exp_c %<>% rename(MB_b = R1_MB_B)
exp_c %<>% rename(MM_b = R1_MM_B)
exp_c %<>% rename(PM_b = R1_PM_B)
exp_c %<>% rename(PMN_b = R1_PMN_B)
exp_c %<>% rename(MB_c = R1_MB_C)
exp_c %<>% rename(PM_c = R1_PM_C)
exp_c %<>% rename(eMC_d = R1_eMC_D)
exp_c %<>% rename(B_e = R1_eB_E)
exp_c %<>% rename(MB_e = R1_MB_E)
exp_c %<>% rename(MM_e = R1_MM_E)
exp_c %<>% rename(PM_e = R1_PM_E)
exp_c %<>% rename(PMN_e = R1_PMN_E)
exp_c %<>% rename(S_e = R1_S_E)
return(exp_c)
}
### apply to data
### COUNTS
# non
exp_cou_non_clean_comb <- combine_samples_mrn(exp_cou_non_clean)
exp_cou_non_clean_comb <- rename_cols_mrn(exp_cou_non_clean_comb)
sort(names(exp_cou_non_clean_comb))
sort(names(exp_counts_RAW))
# log2
exp_cou_log2_clean_comb <- combine_samples_mrn(exp_cou_log2_clean)
exp_cou_log2_clean_comb <- rename_cols_mrn(exp_cou_log2_clean_comb)
# vsn
exp_cou_vsn_clean_comb <- combine_samples_mrn(exp_cou_vsn_clean)
exp_cou_vsn_clean_comb <- rename_cols_mrn(exp_cou_vsn_clean_comb)
### TPM
# non
exp_tpm_non_clean_comb <- combine_samples_mrn(exp_tpm_non_clean)
exp_tpm_non_clean_comb <- rename_cols_mrn(exp_tpm_non_clean_comb)
sort(names(exp_tpm_non_clean_comb))
# log2
exp_tpm_log2_clean_comb <- combine_samples_mrn(exp_tpm_log2_clean)
exp_tpm_log2_clean_comb <- rename_cols_mrn(exp_tpm_log2_clean_comb)
# vsn
exp_tpm_vsn_clean_comb <- combine_samples_mrn(exp_tpm_vsn_clean)
exp_tpm_vsn_clean_comb <- rename_cols_mrn(exp_tpm_vsn_clean_comb)
# copy to gSheets
#write_clip(san_mrn_clean)
#colnames(exp_tpm_vsn_clean_comb)[-1]
#write_clip(colnames(exp_tpm_vsn_clean_comb)[-1])
san_mrn_pre <- read_csv("/home/isar/Dropbox/scn_moa/data/gnp/mrn/san_mrn_final - Sheet1.csv")
san_mrn_pre %<>% filter(! is.na(label_final))
setdiff(colnames(exp_tpm_vsn_clean_comb), san_mrn_pre$label_final) # OK
setdiff(san_mrn_pre$label_final, colnames(exp_tpm_vsn_clean_comb)) # OK
# Finalize
san_mrn_fin <- san_mrn_pre %>% select(
id = label_final,
stage = population_general,
stage_nr = population_hirachical_number,
substage = population_specific,
donor = donor,
sex = sex,
age = age
)
# eMC_b missed in labeling. Add
san_mrn_fin[san_mrn_fin$id == 'eMC_b', 'stage'] <- 'MC'
san_mrn_fin$donor <- tolower(san_mrn_fin$donor)
san_mrn_fin$idI <- make_unique(san_mrn_fin$stage, sep = "_", wrap_in_brackets = FALSE)
# align cols in all exp
identical(colnames(exp_cou_non_clean_comb), colnames(exp_cou_vsn_clean_comb)) # need to align!
identical(rownames(exp_cou_non_clean_comb), rownames(exp_cou_vsn_clean_comb)) # OK!
identical(rownames(exp_tpm_non_clean_comb), rownames(exp_cou_vsn_clean_comb)) # OK!
order_colnames <- sort(colnames(exp_cou_non_clean_comb))
order_colnames <- moveMe(order_colnames, "fid first")
exp_cou_non_clean_comb <- exp_cou_non_clean_comb[order_colnames]
exp_cou_log2_clean_comb <- exp_cou_log2_clean_comb[order_colnames]
exp_cou_vsn_clean_comb <- exp_cou_vsn_clean_comb[order_colnames]
exp_tpm_non_clean_comb <- exp_tpm_non_clean_comb[order_colnames]
exp_tpm_log2_clean_comb <- exp_tpm_log2_clean_comb[order_colnames]
exp_tpm_vsn_clean_comb <- exp_tpm_vsn_clean_comb[order_colnames]
identical(colnames(exp_cou_non_clean_comb), colnames(exp_cou_vsn_clean_comb))
# order san by cols
san_mrn_fin <- san_mrn_fin[order(match(san_mrn_fin$id, colnames(exp_tpm_vsn_clean_comb)[-1])),]
identical(san_mrn_fin$id, colnames(exp_tpm_vsn_clean_comb)[-1])
# check NAs
NAinCOU_non <- is.na(exp_cou_non_clean_comb)
NAinCOU_log2 <- is.na(exp_cou_log2_clean_comb)
NAinCOU_vsn <- is.na(exp_cou_vsn_clean_comb)
NAinVSN_non <- is.na(exp_tpm_non_clean_comb)
NAinVSN_log2 <- is.na(exp_tpm_log2_clean_comb)
NAinVSN_vsn <- is.na(exp_tpm_vsn_clean_comb)
identical(NAinCOU_non, NAinCOU_vsn) # OK
identical(NAinCOU_vsn, NAinCOU_log2) # OK
# ALL DATA HAVE THE SAME NAs!
# COUNTS
exp_4filt2_long <- exp_tpm_vsn_clean_comb %>% pivot_longer(!fid, names_to = 'id', values_to = "value")
exp_4filt2_long$stage <- san_mrn_fin$stage[match(exp_4filt2_long$id, san_mrn_fin$id)]
# not NA/group
notNAperGeneANDgroup_final <- exp_4filt2_long %>%
group_by(fid, stage) %>%
summarise(sum_notNA = sum(!is.na(value)))
# create filter
summary_notNA_final <- notNAperGeneANDgroup_final %>% group_by(fid) %>%
filter(sum(sum_notNA>1)>1, sum_notNA > 1) %>%
summarise(tot_valid = n(), valid_groups = str_c(stage, collapse = ';'))
table(summary_notNA_final$tot_valid)
# min2 in all 7 groups = good for DEX
fid_min2INallGroups_final <- summary_notNA_final %>%
filter(tot_valid == 7) %>% pull(fid)
length(fid_min2INallGroups_final) # 8431
## remove from exp
# COU
exp_cou_non_preF <- exp_cou_non_clean_comb %>% filter(fid %in% fid_min2INallGroups_final)
exp_cou_log2_preF <- exp_cou_log2_clean_comb %>% filter(fid %in% fid_min2INallGroups_final)
exp_cou_vsn_preF <- exp_cou_vsn_clean_comb %>% filter(fid %in% fid_min2INallGroups_final)
# VSN
exp_tpm_non_preF <- exp_tpm_non_clean_comb %>% filter(fid %in% fid_min2INallGroups_final)
exp_tpm_log2_preF <- exp_tpm_log2_clean_comb %>% filter(fid %in% fid_min2INallGroups_final)
exp_tpm_vsn_preF <- exp_tpm_vsn_clean_comb %>% filter(fid %in% fid_min2INallGroups_final)
# load data pro
load(paste0(dir_moa, "/data/final/old/", "pro_prefinal.RData"))
# manually curated IDs to harmonize mRNA and proteins
fan_complete_pre <- read_csv(paste0(dir_moa, "/data/anno/", "fan_complete.csv") )
# setdiff(exp_tpm_vsn_clean_comb$fid, fan_complete_pre$ENSG) # 231 missing
# get annotation and add to master fan
# get fan for fid not in master fan
fan_mrn_missing <- data.frame(ENSG = setdiff(exp_tpm_vsn_clean_comb$fid, fan_complete_pre$ENSG)) # 231
# check for doubles
n_occur_ENSG <- data.frame(table(fan_mrn_missing$ENSG))
n_occur_ENSG[n_occur_ENSG$Freq > 1,] # no doubles
sum(is.na(fan_mrn_missing$ENSG)) # no NA
## main IDs from org.Hs.eg.db
fids_mrn <- AnnotationDbi::select(org.Hs.eg.db, keys= fan_mrn_missing$ENSG, columns= c("SYMBOL", "UNIPROT", "ENTREZID", "GENENAME"), keytype="ENSEMBL")
nrow(fids_mrn)
fids_mrn_temp <- fids_mrn %>%
group_by(ENSEMBL) %>%
dplyr::summarize(UNIPROT = paste(UNIPROT, collapse = ", "), .groups = 'drop')
nrow(fids_mrn_temp)
# remove double entries, keep only first
fids_mrn <- fids_mrn %>% distinct(ENSEMBL, .keep_all = T)
fids_mrn %<>% rename(ENSG = ENSEMBL, NAME_PROT = GENENAME)
fids_mrn$UNIPROT_mult <- fids_mrn_temp$UNIPROT[match(fids_mrn$ENSG, fids_mrn_temp$ENSEMBL)]
# nit actually important and only confusing. Keep it simple
nrow(fids_mrn) # 231. ok
# intersect(fids_mrn$UNIPROT, fan_pro_temp$UNIPROT) # no overlaps! good (all new entries = list is clean)
# add fids to fan
names(fan_complete_pre)
names(fids_mrn)
fids_mrn_toAdd <- fids_mrn %>% select(ENSG, UNIPROT, SYMBOL, ENTREZ = ENTREZID)
fan_complete_pro_mrn_PRE <- rbind(fan_complete_pre, fids_mrn_toAdd)
# check for completion
setdiff(exp_cou_non_preF$fid, fan_complete_pro_mrn_PRE$ENSG) # all contained!
setdiff(exp_pro_lg2_preF3$fid, fan_complete_pro_mrn_PRE$UNIPROT) # all contained!
# also quickcheck SCN
exp_scn_vsn_cor <- read_csv(paste0(dir_moa, "/data/scn/pro/final/", "exp_scn_vsn_cor_STRINGENT.csv"))
setdiff(exp_scn_vsn_cor$UNIPROT, fan_complete_pro_mrn_PRE$UNIPROT) # all contained! PERFECT!
gg_miss_var(fan_complete_pro_mrn_PRE)
# fan_complete_pro_mrn_PRE %>%
# group_by(ENSG) %>%
# miss_var_summary()
# check ENSG
n_occur_ENS <- data.frame(table(fan_complete_pro_mrn_PRE$ENSG))
nrow(n_occur_ENS[n_occur_ENS$Freq > 1,]) # NO doubles
sum(is.na(fan_complete_pro_mrn_PRE$ENSG)) # NO NA
# check SYMBOL
n_occur_SYM <- data.frame(table(fan_complete_pro_mrn_PRE$SYMBOL))
nrow(n_occur_SYM[n_occur_SYM$Freq > 1,]) # NO doubles
sum(is.na(fan_complete_pro_mrn_PRE$SYMBOL)) # 47 NA. Simply copy ENSG
# check
fan_complete_pro_mrn_PRE %>% filter(is.na(SYMBOL)) # RNA genes! Could be interesting...
# copy to google
# fan_complete_pro_mrn_PRE %>% filter(is.na(SYMBOL)) %>% write_clip()
#
fan_complete_pro_mrn_PRE %>% filter(str_detect(SYMBOL, 'ENSG')) # 682 !!!
# it doesnt make sense to import the 40 I have without checking all of these.
# do on a case by case basis of they pop up as difEx
# dont import for now, just copy ENSG to SYMBOLS
fan_complete_pro_mrn_PRE$SYMBOL <- ifelse(is.na(fan_complete_pro_mrn_PRE$SYMBOL), fan_complete_pro_mrn_PRE$ENSG, fan_complete_pro_mrn_PRE$SYMBOL)
sum(is.na(fan_complete_pro_mrn_PRE$SYMBOL)) # ok
n_occur_SYM <- data.frame(table(fan_complete_pro_mrn_PRE$SYMBOL))
nrow(n_occur_SYM[n_occur_SYM$Freq > 1,]) # no doubles
# UNIPROT
sum(is.na(fan_complete_pro_mrn_PRE$UNIPROT)) # 1263
n_occur_UNI <- data.frame(table(fan_complete_pro_mrn_PRE$UNIPROT))
nrow(n_occur_UNI[n_occur_UNI$Freq > 1,]) # no doubles
# ok to proceed!
fan_fin <- fan_complete_pro_mrn_PRE
# change one UNIPROT ids that is outdated
fan_fin$UNIPROT <- recode(fan_fin$UNIPROT,
P30042 = 'P0DPI2')
exp_mrn_cou_non_fin <- transpose_FIDcols(exp_cou_non_preF) # contains NA
exp_mrn_cou_log2_fin <- transpose_FIDcols(exp_cou_log2_preF)
exp_mrn_cou_vsn_fin <- transpose_FIDcols(exp_cou_vsn_preF)
exp_mrn_tpm_non_fin <- transpose_FIDcols(exp_tpm_non_preF) # contains NA
exp_mrn_tpm_log2_fin <- transpose_FIDcols(exp_tpm_log2_preF)
exp_mrn_tpm_vsn_fin <- transpose_FIDcols(exp_tpm_vsn_preF)
# savety check
setdiff(san_mrn_fin$id, rownames(exp_mrn_tpm_vsn_fin)) # all IDs match
setdiff(rownames(exp_mrn_tpm_vsn_fin), san_mrn_fin$id) # all IDs match
identical(rownames(exp_mrn_tpm_vsn_fin), san_mrn_fin$id) # OK!
setdiff(colnames(exp_mrn_tpm_vsn_fin), fan_fin$ENSG) # all fid in master fan
Using NIPALS
library(mixOmics)
nrow(exp_mrn_tpm_non_fin) # 34
# impute the TPM vsn values only
# exp_mrn_tpm_vsn_fin_NIP <- impute.nipals(X = exp_mrn_tpm_vsn_fin, ncomp = 34) # ncomp = n samples
# takes very long to compute. use saved data
load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))
exp_mrn_tpm_vsn_fin_NIP <- gnp.mrn$exp_mrn_tpm_vsn_NIPALS
#exp_mrn_tpm_vsn_fin_NIP
gnp.mrn <- list(
## final data
# counts
exp_mrn_cou_non = exp_mrn_cou_non_fin,
exp_mrn_cou_lg2 = exp_mrn_cou_log2_fin,
exp_mrn_cou_vsn = exp_mrn_cou_vsn_fin,
# TPM
exp_mrn_tpm_non = exp_mrn_tpm_non_fin,
exp_mrn_tpm_lg2 = exp_mrn_tpm_log2_fin,
exp_mrn_tpm_vsn = exp_mrn_tpm_vsn_fin,
exp_mrn_tpm_vsn_NIPALS = exp_mrn_tpm_vsn_fin_NIP,
# meta data
san_mrn = san_mrn_fin,
levels_mrn = levels_mrn,
fan = fan_fin,
## raw data
exp_mrn_cou_raw = exp_counts_pre,
exp_mrn_tpn_raw = exp_tpn_pre,
san_mrn_raw = san_mrn_temp)
save(
gnp.mrn,
file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))