Last update: 23-10-05 Last markdown compiled: 23-10-09 ToDo: - all done

knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE) 

# dirs
## Sebastian @Linux
dir_moa <- "/home/isar/Dropbox/scn_moa/"

# data: only raw data, loaded below

## packages
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))

# load required package collections
sapply(packages_general, require, character.only = TRUE)
sapply(packages_annotation, require, character.only = TRUE)
sapply(packages_missingValues, require, character.only = TRUE)
sapply(packages_normalisation, require, character.only = TRUE)

Import

exp raw

# import: raw counts and tpn
exp_mrn_counts_raw <- read_delim(paste0(dir_moa,"/data/gnp/mrn/original/",   "unstranded_raw_counts.tsv"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)

exp_mrn_tpn_raw <- read_delim(paste0(dir_moa,"/data/gnp/mrn/original/",   "gene_count_unstranded_tpm_norm.tsv"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)

# format ids
names(exp_mrn_counts_raw) <- gsub("-", "_", names(exp_mrn_counts_raw))
names(exp_mrn_tpn_raw) <- gsub("-", "_", names(exp_mrn_tpn_raw))

# make numeric
exp_mrn_counts_raw[,2:50] <- sapply(exp_mrn_counts_raw[,2:50], as.numeric)
exp_mrn_tpn_raw[,2:50] <- sapply(exp_mrn_tpn_raw[,2:50], as.numeric)

# remove and export length info from TPB
gene_length_gnp_mrn <- exp_mrn_tpn_raw %>% select(fid = gene_id, gene_length)
#exp_mrn_tpn_raw$gene_length <- NULL


#### Density check of raw data: not in use
## counts
plot_hist_counts <- exp_mrn_counts_raw %>%
  ggplot(aes(x=G13)) + 
  geom_freqpoly(binwidth = 1) +
  scale_x_continuous(trans='log2') +
  ggtitle("Counts")

plot_hist_tpn <- exp_mrn_tpn_raw %>%
  ggplot(aes(x=G13)) + 
  geom_freqpoly(binwidth = 1) +
  scale_x_continuous(trans='log2') +
  ggtitle("TPN")


# fig FC level comparison
fig_hist_pre <- ggarrange(
  plot_hist_counts,
  plot_hist_tpn
)

fig_hist <- annotate_figure(fig_hist_pre,
            top = text_grob("Histogam of raw mRNA data: counts vs tpn ", 
                color = "black", face = "bold", size = 18),
            #bottom = text_grob("FDR <= 0.05, abs(log FC) >= 1]",
               # color = "black", size = 12              )
               )

fig_hist

# copy as pre and final format
exp_tpn_pre <- exp_mrn_tpn_raw
exp_tpn_pre$gene_length <- NULL
exp_tpn_pre %<>% rename(fid = gene_id)

exp_counts_pre <- exp_mrn_counts_raw
exp_counts_pre %<>% rename(fid = gene_id)


identical(names(exp_tpn_pre), names(exp_counts_pre)) # ok!
identical(exp_tpn_pre$fid, exp_counts_pre$fid) # ok!


nrow(exp_tpn_pre) # 61552
nrow(exp_counts_pre) # 61552

san

# import
san_mrn_raw <- read_csv(paste0(dir_moa,"/data/gnp/mrn/original/", "sequenced_RNA_samples_granulopoiesis - RNA_samples_granulopoiesis.csv"))

# create IDs
san_mrn_raw$id <- san_mrn_raw$name
san_mrn_raw$id <- gsub("-", "_", san_mrn_raw$id)

# san for all exp?
setdiff(san_mrn_raw$id, names(exp_counts_pre)) # way more samples in here bc of micro RNA!
setdiff(names(exp_counts_pre), san_mrn_raw$id) # info for all exp is present -> Reduce san
setdiff(names(exp_tpn_pre), san_mrn_raw$id) # info for all exp is present -> Reduce san

# reduce to samples we have
san_mrn_temp <- san_mrn_raw[san_mrn_raw$id %in% c(names(exp_tpn_pre), "fid"),]

# check
setdiff(san_mrn_temp$id, names(exp_tpn_pre)) #ok!
setdiff(names(exp_tpn_pre), san_mrn_temp$id) #ok!


# add donor details
donor_details <- read_csv(paste0(dir_moa, "/data/gnp/pro/original/", "donor_details - Donors.csv"))
donor_details$donor <- toupper(donor_details$donor)
san_mrn_temp <- left_join(san_mrn_temp, donor_details, by = c("donor"))

Filter insignificantly expressed features

exclude low counts and <3 values/group

counts

exp_counts_unfiltered <- exp_counts_pre

# copy
exp_counts_4filt <- exp_counts_pre # copy raw counts

# set all values below threshold (10) NA
exp_counts_4filt[exp_counts_4filt< 10] = NA

# check NAs per feature and group

exp_counts_4filt_long <- exp_counts_4filt %>% pivot_longer(!fid, names_to = 'id', values_to = "value")
exp_counts_4filt_long$pop_gen <- san_mrn_temp$population_general[match(exp_counts_4filt_long$id, san_mrn_temp$id)]

# not NA/group
notNAperGeneANDgroup_cou <- exp_counts_4filt_long %>% 
  dplyr::group_by(fid, pop_gen) %>% 
  dplyr::summarise(sum_notNA = sum(!is.na(value)))

# create filter
summary_notNA_counts <- notNAperGeneANDgroup_cou %>% group_by(fid) %>%
  filter(sum(sum_notNA>1)>1, sum_notNA > 1) %>%
  dplyr::summarise(tot_valid = n(), valid_groups = str_c(pop_gen, collapse = ';'))
table(summary_notNA_counts$tot_valid)


# min2 in all 7 groups = good for DEX
fid_min2INallGroups_counts <- summary_notNA_counts %>%
  filter(tot_valid == 7) %>% pull(fid)
length(fid_min2INallGroups_counts) # 9663 valis

TPM

Publication on tpm filtering: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4937821/

## Copy the NAs from counts to TPM for consistency
# copy data
exp_tpm_4filt <- exp_tpn_pre

# check consistency
identical(exp_tpm_4filt$fid, exp_counts_pre$fid)
identical(names(exp_tpm_4filt),names(exp_counts_pre))



###### TPM below threshold filtering (abandoned! proceed below)

# median library size = 37 Million
# 10/37 = 0.28 = minimum to filter for in TPN data (check if reads would get the same)
exp_tpm_4filt[exp_tpm_4filt <= 0.28] = NA 
nrow(exp_tpm_4filt)

# check NAs per feature and group
exp_tpm_4filt_long <- exp_tpm_4filt %>% pivot_longer(!fid, names_to = 'id', values_to = "value")
exp_tpm_4filt_long$pop_gen <- san_mrn_temp$population_general[match(exp_tpm_4filt_long$id, san_mrn_temp$id)]

# not NA/group
notNAperGeneANDgroup_tpm <- exp_tpm_4filt_long %>%
  group_by(fid, pop_gen) %>%
  summarise(sum_notNA = sum(!is.na(value)))
length(unique(notNAperGeneANDgroup_tpm$fid)) # 61552

# create filter
summary_notNA_tpm <- notNAperGeneANDgroup_tpm %>% group_by(fid) %>%
  filter(sum(sum_notNA>1)>1, sum_notNA > 1) %>%
  summarise(tot_valid = n(), valid_groups = str_c(pop_gen, collapse = ';'))

table(summary_notNA_tpm$tot_valid) # 10009 in 7
table(summary_notNA_counts$tot_valid) # 9663 in 7


# min2 in all 7 groups = good for DEX
fid_min2INallGroups_tpm <- summary_notNA_tpm %>%
  filter(tot_valid == 7) %>% pull(fid)
length(fid_min2INallGroups_tpm)

remove low features

# compare invlaid in TPM and counts
# fid_min2INallGroups_counts
# fid_min2INallGroups_tpm

length(intersect(fid_min2INallGroups_counts, fid_min2INallGroups_tpm))
# 8897 intersecting: keep only these

fid_min2inALL_CountsANDtpm <- intersect(fid_min2INallGroups_counts, fid_min2INallGroups_tpm)
length(fid_min2inALL_CountsANDtpm) # not final yet becuase I will summarize groups below

#### copy usable data at this point

# COUNTS
exp_counts_RAW <- exp_counts_pre # all features with count threshold = 10 (61.552)
exp_counts_filtered <- exp_counts_pre %>% filter(fid %in% fid_min2inALL_CountsANDtpm)

nrow(exp_counts_RAW) # 61552
nrow(exp_counts_filtered) # 8897

# set threshold to NA in filtered
exp_counts_filtered[exp_counts_filtered < 10] = NA
NAinCounts <- is.na(exp_counts_filtered)


# TPM
exp_tpm_RAW <- exp_tpn_pre
exp_tpm_filtered <- exp_tpn_pre %>% filter(fid %in% fid_min2inALL_CountsANDtpm)

nrow(exp_tpm_RAW) # 61552
nrow(exp_tpm_filtered) # 8897

exp_tpm_filtered[exp_tpm_filtered <= 0.28] = NA 
NAinTPM <- is.na(exp_tpm_filtered)

# copy NAs between the two DFs
exp_tpm_filtered[NAinCounts] <- NA
exp_counts_filtered[NAinTPM] <- NA


# creare non data
exp_tpm_non <- exp_tpm_filtered
exp_cou_non <- exp_counts_filtered

Histograms

# colors by sample
colors_ids <- san_mrn_temp %>% select(id, population_general)
colors_ids$color <- colors_stage[match(colors_ids$population_general, names(colors_stage))]
colors_ids$population_general <- NULL

col_ids <- as.vector(colors_ids$color)
names(col_ids) <- colors_ids$id



## density histogram plotter
# with log x
plot_sample_density_log <- function(exp, title) {
  exp_long <- exp %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
  exp_long$stage <- san_mrn_temp$population_general[match(exp_long$id, san_mrn_temp$id)]
  exp_long$stage <- factor(exp_long$stage, levels = levels_mrn)
  exp_long$donor <- san_mrn_temp$donor[match(exp_long$id, san_mrn_temp$id)]
  exp_long %>%
    #filter(donor == 'B') %>%
    ggplot(aes(x = value, color = id)) +
      geom_density(na.rm = T, bw = 'SJ') +
      scale_x_continuous(trans='log2') +
      scale_color_manual(values = col_ids) +
      theme(legend.position = "none") +
      ggtitle(title) +
      scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),
              labels = trans_format("log10", math_format(10^.x))) 
}
# without log x
plot_sample_density_noLog <- function(exp, title) {
  exp_long <- exp %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
  exp_long$stage <- san_mrn_temp$population_general[match(exp_long$id, san_mrn_temp$id)]
  exp_long$stage <- factor(exp_long$stage, levels = levels_mrn)
  exp_long$donor <- san_mrn_temp$donor[match(exp_long$id, san_mrn_temp$id)]
  exp_long %>%
    #filter(donor == 'B') %>%
    ggplot(aes(x = value, color = id)) +
      geom_density(na.rm = T, bw = 'SJ') +
      #scale_x_continuous(trans='log2') +
      scale_color_manual(values = col_ids) +
      theme(legend.position = "none") +
      ggtitle(title)
}

# apply
p.hist.tpm_unfiltered <- plot_sample_density_log(exp_tpm_RAW, "TPM unfiltered")
p.hist.tpm_filtered <- plot_sample_density_log(exp_tpm_filtered, "TPM filtered")
p.hist.cou_unfiltered <- plot_sample_density_log(exp_counts_RAW, "Counts unfiltered")
p.hist.cou_filtered <- plot_sample_density_log(exp_counts_filtered, "Counts filtered")

fig_hist_mrn_fitereffect_pre <- ggarrange(
    p.hist.cou_unfiltered,
    p.hist.cou_filtered,
    p.hist.tpm_unfiltered,
    p.hist.tpm_filtered,
    ncol = 2,
    nrow = 2)
fig_hist_mrn_fitereffect_pre

Sequencing QCs

names(san_mrn_temp)

dat.qc.mrn <- san_mrn_temp %>% dplyr::select(id, stage = population_general, reads, mbases_yield, mean_quality, bases_over_30)

dat.qc.mrn$stage <- factor(dat.qc.mrn$stage, levels = levels_mrn)
dat.qc.mrn$stageNUM <- as.numeric(dat.qc.mrn$stage)
# dat.qc.mrn$id <- reorder(dat.qc.mrn$id, dat.qc.mrn$stage)

# reads
dat.qc.mrn %>%
  ggplot(aes(x = reorder(id, stageNUM), y = reads, fill = stage)) +
  geom_bar(stat = 'identity') +
  scale_fill_manual(values = colors_stage) +
  theme(axis.text.x = element_text(angle=30, hjust=1)) +
  ggtitle("Total reads per sample in mRNA")

# MB yield
dat.qc.mrn %>%
  ggplot(aes(x = reorder(id, stageNUM), y = mbases_yield, fill = stage)) +
  geom_bar(stat = 'identity') +
  scale_fill_manual(values = colors_stage) +
  theme(axis.text.x = element_text(angle=30, hjust=1)) +
  ggtitle("MByields per sample in mRNA")

# Mean quality
dat.qc.mrn %>%
  ggplot(aes(x = reorder(id, stageNUM), y = mean_quality, fill = stage)) +
  geom_bar(stat = 'identity') +
  scale_fill_manual(values = colors_stage) +
  theme(axis.text.x = element_text(angle=30, hjust=1)) +
  ggtitle("Mean quality per sample in mRNA")

# Bases over 30
dat.qc.mrn %>%
  ggplot(aes(x = reorder(id, stageNUM), y = bases_over_30, fill = stage)) +
  geom_bar(stat = 'identity') +
  scale_fill_manual(values = colors_stage) +
  theme(axis.text.x = element_text(angle=30, hjust=1)) +
  ggtitle("Mean quality per sample in mRNA")

Normalisation

log2

exp_cou_4log2 <- FIDtoRow(exp_cou_non)
mx_cou_4log2 <- df2m(exp_cou_4log2)
min(mx_cou_4log2)
min(mx_cou_4log2, na.rm = T) # OK
# exp_cou_log2 <- exp_cou_log2 +1 # all +1 to remove negative values!
exp_cou_log2 <- log2(mx_cou_4log2)
exp_cou_log2 <- as.data.frame(exp_cou_log2)
exp_cou_log2$fid <- rownames(exp_cou_log2) # has -Inf! 



exp_tpm_4log2 <- FIDtoRow(exp_tpm_non)
mx_tpm_4log2 <- df2m(exp_tpm_4log2)
min(mx_tpm_4log2) # NA
min(mx_tpm_4log2, na.rm = T) # 0.28 -> all 0.x will give negative values
#mx_tpm_4log2_plus1 <- mx_tpm_4log2 +1 # all +1 to remove negative values!
#min(mx_tpm_4log2_plus1, na.rm = T) # 1.28. perfect
# ACTUALLY< THE +1 is not worth it because VSN keeps putting out negative values.
# BETTER TO KEEP IT AS IT IS!
exp_tpm_log2 <- log2(mx_tpm_4log2) # has some negative values
exp_tpm_log2 <- as.data.frame(exp_tpm_log2)
exp_tpm_log2$fid <- rownames(exp_tpm_log2) # 

VSN

Comparison of normalisation methods in proteomics: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2680204/

Vignette on VSN: https://www.bioconductor.org/packages/devel/bioc/vignettes/vsn/inst/doc/A-vsn.html

VSN application in expression data: https://academic.oup.com/bioinformatics/article/18/suppl_1/S96/231881

First author on VSN paper: https://www.embl.org/groups/huber/

Why VSN need non log data: https://support.bioconductor.org/p/13221/

eSet

# VSN needs counts, not log2!

# copy non log2 exp mx from above
mx_4eSet_cou <- mx_cou_4log2 # min = 10
mx_4eSet_tpm <- mx_tpm_4log2 # avoid negative values with +1 
min(mx_4eSet_tpm, na.rm = T) # 0.28... alright

# match san order for pheno creation
san_4eSet <- san_mrn_temp[match(colnames(mx_4eSet_cou), san_mrn_temp$id),]
san_4eSet <-  as.data.frame(san_4eSet)
rownames(san_4eSet) <- san_4eSet$id
pheno_eSet <- new("AnnotatedDataFrame", data = san_4eSet)

# compile
eSet_tpm_non <- ExpressionSet (
      assayData = mx_4eSet_tpm, #exprs
      phenoData = pheno_eSet) #pData

eSet_cou_non <- ExpressionSet (
      assayData = mx_4eSet_cou, #exprs
      phenoData = pheno_eSet) #pData

exec

# normalize (uses eSet with raw data)
eSet_tpm_vsn = justvsn(eSet_tpm_non)
eSet_cou_vsn = justvsn(eSet_cou_non)

## QC of VSN: needs to be discussed

# # vector of generalized log ratios
# M_cou_non = exprs(eSet_cou_non)[,1] - exprs(eSet_cou_non)[,2]
# M_cou_vsn = exprs(eSet_cou_vsn)[,1] - exprs(eSet_cou_vsn)[,2]
# 
# M_tpm_non = exprs(eSet_tpm_non)[,1] - exprs(eSet_tpm_non)[,2]
# M_tpm_vsn = exprs(eSet_tpm_vsn)[,1] - exprs(eSet_tpm_vsn)[,2]
# 
# # histogram of the generalized log-ratios
# hist(M_cou_non, breaks = 100, col = "#d95f0e") # data not well normalized
# hist(M_cou_vsn, breaks = 100, col = "#d95f0e") # very nice
# 
# hist(M_tpm_non, breaks = 100, col = "#d95f0e") 
# hist(M_tpm_vsn, breaks = 100, col = "#d95f0e") 
# 
# # graphs
# meanSdPlot(eSet_cou_non)
# meanSdPlot(eSet_cou_vsn) 
# 
# meanSdPlot(eSet_tpm_non)
# meanSdPlot(eSet_tpm_vsn) 



# extract
exp_tpm_vsn <- exprs(eSet_tpm_vsn)
min(exp_tpm_vsn, na.rm = T) # has negative values. has NAs
exp_tpm_vsn <- as.data.frame(exp_tpm_vsn)
exp_tpm_vsn$fid <- rownames(exp_tpm_vsn) 

exp_cou_vsn <- exprs(eSet_cou_vsn)
min(exp_cou_vsn, na.rm = T) # no - values. has NAs
exp_cou_vsn <- as.data.frame(exp_cou_vsn)
exp_cou_vsn$fid <- rownames(exp_cou_vsn)

QN (quantile normalisation)

DISCARDED! NO NEED FOR THIS!

https://davetang.org/muse/2014/07/07/quantile-normalisation-in-r/

using log2 values bc of the examples given

# 
# # counts
# exp_cou_qn  <-  normalize.quantiles(exp_4eSet_cou)
# exp_cou_qn <- as.data.frame(exp_cou_qn)
# rownames(exp_cou_qn) <- rownames(exp_4eSet_cou)
# colnames(exp_cou_qn) <- colnames(exp_4eSet_cou)
# exp_cou_qn$fid <- rownames(exp_cou_qn) 
# 
# # tpm
# exp_tpm_qn  <-  normalize.quantiles(exp_4eSet_tpm)
# exp_tpm_qn <- as.data.frame(exp_tpm_qn)
# rownames(exp_tpm_qn) <- rownames(exp_4eSet_tpm)
# colnames(exp_tpm_qn) <- colnames(exp_4eSet_tpm)
# exp_tpm_qn$fid <- rownames(exp_tpm_qn)

MN (median normalisation)

DISCARDED! NO NEED FOR THIS!

Recommended by Gergely, uses minimal assumptions

# 
# exp_cou_mn <- normalizeMedianValues(exp_4eSet_cou) # limma
# exp_cou_mn <- as.data.frame(exp_cou_mn)
# exp_cou_mn$fid <- rownames(exp_cou_mn)
# colnames(exp_cou_mn)
# 
# exp_tpm_mn <- normalizeMedianValues(exp_4eSet_tpm) # limma
# exp_tpm_mn <- as.data.frame(exp_4eSet_tpm)
# exp_tpm_mn$fid <- rownames(exp_4eSet_tpm)
# colnames(exp_tpm_mn)

Analyze norm. effect

Histogram

#p.hist.cou_non <- plot_sample_density_noLog(exp_cou_non, "Counts non")
p.hist.cou_log2 <- plot_sample_density_noLog(exp_cou_log2, "Counts log2")
p.hist.cou_vsn <- plot_sample_density_noLog(exp_cou_vsn, "Counts vsn")
#p.hist.cou_qn <- plot_sample_density(exp_cou_qn, "Counts qn")
#p.hist.cou_mn <- plot_sample_density_log(exp_cou_mn, "Counts mn")

p.hist.tpm_log2 <- plot_sample_density_noLog(exp_tpm_log2, "TPM log2")
p.hist.tpm_vsn <- plot_sample_density_noLog(exp_tpm_vsn, "TPM vsn")
#p.hist.tpm_qn <- plot_sample_density(exp_tpm_qn, "TPM qn")
#p.hist.tpm_mn <- plot_sample_density_log(exp_tpm_mn, "TPM mn")

fig_hist_norm_pre <- ggarrange(
    p.hist.cou_log2,
    p.hist.cou_vsn,
    #p.hist.cou_mn,
    p.hist.tpm_log2,
    p.hist.tpm_vsn,
    #p.hist.tpm_mn,
    ncol = 2,
    nrow = 2)
fig_hist_norm_pre

fig_hist_norm <- annotate_figure(fig_hist_norm_pre,
               top = text_grob("Normalisation effects on transcriptome counts and TPM", color = "black", face = "bold", size = 18))
fig_hist_norm

Boxplots

### fct in base script
# pepExp_2long_fidCol <- function(exp, san, LEVELS){
#   exp_long <- exp %>% pivot_longer(!fid, names_to = 'id', values_to = "value")
#   exp_long$stage <- san$population_general[match(exp_long$id, san$id)]
#   exp_long$stage <- factor(exp_long$stage , levels = LEVELS)
#   return(exp_long)
# }

exp_tpm_log2_long <- pepExp_2long_fidCol(exp_tpm_log2, san_mrn_temp, levels_mrn)
exp_tpm_vsn_long <- pepExp_2long_fidCol(exp_tpm_vsn, san_mrn_temp, levels_mrn)
# exp_tpm_mn_long <- pepExp_2long_fidCol(exp_tpm_mn, san_mrn_temp, levels_mrn)

exp_cou_log2_long <- pepExp_2long_fidCol(exp_cou_log2, san_mrn_temp, levels_mrn)
exp_cou_vsn_long <- pepExp_2long_fidCol(exp_cou_vsn, san_mrn_temp, levels_mrn)
# exp_cou_mn_long <- pepExp_2long_fidCol(exp_cou_mn, san_mrn_temp, levels_mrn)

plot_box_4normCheck <- function(exp_long, title){
  exp_long %>%
    arrange(stage) %>%
    ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
    geom_boxplot() +
  #theme(axis.text.x=element_text(angle=45, hjust=1)) +
  #coord_flip() +
    ggtitle(title) +
    xlab('Sample')+
    ylab("Expression")+
    theme(axis.text.x=element_text(angle=45, hjust=1),
          axis.text=element_text(size=7)) +
    scale_fill_manual(values = colors_stage)
}

plot_box_4normCheck_logY <- function(exp_long, title){
  exp_long %>%
    arrange(stage) %>%
    ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
    geom_boxplot() +
  #theme(axis.text.x=element_text(angle=45, hjust=1)) +
  #coord_flip() +
    ggtitle(title) +
    xlab('Sample')+
    ylab("Expression")+
    theme(axis.text.x=element_text(angle=45, hjust=1),
          axis.text=element_text(size=7)) +
    scale_fill_manual(values = colors_stage) +
    scale_y_continuous(trans='log2')
}

p.box.cou_log2 <- plot_box_4normCheck(exp_cou_log2_long, "cou log2")
p.box.cou_vsn <- plot_box_4normCheck(exp_cou_vsn_long, "cou vsn")
p.box.tpm_log2 <- plot_box_4normCheck(exp_tpm_log2_long, "TPM log2")
p.box.tpm_vsn <- plot_box_4normCheck(exp_tpm_vsn_long, "TPM vsn")


# fig
fig_boxes_pre <- ggarrange(
  p.box.cou_log2, 
  p.box.cou_vsn,
  p.box.tpm_log2,
  p.box.tpm_vsn,
  legend = 'none')
fig_boxes_pre

fig_boxes <- annotate_figure(fig_boxes_pre,
               top = text_grob("Total expression per sample", color = "black", face = "bold", size = 20))
fig_boxes

Replicates

How do the technical repeat samples behave? Samples were distributed in aliquots at the date of collection.

# isolate techReps
san_mrn_techResp <- san_mrn_temp %>% filter(has_tech_rep == T)

# all tech Reps are the same subpopulation!

# fct to get IDs by Yidi
get_Rep_ids_mrn <- function(DONOR, STAGE){
  san_mrn_techResp %>%
    dplyr::filter(donor == DONOR) %>%
    dplyr::filter(population_general == STAGE) %>%
    pull(id)
}

donors <- unique(san_mrn_techResp$donor)
stages <- unique(san_mrn_techResp$population_general)
ids <- list()
for(i in donors){
  for(j in stages){
    id <- paste(i,j,sep = '_')
    ids[[id]] <- get_Rep_ids_mrn(i,j)
  }
}

rep_ids_mrn <- purrr::compact(ids)

# plots
plot_corr_TechReps <- function(samples, data){
  data %>% dplyr::select(all_of(samples)) %>%
    rename(Rep1 = 1, Rep2 = 2) %>%
    ggplot(aes(x = Rep1, y = Rep2)) +
    geom_point() +
    stat_cor(method = "spearman", size = 3, na.rm = T)
}

pl.corr_mrn <- purrr::map(rep_ids_mrn, plot_corr_TechReps, exp_cou_log2)

# COUNTS
fig_RepCor_mrn_pre <- ggarrange(
  pl.corr_mrn$A_MB + ggtitle("Donor A, stage MB"), 
  pl.corr_mrn$A_MC + ggtitle("Donor A, stage MC"), 
  pl.corr_mrn$A_MM + ggtitle("Donor A, stage MM"), 
  pl.corr_mrn$A_B + ggtitle("Donor A, stage B"), 
  pl.corr_mrn$C_MC + ggtitle("Donor C, stage MC"), 
  pl.corr_mrn$C_B + ggtitle("Donor C, stage B"), 
  pl.corr_mrn$C_S + ggtitle("Donor C, stage S"), 
  pl.corr_mrn$D_MM + ggtitle("Donor D, stage MM"),
  pl.corr_mrn$D_B + ggtitle("Donor D, stage B"), 
  pl.corr_mrn$D_PMN + ggtitle("Donor D, stage PMN"), 
  pl.corr_mrn$E_MC + ggtitle("Donor E, stage MC"), 
  legend = 'none')
fig_RepCor_mrn_pre

fig_RepCor_mrn <- annotate_figure(fig_RepCor_mrn_pre,
               top = text_grob("Correlation of technical repeat samples mRNA", color = "black", face = "bold", size = 20))
fig_RepCor_mrn

# one sample with 3 reps
# "G23" "G24" "G25"
triplet_A_MM <- list(
  A_MM_G23.G24 = c('G23', 'G24'),
  A_MM_G23.G25 = c('G23', 'G25'),
  A_MM_G24.G25 = c('G24', 'G25'))


pl.corr_mrn_triplet <- purrr::map(triplet_A_MM, plot_corr_TechReps, exp_cou_log2)

fig_RepCor_mrn_trip_pre <- ggarrange(
  pl.corr_mrn_triplet$A_MM_G23.G24 + ggtitle("Donor A, MM, G23 G24"), 
  pl.corr_mrn_triplet$A_MM_G23.G25 + ggtitle("Donor A, MM, G23 G25"), 
  pl.corr_mrn_triplet$A_MM_G24.G25 + ggtitle("Donor A, MM, G24 G25"), 
  legend = 'none')
fig_RepCor_mrn_trip_pre

# define sample to remove
# toRemove_lowQ <- c('G2b') # included in PCA based outliers!

Outliers

PCA

## more general function save later

# extract data
extract_PC1to4 <- function(res.pca){
  dat.pc12 <- res.pca$x %>% as.data.frame() %>%
    rownames_to_column('id') %>% 
    dplyr::select(id, PC1, PC2, PC3, PC4)
}

# 
prep4_pca <- function(exp, san) {
  exp <- exp[complete.cases(exp),]
  fid <- exp$fid
  exp$fid <- NULL
  sid <- colnames(exp)
  expT <- t(exp)
  expT <- matrix(as.numeric(expT),    # Convert to numeric matrix
                  ncol = ncol(expT))
  expT <- scale(expT, center = T, scale = T)
  colnames(expT) <- fid
  rownames(expT) <- sid
  res.pca <- prcomp(expT)
  dat.pca <- extract_PC1to4(res.pca)
  dat.pca$stage <- san$population_general[match(dat.pca$id, san$id)]
  return(dat.pca)
}

dat.pca.cou_log2 <- prep4_pca(exp_cou_log2, san_mrn_temp)
dat.pca.tpm_log2 <- prep4_pca(exp_tpm_log2, san_mrn_temp)
dat.pca.cou_vsn <- prep4_pca(exp_cou_vsn, san_mrn_temp)
dat.pca.tpm_vsn <- prep4_pca(exp_tpm_vsn, san_mrn_temp)


plot_myPCA <- function(dat.pca){
  dat.pca %>%
    ggplot(aes(x= PC1, y = PC2, color = stage))+
      geom_point(size = 3)+
    scale_color_manual(values = colors_stage)
}
plot_myPCA_labeled <- function(dat.pca){
  dat.pca %>%
    ggplot(aes(x= PC1, y = PC2, color = stage, label = id))+
      geom_point(size = 3)+
      geom_text_repel() +
    scale_color_manual(values = colors_stage)
}

p.pca.cou_lg2 <- plot_myPCA(dat.pca.cou_log2)
p.pca.tpm_lg2 <- plot_myPCA(dat.pca.tpm_log2)
p.pca.cou_vsn <- plot_myPCA(dat.pca.cou_vsn)
p.pca.tpm_vsn <- plot_myPCA(dat.pca.tpm_vsn)


fig_PCA_mrn_pre <- ggarrange(
  p.pca.cou_lg2 + ggtitle("Counts log2"), 
  p.pca.cou_vsn + ggtitle("Counts vsn"),
  p.pca.tpm_lg2 + ggtitle("TPM log2"),
  p.pca.tpm_vsn + ggtitle("TPM vsn"),
  legend = 'none')
fig_PCA_mrn_pre

fig_PCA_mrn <- annotate_figure(fig_PCA_mrn_pre,
               top = text_grob("PCA of mRNA nromalisations", color = "black", face = "bold", size = 20))
fig_PCA_mrn

plot_myPCA_labeled(dat.pca.cou_vsn) +
  ggtitle("PCA of mRNA counts, vsn")

toRemove_mrn_outliers <- c('G2b', 'G7', 'G6', 'G13', 'R1_MM_C')

Final prep

remove outliers

toRemMrn <- c(toRemove_mrn_outliers)
length(toRemMrn) # 5

# san
san_mrn_clean <- san_mrn_temp %>% filter(! id %in% toRemMrn)
nrow(san_mrn_clean) # 44 samples

## exp
# the other normalisations are not needed!

# counts
exp_cou_non_clean <- exp_cou_non[! colnames(exp_cou_non) %in% toRemMrn]
exp_cou_log2_clean <- exp_cou_log2[! colnames(exp_cou_log2) %in% toRemMrn]
exp_cou_vsn_clean <- exp_cou_vsn[! colnames(exp_cou_vsn) %in% toRemMrn]

# tpm
exp_tpm_non_clean <- exp_tpm_non[! colnames(exp_tpm_non) %in% toRemMrn]
exp_tpm_log2_clean <- exp_tpm_log2[! colnames(exp_tpm_log2) %in% toRemMrn]
exp_tpm_vsn_clean <- exp_tpm_vsn[! colnames(exp_tpm_vsn) %in% toRemMrn]

combine & rename samples

# combine tech reps and subpopulations (identified by hand)

# fct to get IDs by Yidi
# 
# get_Rep_ids_mrn_reps <- function(DONOR, STAGE){
#   san_to_combine %>%
#     dplyr::filter(donor == DONOR) %>%
#     dplyr::filter(population_general == STAGE) %>%
#     pull(id)
# }
# 
# donors2comb <- unique(san_to_combine$donor)
# stages2comb <- unique(san_to_combine$population_general)
# ids <- list()
# for(i in donors2comb){
#   for(j in stages2comb){
#     id <- paste(i,j,sep = '_')
#     ids[[id]] <- get_Rep_ids_mrn_reps(i,j)
#   }
# }
# 
# rep_ids2comb <- purrr::compact(ids)
# rep_ids2comb # not perfect bc contains one singleton. But fair enough to work with

# create labels (copy into google sheets. prep there. download and integrate below)
label_generator_mrn_pre <- san_mrn_clean %>% select(id, donor, population_general, population_specific)
# write_clip(label_generator_mrn_pre)


# form medians via function
combine_samples_mrn <- function(exp){
  exp_c <- exp
  ## A
  # B reps in A
  exp_c <- exp_c %>%  
    rowwise() %>%
    dplyr::mutate(B_a1 = median(c(G16, G17), na.rm = T)) %>%
    select(- c(G16, G17))

  # MM triplet
  exp_c <- exp_c %>% 
    rowwise() %>%
    dplyr::mutate(MM_a1 = median(c(G23, G24, G25), na.rm = T)) %>%
    select(- c(G23, G24, G25))
  ## B: none

  ## C
  # B
  exp_c <- exp_c %>% 
    rowwise() %>%
    dplyr::mutate(eB_c = median(c(R1_eB_C, R2_eB_C), na.rm = T)) %>%
    select(- c(R1_eB_C, R2_eB_C))
  # MC
  exp_c <- exp_c %>% 
    rowwise() %>%
    dplyr::mutate(eMC_c = median(c(R2_eMC_C, R1_eMC_C), na.rm = T)) %>%
    select(- c(R2_eMC_C, R1_eMC_C))
  # S
  exp_c <- exp_c %>% 
    rowwise() %>%
    dplyr::mutate(S_c = median(c(R1_S_C, R2_S_C), na.rm = T)) %>%
    select(- c(R1_S_C, R2_S_C))
  ## D
  # B
  exp_c <- exp_c %>% 
    rowwise() %>%
    dplyr::mutate(lB_d = median(c(R1_lB_D, R2_lB_D), na.rm = T)) %>%
    select(- c(R1_lB_D, R2_lB_D))
  # MM
  exp_c <- exp_c %>% 
    rowwise() %>%
    dplyr::mutate(MM_d = median(c(R2_MM_D, R1_MM_D), na.rm = T)) %>%
    select(- c(R2_MM_D, R1_MM_D))
  # PMN
  exp_c <- exp_c %>% 
    rowwise() %>%
    dplyr::mutate(PMN_d = median(c(R2_PMN_D, R1_PMN_D), na.rm = T)) %>%
    select(- c(R2_PMN_D, R1_PMN_D))
  ## E
  # MC
  exp_c <- exp_c %>% 
    rowwise() %>%
    dplyr::mutate(eMC_e = median(c(R1_eMC_E, R2_eMC_E), na.rm = T)) %>%
    select(- c(R1_eMC_E, R2_eMC_E))
  return(exp_c)
}




rename_cols_mrn <- function(exp){
  exp_c <- exp
  exp_c %<>% rename(eMC_b = R1_eMC_B)
  exp_c %<>% rename(lMC_b = R1_lMC_B)
  exp_c %<>% rename(eMC_a = R1_eMC_A)
  exp_c %<>% rename(lMC_a = R1_lMC_A)
  exp_c %<>% rename(B_a2 = R1_B_A)
  exp_c %<>% rename(MB_a = R1_MB_A)
  exp_c %<>% rename(MC_a = G2a)
  exp_c %<>% rename(MM_a2 = R1_MM_A)
  exp_c %<>% rename(PM_a = R1_PM_A)
  exp_c %<>% rename(PMN_a = R1_PMN_A)
  exp_c %<>% rename(S_a = R1_S_A)
  exp_c %<>% rename(B_b = R1_B_B)
  exp_c %<>% rename(MB_b = R1_MB_B)
  exp_c %<>% rename(MM_b = R1_MM_B)
  exp_c %<>% rename(PM_b = R1_PM_B)
  exp_c %<>% rename(PMN_b = R1_PMN_B)
  exp_c %<>% rename(MB_c = R1_MB_C)
  exp_c %<>% rename(PM_c = R1_PM_C)
  exp_c %<>% rename(eMC_d = R1_eMC_D)
  exp_c %<>% rename(B_e = R1_eB_E)
  exp_c %<>% rename(MB_e = R1_MB_E)
  exp_c %<>% rename(MM_e = R1_MM_E)
  exp_c %<>% rename(PM_e = R1_PM_E)
  exp_c %<>% rename(PMN_e = R1_PMN_E)
  exp_c %<>% rename(S_e = R1_S_E)
  return(exp_c)
}

### apply to data

### COUNTS

# non
exp_cou_non_clean_comb <- combine_samples_mrn(exp_cou_non_clean)
exp_cou_non_clean_comb <- rename_cols_mrn(exp_cou_non_clean_comb)
sort(names(exp_cou_non_clean_comb))
sort(names(exp_counts_RAW))

# log2
exp_cou_log2_clean_comb <- combine_samples_mrn(exp_cou_log2_clean)
exp_cou_log2_clean_comb <- rename_cols_mrn(exp_cou_log2_clean_comb)

# vsn
exp_cou_vsn_clean_comb <- combine_samples_mrn(exp_cou_vsn_clean)
exp_cou_vsn_clean_comb <- rename_cols_mrn(exp_cou_vsn_clean_comb)

### TPM

# non
exp_tpm_non_clean_comb <- combine_samples_mrn(exp_tpm_non_clean)
exp_tpm_non_clean_comb <- rename_cols_mrn(exp_tpm_non_clean_comb)
sort(names(exp_tpm_non_clean_comb))

# log2
exp_tpm_log2_clean_comb <- combine_samples_mrn(exp_tpm_log2_clean)
exp_tpm_log2_clean_comb <- rename_cols_mrn(exp_tpm_log2_clean_comb)

# vsn
exp_tpm_vsn_clean_comb <- combine_samples_mrn(exp_tpm_vsn_clean)
exp_tpm_vsn_clean_comb <- rename_cols_mrn(exp_tpm_vsn_clean_comb)

san

# copy to gSheets
#write_clip(san_mrn_clean)
#colnames(exp_tpm_vsn_clean_comb)[-1]
#write_clip(colnames(exp_tpm_vsn_clean_comb)[-1])

san_mrn_pre <- read_csv("/home/isar/Dropbox/scn_moa/data/gnp/mrn/san_mrn_final - Sheet1.csv")

san_mrn_pre %<>% filter(! is.na(label_final))

setdiff(colnames(exp_tpm_vsn_clean_comb), san_mrn_pre$label_final) # OK
setdiff(san_mrn_pre$label_final, colnames(exp_tpm_vsn_clean_comb)) # OK


# Finalize
san_mrn_fin <- san_mrn_pre %>% select(
   id = label_final,
   stage = population_general,
   stage_nr = population_hirachical_number,
   substage = population_specific,
   donor = donor,
   sex = sex,
   age = age
)

# eMC_b missed in labeling. Add
san_mrn_fin[san_mrn_fin$id  == 'eMC_b', 'stage'] <- 'MC'

san_mrn_fin$donor <- tolower(san_mrn_fin$donor)
san_mrn_fin$idI <- make_unique(san_mrn_fin$stage, sep = "_", wrap_in_brackets = FALSE)

filter valid features: min2/group

# align cols in all exp
identical(colnames(exp_cou_non_clean_comb), colnames(exp_cou_vsn_clean_comb)) # need to align!

identical(rownames(exp_cou_non_clean_comb), rownames(exp_cou_vsn_clean_comb)) # OK!
identical(rownames(exp_tpm_non_clean_comb), rownames(exp_cou_vsn_clean_comb)) # OK!

order_colnames <- sort(colnames(exp_cou_non_clean_comb))
order_colnames <- moveMe(order_colnames, "fid first")

exp_cou_non_clean_comb <- exp_cou_non_clean_comb[order_colnames]
exp_cou_log2_clean_comb <- exp_cou_log2_clean_comb[order_colnames]
exp_cou_vsn_clean_comb <- exp_cou_vsn_clean_comb[order_colnames]

exp_tpm_non_clean_comb <- exp_tpm_non_clean_comb[order_colnames]
exp_tpm_log2_clean_comb <- exp_tpm_log2_clean_comb[order_colnames]
exp_tpm_vsn_clean_comb <- exp_tpm_vsn_clean_comb[order_colnames]

identical(colnames(exp_cou_non_clean_comb), colnames(exp_cou_vsn_clean_comb))


# order san by cols
san_mrn_fin <- san_mrn_fin[order(match(san_mrn_fin$id, colnames(exp_tpm_vsn_clean_comb)[-1])),]
identical(san_mrn_fin$id, colnames(exp_tpm_vsn_clean_comb)[-1])



# check NAs
NAinCOU_non <- is.na(exp_cou_non_clean_comb)
NAinCOU_log2 <- is.na(exp_cou_log2_clean_comb)
NAinCOU_vsn <- is.na(exp_cou_vsn_clean_comb)

NAinVSN_non <- is.na(exp_tpm_non_clean_comb)
NAinVSN_log2 <- is.na(exp_tpm_log2_clean_comb)
NAinVSN_vsn <- is.na(exp_tpm_vsn_clean_comb)

identical(NAinCOU_non, NAinCOU_vsn) # OK
identical(NAinCOU_vsn, NAinCOU_log2) # OK

# ALL DATA HAVE THE SAME NAs!

# COUNTS
exp_4filt2_long <- exp_tpm_vsn_clean_comb %>% pivot_longer(!fid, names_to = 'id', values_to = "value")
exp_4filt2_long$stage <- san_mrn_fin$stage[match(exp_4filt2_long$id, san_mrn_fin$id)]

# not NA/group
notNAperGeneANDgroup_final <- exp_4filt2_long %>% 
  group_by(fid, stage) %>% 
  summarise(sum_notNA = sum(!is.na(value)))

# create filter
summary_notNA_final <- notNAperGeneANDgroup_final %>% group_by(fid) %>%
  filter(sum(sum_notNA>1)>1, sum_notNA > 1) %>%
  summarise(tot_valid = n(), valid_groups = str_c(stage, collapse = ';'))
table(summary_notNA_final$tot_valid)


# min2 in all 7 groups = good for DEX
fid_min2INallGroups_final <- summary_notNA_final %>%
  filter(tot_valid == 7) %>% pull(fid)
length(fid_min2INallGroups_final) # 8431


## remove from exp

# COU
exp_cou_non_preF <- exp_cou_non_clean_comb %>% filter(fid %in% fid_min2INallGroups_final)
exp_cou_log2_preF <- exp_cou_log2_clean_comb %>% filter(fid %in% fid_min2INallGroups_final)
exp_cou_vsn_preF <- exp_cou_vsn_clean_comb %>% filter(fid %in% fid_min2INallGroups_final)
# VSN
exp_tpm_non_preF <- exp_tpm_non_clean_comb %>% filter(fid %in% fid_min2INallGroups_final)
exp_tpm_log2_preF <- exp_tpm_log2_clean_comb %>% filter(fid %in% fid_min2INallGroups_final)
exp_tpm_vsn_preF <- exp_tpm_vsn_clean_comb %>% filter(fid %in% fid_min2INallGroups_final)

match fids mrn and pro

check

# load data pro
load(paste0(dir_moa, "/data/final/old/", "pro_prefinal.RData"))

# manually curated IDs to harmonize mRNA and proteins
fan_complete_pre <- read_csv(paste0(dir_moa, "/data/anno/", "fan_complete.csv") )

# setdiff(exp_tpm_vsn_clean_comb$fid, fan_complete_pre$ENSG) # 231 missing
# get annotation and add to master fan

fan

# get fan for fid not in master fan
fan_mrn_missing <- data.frame(ENSG = setdiff(exp_tpm_vsn_clean_comb$fid, fan_complete_pre$ENSG)) # 231

# check for doubles
n_occur_ENSG <- data.frame(table(fan_mrn_missing$ENSG))
n_occur_ENSG[n_occur_ENSG$Freq > 1,] # no doubles
sum(is.na(fan_mrn_missing$ENSG)) # no NA

## main IDs from org.Hs.eg.db
fids_mrn <- AnnotationDbi::select(org.Hs.eg.db, keys= fan_mrn_missing$ENSG, columns= c("SYMBOL", "UNIPROT", "ENTREZID", "GENENAME"), keytype="ENSEMBL")
nrow(fids_mrn)

fids_mrn_temp <- fids_mrn %>%
  group_by(ENSEMBL) %>%
  dplyr::summarize(UNIPROT = paste(UNIPROT, collapse = ", "), .groups = 'drop')
nrow(fids_mrn_temp)


# remove double entries, keep only first
fids_mrn <- fids_mrn %>% distinct(ENSEMBL, .keep_all = T)
fids_mrn %<>% rename(ENSG = ENSEMBL, NAME_PROT = GENENAME)

fids_mrn$UNIPROT_mult <- fids_mrn_temp$UNIPROT[match(fids_mrn$ENSG, fids_mrn_temp$ENSEMBL)]
# nit actually important and only confusing. Keep it simple

nrow(fids_mrn) # 231. ok

# intersect(fids_mrn$UNIPROT, fan_pro_temp$UNIPROT) # no overlaps! good (all new entries = list is clean)

# add fids to fan
names(fan_complete_pre)
names(fids_mrn)

fids_mrn_toAdd <- fids_mrn %>% select(ENSG, UNIPROT, SYMBOL, ENTREZ = ENTREZID)

fan_complete_pro_mrn_PRE <- rbind(fan_complete_pre, fids_mrn_toAdd)

# check for completion
setdiff(exp_cou_non_preF$fid, fan_complete_pro_mrn_PRE$ENSG) # all contained!
setdiff(exp_pro_lg2_preF3$fid, fan_complete_pro_mrn_PRE$UNIPROT) # all contained!
# also quickcheck SCN
exp_scn_vsn_cor <- read_csv(paste0(dir_moa, "/data/scn/pro/final/", "exp_scn_vsn_cor_STRINGENT.csv"))
setdiff(exp_scn_vsn_cor$UNIPROT, fan_complete_pro_mrn_PRE$UNIPROT) # all contained! PERFECT!

consistency check

gg_miss_var(fan_complete_pro_mrn_PRE)

# fan_complete_pro_mrn_PRE %>%
#   group_by(ENSG) %>%
#   miss_var_summary()

# check ENSG
n_occur_ENS <- data.frame(table(fan_complete_pro_mrn_PRE$ENSG))
nrow(n_occur_ENS[n_occur_ENS$Freq > 1,]) # NO doubles
sum(is.na(fan_complete_pro_mrn_PRE$ENSG)) # NO NA

# check SYMBOL
n_occur_SYM <- data.frame(table(fan_complete_pro_mrn_PRE$SYMBOL))
nrow(n_occur_SYM[n_occur_SYM$Freq > 1,]) # NO doubles
sum(is.na(fan_complete_pro_mrn_PRE$SYMBOL)) # 47 NA. Simply copy ENSG
# check
fan_complete_pro_mrn_PRE %>% filter(is.na(SYMBOL)) # RNA genes! Could be interesting...
# copy to google
# fan_complete_pro_mrn_PRE %>% filter(is.na(SYMBOL)) %>% write_clip()

# 
fan_complete_pro_mrn_PRE %>% filter(str_detect(SYMBOL, 'ENSG')) # 682 !!!
# it doesnt make sense to import the 40 I have without checking all of these.
# do on a case by case basis of they pop up as difEx
# dont import for now, just copy ENSG to SYMBOLS

fan_complete_pro_mrn_PRE$SYMBOL <- ifelse(is.na(fan_complete_pro_mrn_PRE$SYMBOL), fan_complete_pro_mrn_PRE$ENSG, fan_complete_pro_mrn_PRE$SYMBOL)

sum(is.na(fan_complete_pro_mrn_PRE$SYMBOL)) # ok
n_occur_SYM <- data.frame(table(fan_complete_pro_mrn_PRE$SYMBOL))
nrow(n_occur_SYM[n_occur_SYM$Freq > 1,]) # no doubles

# UNIPROT
sum(is.na(fan_complete_pro_mrn_PRE$UNIPROT)) # 1263
n_occur_UNI <- data.frame(table(fan_complete_pro_mrn_PRE$UNIPROT))
nrow(n_occur_UNI[n_occur_UNI$Freq > 1,]) # no doubles

# ok to proceed!
fan_fin <- fan_complete_pro_mrn_PRE

# change one UNIPROT ids that is outdated
fan_fin$UNIPROT <- recode(fan_fin$UNIPROT,
              P30042 = 'P0DPI2')

format exp

exp_mrn_cou_non_fin <- transpose_FIDcols(exp_cou_non_preF) # contains NA
exp_mrn_cou_log2_fin <- transpose_FIDcols(exp_cou_log2_preF)
exp_mrn_cou_vsn_fin <- transpose_FIDcols(exp_cou_vsn_preF)

exp_mrn_tpm_non_fin <- transpose_FIDcols(exp_tpm_non_preF) # contains NA
exp_mrn_tpm_log2_fin <- transpose_FIDcols(exp_tpm_log2_preF)
exp_mrn_tpm_vsn_fin <- transpose_FIDcols(exp_tpm_vsn_preF)


# savety check
setdiff(san_mrn_fin$id, rownames(exp_mrn_tpm_vsn_fin)) # all IDs match
setdiff(rownames(exp_mrn_tpm_vsn_fin), san_mrn_fin$id)  # all IDs match

identical(rownames(exp_mrn_tpm_vsn_fin), san_mrn_fin$id) # OK!

setdiff(colnames(exp_mrn_tpm_vsn_fin), fan_fin$ENSG) # all fid in master fan

imputation

Using NIPALS

library(mixOmics)
nrow(exp_mrn_tpm_non_fin) # 34

# impute the TPM vsn values only
# exp_mrn_tpm_vsn_fin_NIP <- impute.nipals(X = exp_mrn_tpm_vsn_fin, ncomp = 34) # ncomp = n samples

# takes very long to compute. use saved data
load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))
exp_mrn_tpm_vsn_fin_NIP <- gnp.mrn$exp_mrn_tpm_vsn_NIPALS

#exp_mrn_tpm_vsn_fin_NIP

Export

gnp.mrn <- list(
  ## final data
  # counts
  exp_mrn_cou_non = exp_mrn_cou_non_fin,
  exp_mrn_cou_lg2 = exp_mrn_cou_log2_fin,
  exp_mrn_cou_vsn = exp_mrn_cou_vsn_fin,
  # TPM
  exp_mrn_tpm_non = exp_mrn_tpm_non_fin,
  exp_mrn_tpm_lg2 = exp_mrn_tpm_log2_fin,
  exp_mrn_tpm_vsn = exp_mrn_tpm_vsn_fin,
  exp_mrn_tpm_vsn_NIPALS = exp_mrn_tpm_vsn_fin_NIP,
  # meta data
  san_mrn = san_mrn_fin,
  levels_mrn = levels_mrn,
  fan = fan_fin,
  ## raw data
  exp_mrn_cou_raw = exp_counts_pre,
  exp_mrn_tpn_raw = exp_tpn_pre,
  san_mrn_raw = san_mrn_temp)

save(
  gnp.mrn,
  file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))