Last update: 23-10-13

Last markdown compiled: 23-10-13

ToDo: - format nicely

knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE) 


# dirs
## Sebastian @Linux
dir_moa <- "/home/isar/Dropbox/scn_moa/"

# data loaded below 

## packages
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))




# data: not yet applicable here, will be created at end

# load required package collections
sapply(packages_general, require, character.only = TRUE)
sapply(packages_annotation, require, character.only = TRUE)
sapply(packages_pca, require, character.only = TRUE)
sapply(packages_normalisation, require, character.only = TRUE)

Import

# import: raw counts and tpn
exp_mic_counts_raw <- read_delim(paste0(dir_moa, "/data/gnp/microRNA/original/", "featureCounts.miRNA"), delim = "\t", escape_double = FALSE, trim_ws = TRUE, skip = 1)

info_mic_counts <- read_delim(paste0(dir_moa, "/data/gnp/microRNA/original/",  "featureCounts.miRNA.summary"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)

san_mic_raw <- read_csv(paste0(dir_moa, "/data/gnp/microRNA/original/", "san_miRNA_raw.csv") )
san_mic_raw %<>% filter(type == "miRNA")
san_mic_raw$id <- gsub("-", "_", san_mic_raw$name)

# divide data by content
pan_mic_raw <- exp_mic_counts_raw %>% select('fid' = Geneid, Chr, Start, End, Strand, Length)
exp_mic_temp <- exp_mic_counts_raw %>% select('fid' = Geneid, 7:35)

# format ids
names(exp_mic_temp) <- gsub("-", "_", names(exp_mic_temp))
names(exp_mic_temp) <- gsub(".bam", "", names(exp_mic_temp))

# make numeric
dim(exp_mic_temp)
exp_mic_temp[,2:30] <- sapply(exp_mic_temp[,2:30], as.numeric)
names(exp_mic_temp)


# # transpose (not sure if needed)
# #transpose_mic <- function(df){
#   fid <- df$HSA
#   df$HSA <- NULL
#   df_t <- as.data.frame(t(df))
#   colnames(df_t) <- fid
#   return(df_t)
# }
# #exp_mic_temp_t <- transpose_mic(exp_mic_temp)

san_mic_temp <- as.data.frame(san_mic_raw)
san_mic_temp %<>% filter(id %in% colnames(exp_mic_temp)[-1])

san_mic_temp <-  san_mic_temp %>% dplyr::arrange(factor(san_mic_temp$id, levels = colnames(exp_mic_temp)[-1]))

colors & levels

# colors of stages
colors_stage <- rev(c(
  "#c8a600",
  "#ffac40",
  "#ff7763",
  "#ca0068",
  "#a80092",
  "#a043ec",
  "#0000e3"))
names(colors_stage) <- c( "MB", "PM", "MC", "MM", "B", "S", "PMN")

# levels
levels_mic = c('MB', 'MC', 'MM', 'B', 'S', 'PMN')

Filter insignificantly expressed features

exclude low counts and <3 values/group

counts <10

# copy
exp_mic_10 <- exp_mic_temp
# set all values below threshold (10) NA
exp_mic_10[exp_mic_10<10] = NA

# check NAs per feature and group
exp_mic_10_long <- exp_mic_10 %>% 
  pivot_longer(!fid, names_to = 'id', values_to = "value")
exp_mic_10_long$pop_gen <- san_mic_temp$population_general[match(exp_mic_10_long$id, san_mic_temp$id)]

# not NA/group
notNAperGeneANDgroup_mic <- exp_mic_10_long %>% 
  group_by(fid, pop_gen) %>% 
  summarise(sum_notNA = sum(!is.na(value)))

# create filter
summary_notNA_mic <- notNAperGeneANDgroup_mic %>% group_by(fid) %>%
  filter(sum(sum_notNA>1)>1, sum_notNA > 1) %>%
  summarise(tot_valid = n(), valid_groups = str_c(pop_gen, collapse = ';'))
table(summary_notNA_mic$tot_valid)

# min2 in all 7 groups = good for DEX
fid_min2INallGroups_mic <- summary_notNA_mic %>%
  filter(tot_valid == 6) %>% pull(fid)
length(fid_min2INallGroups_mic) # 285 


# 
exp_mic_all <- exp_mic_temp # all features
nrow(exp_mic_all)

exp_mic_filtered <- exp_mic_10 %>% filter(fid %in% fid_min2INallGroups_mic)

# nrow(exp_mic_filtered) # 285

Histograms

Try histrograms of all data

colors_ids_mic <- san_mic_temp %>% select(id, population_general)
colors_ids_mic$color <- colors_stage[match(colors_ids_mic$population_general, names(colors_stage))]
colors_ids_mic$population_general <- NULL

col_ids_mic <- as.vector(colors_ids_mic$color)
names(col_ids_mic) <- colors_ids_mic$id


# prep long
exp_mic_long <- exp_mic_filtered %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')

exp_mic_long$stage <- san_mic_temp$population_general[match(exp_mic_long$id, san_mic_temp$id)]


exp_mic_long %>%
  ggplot(aes(x = value, color = id)) +
    geom_density(na.rm = T, bw = 'SJ') +
    scale_x_continuous(trans='log2') +
    scale_color_manual(values = col_ids_mic) +
    theme(legend.position = "none") +
    ggtitle("Density microRNA")

Sequencing QCs

names(san_mic_temp)

dat.qc.mic <- san_mic_temp %>% dplyr::select(id, stage = population_general, reads, mbases_yield, mean_quality, bases_over_30)

dat.qc.mic$stage <- factor(dat.qc.mic$stage, levels = levels_mic)
dat.qc.mic$stageNUM <- as.numeric(dat.qc.mic$stage)

# reads
dat.qc.mic %>%
  ggplot(aes(x = reorder(id, stageNUM), y = reads, fill = stage)) +
  geom_bar(stat = 'identity') +
  scale_fill_manual(values = colors_stage) +
  theme(axis.text.x = element_text(angle=30, hjust=1)) +
  ggtitle("Total reads per sample in microRNA")

# MB yield
dat.qc.mic %>%
  ggplot(aes(x = reorder(id, stageNUM), y = mbases_yield, fill = stage)) +
  geom_bar(stat = 'identity') +
  scale_fill_manual(values = colors_stage) +
  theme(axis.text.x = element_text(angle=30, hjust=1)) +
  ggtitle("MByields per sample in microRNA")

# Mean quality
dat.qc.mic %>%
  ggplot(aes(x = reorder(id, stageNUM), y = mean_quality, fill = stage)) +
  geom_bar(stat = 'identity') +
  scale_fill_manual(values = colors_stage) +
  theme(axis.text.x = element_text(angle=30, hjust=1)) +
  ggtitle("Mean quality per sample in microRNA")

# Bases over 30
dat.qc.mic %>%
  ggplot(aes(x = reorder(id, stageNUM), y = bases_over_30, fill = stage)) +
  geom_bar(stat = 'identity') +
  scale_fill_manual(values = colors_stage) +
  theme(axis.text.x = element_text(angle=30, hjust=1)) +
  ggtitle("Bases over 30 per sample in microRNA")

Normalisation

eSet

Used for VSN

# create non
exp_mic_non <- exp_mic_filtered

# copy exp

FIDtoRow <- function(df){
  df <- as.data.frame(df)
  rownames(df) <- df$fid
  df$fid <- NULL
  return(df)
}
exp_4eSet_mic <- FIDtoRow(exp_mic_non)

# VSN needs counts, not log2! Not sure about the format of the data. They look a bit different.

# match san order
san_mic_4eSet <- san_mic_temp[match(names(exp_4eSet_mic), san_mic_temp$id),]
san_mic_4eSet <-  as.data.frame(san_mic_4eSet)
rownames(san_mic_4eSet) <- san_mic_4eSet$id

# turn into matrix
df2m <- function(X) {
  if (!methods::is(X, "matrix")) {
    m <- as.matrix(X[, which(vapply(X, is.numeric, logical(1)))])
  }
  else {
    m <- X
  }
  m
}

mx_4eSet_mic_non <- df2m(exp_4eSet_mic)
mx_4eSet_mic_log2 <- log2(mx_4eSet_mic_non)

# create pheno
pheno_mic <- new("AnnotatedDataFrame", data = san_mic_4eSet)

# compile
eSet_mic_non <- ExpressionSet (
      assayData = mx_4eSet_mic_non, #exprs
      phenoData = pheno_mic) #pData

VSN norm

https://www.bioconductor.org/packages/devel/bioc/vignettes/vsn/inst/doc/A-vsn.html#sec:miss

https://academic.oup.com/bioinformatics/article/18/suppl_1/S96/231881

https://www.embl.org/groups/huber/ (first author on VSN paper)

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2680204/

# normalize (uses eSet with raw data)
eSet_mic_vsn = justvsn(eSet_mic_non)


# extract
exp_mic_vsn <- exprs(eSet_mic_vsn)
exp_mic_vsn <- as.data.frame(exp_mic_vsn)
exp_mic_vsn$fid <- rownames(exp_mic_vsn)

QN (quantile normalisation)

https://davetang.org/muse/2014/07/07/quantile-normalisation-in-r/

DISCARDED!

# # counts
# exp_mic_qn  <-  normalize.quantiles(exp_4eSet_mic)
# exp_mic_qn <- as.data.frame(exp_mic_qn)
# rownames(exp_mic_qn) <- rownames(exp_4eSet_mic)
# colnames(exp_mic_qn) <- colnames(exp_4eSet_mic)
# exp_mic_qn$fid <- rownames(exp_mic_qn) 

MN (median normalisation)

DISCARDED

# 
# exp_mic_mn <- normalizeMedianValues(exp_4eSet_mic) # limma
# exp_mic_mn <- as.data.frame(exp_mic_mn)
# exp_mic_mn$fid <- rownames(exp_mic_mn)
# colnames(exp_mic_mn)

log2

exp_mic_log2 <- mx_4eSet_mic_log2
exp_mic_log2 <- as.data.frame(exp_mic_log2)
exp_mic_log2$fid <- rownames(exp_mic_log2)

Analyze norm. effect

Histogram

san_mic_temp

plot_sample_density_log <- function(exp, san, LEVELS, col_ids, title) {
  exp_long <- exp %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
  exp_long$stage <- san$population_general[match(exp_long$id, san$id)]
  exp_long$stage <- factor(exp_long$stage, levels = LEVELS)
  exp_long$donor <- san$donor[match(exp_long$id, san$id)]
  exp_long %>%
    #filter(donor == 'B') %>%
    ggplot(aes(x = value, color = id)) +
      geom_density(na.rm = T, bw = 'SJ') +
      scale_x_continuous(trans='log2') +
      scale_color_manual(values = col_ids) +
      theme(legend.position = "none") +
      ggtitle(title) +
      scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),
              labels = trans_format("log10", math_format(10^.x))) 
}

plot_sample_density_NOlog <- function(exp, san, LEVELS, col_ids, title) {
  exp_long <- exp %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
  exp_long$stage <- san$population_general[match(exp_long$id, san$id)]
  exp_long$stage <- factor(exp_long$stage, levels = LEVELS)
  exp_long$donor <- san$donor[match(exp_long$id, san$id)]
  exp_long %>%
    #filter(donor == 'B') %>%
    ggplot(aes(x = value, color = id)) +
      geom_density(na.rm = T, bw = 'SJ') +
      #scale_x_continuous(trans='log2') +
      scale_color_manual(values = col_ids) +
      theme(legend.position = "none") +
      ggtitle(title)
}

p.density.mic_log2 <- plot_sample_density_NOlog(exp_mic_log2, san_mic_temp, levels_mic, col_ids_mic, "Density mic log2")

p.density.mic_vsn <- plot_sample_density_NOlog(exp_mic_vsn, san_mic_temp, levels_mic, col_ids_mic, "Density mic vsn")



fig_hist_mic_norm_pre <- ggarrange(
    p.density.mic_log2,
    p.density.mic_vsn,
    ncol = 1,
    nrow = 2)
fig_hist_mic_norm_pre

fig_hist_mic_norm <- annotate_figure(fig_hist_mic_norm_pre,
               top = text_grob("Normalisation effects on microRNA", color = "black", face = "bold", size = 18))
fig_hist_mic_norm

Boxplots

pepExp_4box <- function(exp, san){
  exp_long <- exp %>% pivot_longer(! fid, names_to = 'id', values_to = "value")
  exp_long$stage <- san$population_general[match(exp_long$id, san$id)]
  exp_long$stage <- factor(exp_long$stage , levels = c('MB', 'PM', 'MC', 'MM', 'B', 'S', "PMN"))
  return(exp_long)
}

exp_mic_log2_long <- pepExp_4box(exp_mic_log2, san_mic_temp)
exp_mic_vsn_long <- pepExp_4box(exp_mic_vsn, san_mic_temp)
#exp_mic_mn_long <- pepExp_4box(exp_mic_mn, san_mic_temp)
#exp_mic_qn_long <- pepExp_4box(exp_mic_qn, san_mic_temp)

plot_box_4normCheck <- function(exp_long, title){
  exp_long %>%
    arrange(stage) %>%
    ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
    geom_boxplot() +
  #theme(axis.text.x=element_text(angle=45, hjust=1)) +
  #coord_flip() +
    ggtitle(title) +
    xlab('Sample')+
    ylab("Expression")+
    theme(axis.text.x=element_text(angle=45, hjust=1),
          axis.text=element_text(size=7)) +
    scale_fill_manual(values = colors_stage)
}

plot_box_4normCheck_logY <- function(exp_long, title){
  exp_long %>%
    arrange(stage) %>%
    ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
    geom_boxplot() +
  #theme(axis.text.x=element_text(angle=45, hjust=1)) +
  #coord_flip() +
    ggtitle(title) +
    xlab('Sample')+
    ylab("Expression")+
    theme(axis.text.x=element_text(angle=45, hjust=1),
          axis.text=element_text(size=7)) +
    scale_fill_manual(values = colors_stage) +
    scale_y_continuous(trans='log2')
}

p.box.mic_log2 <- plot_box_4normCheck(exp_mic_log2_long, "mic log2")
p.box.mic_vsn <- plot_box_4normCheck(exp_mic_vsn_long, "mic vsn")
#p.box.mic_mn <-  plot_box_4normCheck_logY(exp_mic_mn_long, "mic mn")
#p.box.mic_qn <-  plot_box_4normCheck_logY(exp_mic_qn_long, "mic qn")


fig_boxes_mic_pre <- ggarrange(
  p.box.mic_log2, 
  p.box.mic_vsn,
  legend = 'none')
fig_boxes_mic_pre

fig_boxes_mic <- annotate_figure(fig_boxes_mic_pre,
               top = text_grob("Expression per sample in mic", color = "black", face = "bold", size = 20))
fig_boxes_mic

Replicates

How do the technical repeat samples behave? Samples were distributed in aliquots at the date of collection.

# isolate techReps
san_mic_techRep <- san_mic_temp %>% filter(has_tech_rep == T)

# all tech Reps are the same subpopulation!

# fct to get IDs by Yidi
get_Rep_ids_mic <- function(DONOR, STAGE){
  san_mic_techRep %>%
    dplyr::filter(donor == DONOR) %>%
    dplyr::filter(population_general == STAGE) %>%
    pull(id)
}

donors_mic <- unique(san_mic_techRep$donor)
stages_mic <- unique(san_mic_techRep$population_general)
ids <- list()
for(i in donors_mic){
  for(j in stages_mic){
    id <- paste(i,j,sep = '_')
    ids[[id]] <- get_Rep_ids_mic(i,j)
  }
}

rep_ids_mic <- purrr::compact(ids)
rep_ids_mic # 3 triplets, one single. Do manually


# plots
plot_corr_TechReps <- function(samples, data){
  data %>% dplyr::select(all_of(samples)) %>%
    rename(Rep1 = 1, Rep2 = 2) %>%
    ggplot(aes(x = Rep1, y = Rep2)) +
    geom_point() +
    stat_cor(method = "spearman", size = 3, na.rm = T)
}


# one sample with 3 reps
# "G23" "G24" "G25"
triplets_mic <- list(
  C_lB_1 = c('Rm1_LB_C', 'Rm2_LB_C'),
  C_lB_2 = c('Rm1_LB_C', 'Rm3_LB_C'),
  D_PMN_1 = c('Rm1_PMN_D', 'Rm2_PMN_D'),
  D_PMN_2 = c('Rm1_PMN_D', 'Rm3_PMN_D'))


pl.corr_mic_triplet <- purrr::map(triplets_mic, plot_corr_TechReps, exp_mic_log2)

fig_RepCor_mic_trip_pre <- ggarrange(
  pl.corr_mic_triplet$C_lB_1 + ggtitle("Donor C, B, 1vs2"), 
  pl.corr_mic_triplet$C_lB_2 + ggtitle("Donor C, B, 1vs3"), 
  pl.corr_mic_triplet$D_PMN_1 + ggtitle("Donor D, PMN, 1vs2"),
  pl.corr_mic_triplet$D_PMN_2 + ggtitle("Donor D, PMN, 1vs3"),
  legend = 'none')
fig_RepCor_mic_trip_pre

# define sample to remove
# none!

# combine all replicates

Outliers

PCA

# extract data
extract_PC1.2 <- function(res.pca){
  dat.pc12 <- res.pca$x %>% as.data.frame() %>%
    rownames_to_column('id') %>% 
    dplyr::select(id, PC1, PC2)
}

prep4_pca <- function(exp, san) {
  exp <- exp[complete.cases(exp),]
  fid <- exp$fid
  exp$fid <- NULL
  sid <- colnames(exp)
  expT <- t(exp)
  expT <- matrix(as.numeric(expT),    # Convert to numeric matrix
                  ncol = ncol(expT))
  expT <- scale(expT, center = T, scale = T)
  colnames(expT) <- fid
  rownames(expT) <- sid
  res.pca <- prcomp(expT)
  dat.pca <- extract_PC1.2(res.pca)
  dat.pca$stage <- san$population_general[match(dat.pca$id, san$id)]
  return(dat.pca)
}

san_mic_temp$population_general <- factor(san_mic_temp$population_general, levels = levels_mic)

# PCA unlabeled
dat.pca.mic_log2 <- prep4_pca(exp_mic_log2, san_mic_temp)
dat.pca.mic_vsn <- prep4_pca(exp_mic_vsn, san_mic_temp)



plot_myPCA <- function(dat.pca, title){
  dat.pca %>%
    ggplot(aes(x= PC1, y = PC2, color = stage))+
      geom_point(size = 3)+
    scale_color_manual(values = colors_stage)+
    ggtitle(title)
}

plot_myPCA_labeled <- function(dat.pca){
  dat.pca %>%
    ggplot(aes(x= PC1, y = PC2, color = stage, label = id))+
      geom_point(size = 3)+
      geom_text_repel() +
    scale_color_manual(values = colors_stage)
}

p.pca.mic_log2 <- plot_myPCA(dat.pca.mic_log2, 'log2')
p.pca.mic_vsn <- plot_myPCA(dat.pca.mic_vsn, 'vsn')


fig_pca_mic_pre <- ggarrange(
  p.pca.mic_log2, 
  p.pca.mic_vsn,
  legend = 'none')
fig_pca_mic_pre

fig_pca_mic <- annotate_figure(fig_pca_mic_pre,
               top = text_grob("PCAs of normalisations in mic", color = "black", face = "bold", size = 20))
fig_pca_mic

plot_myPCA_labeled(dat.pca.mic_vsn) + ggtitle("PCA of microRNA, log2")

# No outliers to remove
toRemove_mic_outliers <- c('Rm_eB_C')

Final prep

remove unwanted samples

# san
san_mic_clean <- san_mic_temp %>% filter(! id %in% toRemove_mic_outliers)
nrow(san_mic_clean) # 28 samples
nrow(san_mic_temp) # 29. OK!

## exp

exp_mic_log2_clean <- exp_mic_log2[! colnames(exp_mic_log2) %in% toRemove_mic_outliers]
exp_mic_vsn_clean <- exp_mic_vsn[! colnames(exp_mic_vsn) %in% toRemove_mic_outliers]

combine & rename samples

# form medians via function
combine_samples_mic <- function(exp){
  exp_c <- exp
  ## A: none
  ## C
  #B
  exp_c <- exp_c %>% 
    rowwise() %>%
    dplyr::mutate(lB_c = median(c(Rm1_LB_C, Rm2_LB_C, Rm3_LB_C), na.rm = T)) %>%
    select(- c(Rm1_LB_C, Rm2_LB_C, Rm3_LB_C))
  ## D
  # PMN
  exp_c <- exp_c %>% 
    rowwise() %>%
    dplyr::mutate(PMN_d = median(c(Rm1_PMN_D, Rm2_PMN_D, Rm3_PMN_D), na.rm = T)) %>%
    select(- c(Rm1_PMN_D, Rm2_PMN_D, Rm3_PMN_D))
  # export
  return(exp_c)
}

exp_mic_log2_clean_comb <- combine_samples_mic(exp_mic_log2_clean)
exp_mic_vsn_clean_comb <- combine_samples_mic(exp_mic_vsn_clean)

# sort(colnames(exp_mic_log2_clean_comb))



rename_cols_mic <- function(exp){
  exp_c <- exp
  exp_c %<>% rename(eMC_a = mR_eMC_A)
  exp_c %<>% rename(lMC_a = mR_lMC_A)
  exp_c %<>% rename(B_a = mR_B_A)
  exp_c %<>% rename(lMC_b = mR_LMC_B)
  exp_c %<>% rename(MB_a = mR_MB_A)
  exp_c %<>% rename(MB_e = mR_MB_E)
  exp_c %<>% rename(MM_a = mR_MM_A)
  exp_c %<>% rename(MM_b = mR_MM_B)
  exp_c %<>% rename(PMN_b = mR_PMN_B)
  exp_c %<>% rename(PMN_c = mR_PMN_C)
  exp_c %<>% rename(S_a = mR_S_A)
  exp_c %<>% rename(PMN_a = mR1_PMN_A)
  exp_c %<>% rename(eMC_c = Rm_eMC_C)
  exp_c %<>% rename(eMC_d = Rm_eMC_D)
  exp_c %<>% rename(lB_d = Rm_lB_D)
  exp_c %<>% rename(MM_c = Rm_MM_C)
  exp_c %<>% rename(MM_d = Rm_MM_D)
  exp_c %<>% rename(MM_e = Rm_MM_E)
  exp_c %<>% rename(PMN_e = Rm_PMN_E)
  exp_c %<>% rename(S_c = Rm_S_C)
  exp_c %<>% rename(S_e = Rm_S_E)
  exp_c %<>% rename(eB_e = Rm_eB_E)
  return(exp_c)
}
  
# exec
exp_mic_log2_preF <- rename_cols_mic(exp_mic_log2_clean_comb)
exp_mic_vsn_preF <- rename_cols_mic(exp_mic_vsn_clean_comb)

sort(colnames(exp_mic_log2_preF))
sort(colnames(exp_mic_vsn_preF))

# order cols
order_colnames_mic <- sort(colnames(exp_mic_log2_preF))
order_colnames_mic <- moveMe(order_colnames_mic, "fid first")

exp_mic_log2_preF <- exp_mic_log2_preF[order_colnames_mic]
exp_mic_vsn_preF <- exp_mic_vsn_preF[order_colnames_mic]

san

# copy to gSheets
# write_clip(san_mic_clean)
#colnames(exp_tpm_vsn_clean_comb)[-1]
#write_clip(colnames(exp_tpm_vsn_clean_comb)[-1])

san_mic_preF <- read_csv("/home/isar/Dropbox/scn_moa/data/gnp/microRNA/san_mic_final - Sheet1.csv")
san_mic_preF %<>% filter(! is.na(label_final))

setdiff(colnames(exp_mic_log2_preF), san_mic_preF$label_final) # OK
setdiff(san_mic_preF$label_final, colnames(exp_mic_log2_preF)) # OK

## add sex and age
# load mRNA data
load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))

san_mrn <- gnp.mrn$san_mrn

san_mic_preF$donor <- tolower(san_mic_preF$donor)
san_mic_preF$sex <- san_mrn$sex[match(san_mic_preF$donor, san_mrn$donor)]
san_mic_preF$age <- san_mrn$age[match(san_mic_preF$donor, san_mrn$donor)]

# Finalize
san_mic_fin <- san_mic_preF %>% select(
   id = label_final,
   stage = population_general,
   stage_nr = population_hirachical_number,
   donor = donor,
   sex = sex,
   age = age
)

san_mic_fin$idI <- make_unique(san_mic_fin$stage, sep = "_", wrap_in_brackets = FALSE)

san_mic_fin <- san_mic_fin[order(match(san_mic_fin$id, colnames(exp_mic_log2_preF)[-1])),]
identical(san_mic_fin$id, colnames(exp_mic_log2_preF)[-1]) # oK!

# sequencing details
mic_sequencing_details <- san_mic_preF # is also san raw? no thats the temp bc of deleted samples

NA check

# check NAs
NAinlg2_mic <- is.na(exp_mic_log2_preF)
NAinVSN_mic <- is.na(exp_mic_vsn_preF)

identical(NAinlg2_mic, NAinVSN_mic) # OK

# ALL DATA HAVE THE SAME NAs!

# COUNTS
exp_4filt2_long_mic <- exp_mic_log2_preF %>% pivot_longer(!fid, names_to = 'id', values_to = "value")
exp_4filt2_long_mic$stage <- san_mic_fin$stage[match(exp_4filt2_long_mic$id, san_mic_fin$id)]

# not NA/group
notNAperGeneANDgroup_final_mic <- exp_4filt2_long_mic %>% 
  group_by(fid, stage) %>% 
  summarise(sum_notNA = sum(!is.na(value)))

# create filter
summary_notNA_final_mic <- notNAperGeneANDgroup_final_mic %>% group_by(fid) %>%
  filter(sum(sum_notNA>1)>1, sum_notNA > 1) %>%
  summarise(tot_valid = n(), valid_groups = str_c(stage, collapse = ';'))
table(summary_notNA_final_mic$tot_valid) # 2 need to be excluded


# min2 in all 7 groups = good for DEX
fid_min2INallGroups_final_mic <- summary_notNA_final_mic %>%
  filter(tot_valid == 6) %>% pull(fid)
length(fid_min2INallGroups_final_mic) # 283


## remove from exp
exp_mic_lg2_finPRE <- exp_mic_log2_preF %>% filter(fid %in% fid_min2INallGroups_final_mic)
exp_mic_vsn_finPRE <- exp_mic_vsn_preF %>% filter(fid %in% fid_min2INallGroups_final_mic)

format exp

exp_mic_lg2_finT <- transpose_FIDcols(exp_mic_lg2_finPRE)
exp_mic_vsn_finT <- transpose_FIDcols(exp_mic_vsn_finPRE)

Imputation

Using NIPALS

library(mixOmics)
nrow(exp_mic_vsn_finT) # 24

exp_mic_vsn_finT_NIPALS <- impute.nipals(X = exp_mic_vsn_finT, ncomp = 24)

Export

# last check
identical(san_mic_fin$id, rownames(exp_mic_lg2_finT)) # oK!

gnp.mic <- list(
  ## final data
  exp_mic_lg2 = exp_mic_lg2_finT,
  exp_mic_vsn = exp_mic_vsn_finT,
  exp_mic_vsn_NIPALS = exp_mic_vsn_finT_NIPALS,
  # meta data
  san_mic = san_mic_fin,
  levels_mic = levels_mic,
  mic_sequencing_details = mic_sequencing_details,
  ## raw data
  exp_mic_raw = exp_mic_temp,
  san_mic_raw = san_mic_temp)

save(
  gnp.mic,
  file = paste0(dir_moa, "/data/final/", "gnp.mic.RData"))