Last update: 23-10-10

Last markdown compiled: 23-10-10

ToDo:

# Chunk defaults: hide warnings, messages and printed results in knitted output
knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE)


# dirs
## Sebastian @Linux
dir_moa <- "/home/isar/Dropbox/scn_moa/"

# data
# NOTE(review): the code further down uses gnp.pro, gnp.mrn and gnp.mic;
# presumably these lists are restored by the three load() calls -- confirm.
load(file = paste0(dir_moa, "/data/final/", "gnp.pro.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.mic.RData"))
# MISSING: SCN!!!

## packages
# NOTE(review): packages_general / packages_comparisons are not defined in this
# file; presumably scn_moa_fct.R defines them -- confirm.
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))

# load required package collections
# library() is used instead of require(): require() only returns FALSE (with a
# warning, which the chunk options above suppress) when a package is missing,
# so the script would fail later at first use. library() stops right here.
invisible(lapply(packages_general, library, character.only = TRUE))
invisible(lapply(packages_comparisons, library, character.only = TRUE))
#sapply(packages_annotation, require, character.only = TRUE)
#sapply(packages_pca, require, character.only = TRUE)
#sapply(packages_normalisation, require, character.only = TRUE)

Build integration sets that combine all three data sets (mrn, mic, pro) as well as pairwise combinations of two. Samples have to be matched across data sets by `san$idI`.

prep

# san tables, one per data set
san_pro <- gnp.pro$san_pro
san_mic <- gnp.mic$san_mic
san_mrn <- gnp.mrn$san_mrn

# fan from the mrn list; passed to makeFIDcols_SYMBOL() to rename the fid
fan <- gnp.mrn$fan

# exp tables: use imputed vsn data for completion.
# NOTE(review): mrn and mic read the *_vsn_NIPALS element while pro reads
# exp_pro_vsn -- confirm this difference is intended.
exp_mic <- gnp.mic$exp_mic_vsn_NIPALS
exp_mrn <- makeFIDcols_SYMBOL(gnp.mrn$exp_mrn_tpm_vsn_NIPALS, fan, "ENSG")
exp_pro <- makeFIDcols_SYMBOL(gnp.pro$exp_pro_vsn, fan, "UNIPROT")

set Is1: all 3 combined

# sample matching 1: idI values present in mrn, mic and pro
ids_s1 <- data.frame(
  id_s1 = Reduce(intersect, list(san_mrn$idI, san_mic$idI, san_pro$idI))
)
nrow(ids_s1) # 17 samples

# look up the data-set specific sample id (san$id) for every shared idI
ids_s1[["id_mrn"]] <- san_mrn[["id"]][match(ids_s1[["id_s1"]], san_mrn[["idI"]])]
ids_s1[["id_pro"]] <- san_pro[["id"]][match(ids_s1[["id_s1"]], san_pro[["idI"]])]
ids_s1[["id_mic"]] <- san_mic[["id"]][match(ids_s1[["id_s1"]], san_mic[["idI"]])]

names(san_mrn)

# san of the set: taken from the mrn san, restricted to the matched samples
san_Is1 <- san_mrn %>%
  filter(id %in% ids_s1$id_mrn) %>%
  dplyr::select(idI, stage, stage_nr, donor)
san_Is1[["stage"]] <- factor(
  san_Is1[["stage"]],
  levels = c("MB", "MC", "MM", "B", "PMN")
)

# For each exp table: keep the matched rows, rename the rows from the
# data-set specific id to the shared idI, then sort rows into san_Is1$idI order.
exp_Is1_mrn <- exp_mrn[rownames(exp_mrn) %in% ids_s1[["id_mrn"]], ]
rownames(exp_Is1_mrn) <-
  ids_s1[["id_s1"]][match(rownames(exp_Is1_mrn), ids_s1[["id_mrn"]])]
exp_Is1_mrn <- exp_Is1_mrn[
  order(match(rownames(exp_Is1_mrn), san_Is1[["idI"]])), ,
  drop = FALSE
]

exp_Is1_mic <- exp_mic[rownames(exp_mic) %in% ids_s1[["id_mic"]], ]
rownames(exp_Is1_mic) <-
  ids_s1[["id_s1"]][match(rownames(exp_Is1_mic), ids_s1[["id_mic"]])]
exp_Is1_mic <- exp_Is1_mic[
  order(match(rownames(exp_Is1_mic), san_Is1[["idI"]])), ,
  drop = FALSE
]

exp_Is1_pro <- exp_pro[rownames(exp_pro) %in% ids_s1[["id_pro"]], ]
rownames(exp_Is1_pro) <-
  ids_s1[["id_s1"]][match(rownames(exp_Is1_pro), ids_s1[["id_pro"]])]
exp_Is1_pro <- exp_Is1_pro[
  order(match(rownames(exp_Is1_pro), san_Is1[["idI"]])), ,
  drop = FALSE
]

# check: san idI and the mrn / mic row names must equal the pro row names
vapply(
  list(san_Is1[["idI"]], rownames(exp_Is1_mrn), rownames(exp_Is1_mic)),
  identical,
  logical(1),
  rownames(exp_Is1_pro)
)

# bundle set Is1
gnp_is1 <- list(
  san_is1 = san_Is1,
  exp_is1_mrn = exp_Is1_mrn,
  exp_is1_mic = exp_Is1_mic,
  exp_is1_pro = exp_Is1_pro
)

set Is2: mrn & pro

# sample matching 2: idI values present in both mrn and pro
ids_s2 <- data.frame(id_s2 = intersect(san_mrn[["idI"]], san_pro[["idI"]]))
nrow(ids_s2) # 24 samples

# look up the data-set specific sample id (san$id) for every shared idI
ids_s2[["id_mrn"]] <- san_mrn[["id"]][match(ids_s2[["id_s2"]], san_mrn[["idI"]])]
ids_s2[["id_pro"]] <- san_pro[["id"]][match(ids_s2[["id_s2"]], san_pro[["idI"]])]

# san of the set: taken from the mrn san, restricted to the matched samples
san_Is2 <- san_mrn %>%
  filter(id %in% ids_s2$id_mrn) %>%
  dplyr::select(idI, stage, stage_nr, donor)
san_Is2[["stage"]] <- factor(
  san_Is2[["stage"]],
  levels = c("MB", "PM", "MC", "MM", "B", "PMN")
)

# For each exp table: keep the matched rows, rename the rows from the
# data-set specific id to the shared idI, then sort rows into san_Is2$idI order.
exp_Is2_mrn <- exp_mrn[rownames(exp_mrn) %in% ids_s2[["id_mrn"]], ]
rownames(exp_Is2_mrn) <-
  ids_s2[["id_s2"]][match(rownames(exp_Is2_mrn), ids_s2[["id_mrn"]])]
exp_Is2_mrn <- exp_Is2_mrn[
  order(match(rownames(exp_Is2_mrn), san_Is2[["idI"]])), ,
  drop = FALSE
]

exp_Is2_pro <- exp_pro[rownames(exp_pro) %in% ids_s2[["id_pro"]], ]
rownames(exp_Is2_pro) <-
  ids_s2[["id_s2"]][match(rownames(exp_Is2_pro), ids_s2[["id_pro"]])]
exp_Is2_pro <- exp_Is2_pro[
  order(match(rownames(exp_Is2_pro), san_Is2[["idI"]])), ,
  drop = FALSE
]

# check: san idI and the mrn row names must equal the pro row names
vapply(
  list(san_Is2[["idI"]], rownames(exp_Is2_mrn)),
  identical,
  logical(1),
  rownames(exp_Is2_pro)
)

# bundle set Is2
gnp_is2 <- list(
  san_is2 = san_Is2,
  exp_is2_mrn = exp_Is2_mrn,
  exp_is2_pro = exp_Is2_pro
)

set Is3: mrn & mic

# sample matching 3: idI values present in both mrn and mic
ids_s3 <- data.frame(id_s3 = intersect(san_mrn[["idI"]], san_mic[["idI"]]))
nrow(ids_s3) # 22 samples

# look up the data-set specific sample id (san$id) for every shared idI
ids_s3[["id_mrn"]] <- san_mrn[["id"]][match(ids_s3[["id_s3"]], san_mrn[["idI"]])]
ids_s3[["id_mic"]] <- san_mic[["id"]][match(ids_s3[["id_s3"]], san_mic[["idI"]])]

# san of the set: taken from the mrn san, restricted to the matched samples
san_Is3 <- san_mrn %>%
  filter(id %in% ids_s3$id_mrn) %>%
  dplyr::select(idI, stage, stage_nr, donor)
san_Is3[["stage"]] <- factor(
  san_Is3[["stage"]],
  levels = c("MB", "MC", "MM", "B", "S", "PMN")
)

# For each exp table: keep the matched rows, rename the rows from the
# data-set specific id to the shared idI, then sort rows into san_Is3$idI order.
exp_Is3_mrn <- exp_mrn[rownames(exp_mrn) %in% ids_s3[["id_mrn"]], ]
rownames(exp_Is3_mrn) <-
  ids_s3[["id_s3"]][match(rownames(exp_Is3_mrn), ids_s3[["id_mrn"]])]
exp_Is3_mrn <- exp_Is3_mrn[
  order(match(rownames(exp_Is3_mrn), san_Is3[["idI"]])), ,
  drop = FALSE
]

exp_Is3_mic <- exp_mic[rownames(exp_mic) %in% ids_s3[["id_mic"]], ]
rownames(exp_Is3_mic) <-
  ids_s3[["id_s3"]][match(rownames(exp_Is3_mic), ids_s3[["id_mic"]])]
exp_Is3_mic <- exp_Is3_mic[
  order(match(rownames(exp_Is3_mic), san_Is3[["idI"]])), ,
  drop = FALSE
]

# check: the mrn and mic row names must equal the san idI
vapply(
  list(rownames(exp_Is3_mrn), rownames(exp_Is3_mic)),
  identical,
  logical(1),
  san_Is3[["idI"]]
)

# bundle set Is3
gnp_is3 <- list(
  san_is3 = san_Is3,
  exp_is3_mrn = exp_Is3_mrn,
  exp_is3_mic = exp_Is3_mic
)

set Is4: pro & mic

# sample matching 4: pro, mic DISCONTINUED
# idI values present in both pro and mic
ids_s4 <- data.frame(id_s4 = intersect(san_pro$idI, san_mic$idI))
length(ids_s4$id_s4) # 17 samples

# look up the data-set specific sample id (san$id) for every shared idI
ids_s4$id_mic <- san_mic$id[match(ids_s4$id_s4, san_mic$idI)]
ids_s4$id_pro <- san_pro$id[match(ids_s4$id_s4, san_pro$idI)]

# san of the set: taken from the mic san (mrn is not part of this set)
san_Is4 <- san_mic %>%
  filter(id %in% ids_s4$id_mic) %>%
  dplyr::select(idI, stage, stage_nr, donor)
san_Is4$stage <- factor(san_Is4$stage, levels = c('MB', 'MC', 'MM', 'B', 'PMN'))

# For each exp table: keep the matched rows, rename the rows from the
# data-set specific id to the shared idI, then sort rows into san_Is4$idI order.
exp_Is4_mic <- exp_mic[rownames(exp_mic) %in% ids_s4$id_mic, ]
rownames(exp_Is4_mic) <- ids_s4$id_s4[match(rownames(exp_Is4_mic), ids_s4$id_mic)]
exp_Is4_mic <- exp_Is4_mic[order(match(rownames(exp_Is4_mic), san_Is4$idI)), , drop = FALSE]

exp_Is4_pro <- exp_pro[rownames(exp_pro) %in% ids_s4$id_pro, ]
rownames(exp_Is4_pro) <- ids_s4$id_s4[match(rownames(exp_Is4_pro), ids_s4$id_pro)]
exp_Is4_pro <- exp_Is4_pro[order(match(rownames(exp_Is4_pro), san_Is4$idI)), , drop = FALSE]

# check: san idI and the mic row names must equal the pro row names
sapply(list(san_Is4$idI, rownames(exp_Is4_mic)), FUN = identical, rownames(exp_Is4_pro))

# bundle set Is4
# FIX: the san element previously stored san_Is2 (the mrn & pro set), which
# does not correspond to the rows of exp_Is4_mic / exp_Is4_pro.
# Element names keep their original capitalisation so existing consumers of
# gnp_is4 keep working.
gnp_is4 <- list(
  san_Is4 = san_Is4,
  exp_Is4_mic = exp_Is4_mic,
  exp_Is4_pro = exp_Is4_pro)

fid match mrn & pro

# feature matching mrn and pro: column names present in both exp tables
fid_intersect_pro_mrn <- intersect(colnames(exp_mrn), colnames(exp_pro))
length(fid_intersect_pro_mrn) # 2566 matches

# check matches between coding genes: have SYMBOL and not ENSG (copied for missing)
# i.e. keep the column names that do not contain the string "ENSG"
all_fid_mrn <- colnames(exp_mrn)
coding_fid_mrn <- all_fid_mrn[!str_detect(all_fid_mrn, "ENSG")]
length(coding_fid_mrn)
length(all_fid_mrn)
100 * length(coding_fid_mrn) / length(all_fid_mrn) # 96%

coding_fid_pro <- colnames(exp_pro)[!str_detect(colnames(exp_pro), "ENSG")]
length(coding_fid_pro)
ncol(exp_pro)
100 * length(coding_fid_pro) / ncol(exp_pro) # 100% perfect


# compare coding fids

# print the fid vectors that go into the comparison
coding_fid_mrn
coding_fid_pro

# named list of fid sets for the UpSet plot
FIDsLIST <- list(
  coding_fid_mrn = coding_fid_mrn,
  coding_fid_pro = coding_fid_pro,
  all_fid_mrn = all_fid_mrn
)

library(UpSetR)

# set-overlap plot, intersections ordered by frequency
UpSetR::upset(UpSetR::fromList(FIDsLIST), order.by = "freq")

set If1: mrn & pro gnp

# san are the original san (no sample matching in this set)
san_mrn
san_pro

# exp are reduced to intersecting features
fid_intersect_pro_mrn

# Column subsetting by logical mask: each table keeps its own column order, so
# the two reduced tables are not reordered to a common feature order here.
keep_cols_mrn <- colnames(exp_mrn) %in% fid_intersect_pro_mrn
exp_If1_mrn <- exp_mrn[, keep_cols_mrn]
rownames(exp_If1_mrn) # ok

keep_cols_pro <- colnames(exp_pro) %in% fid_intersect_pro_mrn
exp_If1_pro <- exp_pro[, keep_cols_pro]
rownames(exp_If1_pro) # ok

# bundle set If1
gnp_if1 <- list(
  san_mrn = san_mrn,
  san_pro = san_pro,
  exp_If1_mrn = exp_If1_mrn,
  exp_If1_pro = exp_If1_pro
)

SCN integration

PENDING…

Export

# Collect all integration sets in one named list
gnp.Isets <- setNames(
  list(gnp_is1, gnp_is2, gnp_is3, gnp_is4, gnp_if1),
  c("gnp_is1", "gnp_is2", "gnp_is3", "gnp_is4", "gnp_if1")
)

# Write the collection next to the input data as gnp.Isets.RData
save(
  list = "gnp.Isets",
  file = paste0(dir_moa, "/data/final/", "gnp.Isets.RData")
)