Last update: 23-10-13

Last markdown compiled: 23-10-13

ToDo:

knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE) 


# dirs
## Sebastian @Linux
dir_moa <- "/home/isar/Dropbox/scn_moa/"

# data loaded below 

## packages
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))

# data: not yet applicable here, will be created at end

# load required package collections
sapply(packages_general, require, character.only = TRUE)
#sapply(packages_annotation, require, character.only = TRUE)
#sapply(packages_pca, require, character.only = TRUE)
sapply(packages_normalisation, require, character.only = TRUE)

Import

exp raw

# MQ imputed values 
exp_pro_raw <- read_delim(paste0(dir_moa,"/data/gnp/pro/original/proteinGroups_filtered.txt"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)

# unimputed: not important. Less IDs, no value!
# exp_pro_raw <- read_delim(paste0(dir_moa,"/data/gnp/pro/original/proteinGroups_nonImputed.txt"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)


# format 
exp_pro_raw_temp <- exp_pro_raw[-(1:2),c(1:45,61)]
exp_pro_raw_temp$UNIPROT <- gsub(";.*","",exp_pro_raw_temp$`Majority protein IDs`)
exp_pro_raw_temp$`Majority protein IDs` <- NULL
names(exp_pro_raw_temp) <- gsub(" ", "_", names(exp_pro_raw_temp))
exp_pro_raw_temp[,1:45] <- sapply(exp_pro_raw_temp[,1:45], as.numeric) # make numeric

names(exp_pro_raw_temp) <- gsub("Intensity ", "", names(exp_pro_raw_temp)) 
names(exp_pro_raw_temp) <- gsub(" ", "_", names(exp_pro_raw_temp))

dim(exp_pro_raw_temp)

san raw

# import
san_pro_raw <- read_csv(paste0(dir_moa, "/data/gnp/pro/original/", "analyzed_proteome_samples_granulopoiesis - large_group.csv"))


# copy
san_pro_temp <- san_pro_raw

# create IDs
san_pro_temp$id <- san_pro_temp$name
san_pro_temp$id <- gsub(" ", "_", san_pro_temp$id)

# san for all exp?
setdiff(san_pro_temp$id, names(exp_pro_raw_temp)) #OK!
setdiff(names(exp_pro_raw_temp), san_pro_temp$id) # OK!


# donor details
donor_details <- read_csv(paste0(dir_moa, "data/gnp/pro/original/", "donor_details - Donors.csv"))

# names(san_pro_temp) #to check
san_pro_temp <- left_join(san_pro_temp, donor_details, by = c("donor"))

fan raw

# feature annotation proteome
fan_pro_temp <- exp_pro_raw[-(1:2),46:63]
fan_pro_temp$UNIPROT <- gsub(";.*","",fan_pro_temp$`Majority protein IDs`)

fan_pro_temp$has_mult_UNIPROT <- ifelse(str_detect(fan_pro_temp$`Majority protein IDs`, ";"), TRUE, FALSE)

# clean protein names
fan_pro_temp$name_prot <- gsub(";.*","",fan_pro_temp$`Protein names`)
fan_pro_temp$SYMBOL <- gsub(";.*","",fan_pro_temp$`Gene names`) # has SYMBOL for every UNIPROT


# check for doubles
n_occur_UNIPROT_imp <- data.frame(table(fan_pro_temp$UNIPROT))
n_occur_UNIPROT_imp[n_occur_UNIPROT_imp$Freq > 1,] #no doubles!

n_occur_SYMBOL_imp <- data.frame(table(fan_pro_temp$SYMBOL))
n_occur_SYMBOL_imp[n_occur_SYMBOL_imp$Freq > 1,] 
removed_UNIPROT_doubleSYMBOLS <- n_occur_SYMBOL_imp[n_occur_SYMBOL_imp$Freq > 1,] # doubles found in HLA genes

# remove doubles
fan_pro_temp %<>% distinct(SYMBOL, .keep_all = TRUE)

# check
sum(!(isUnique(fan_pro_temp$UNIPROT)))# all unique
sum(is.na(fan_pro_temp$UNIPROT)) # no NA

sum(!(isUnique(fan_pro_temp$SYMBOL)))# all unique
sum(is.na(fan_pro_temp$SYMBOL)) # one NA

#fan_imp %>% filter(is.na(fan_imp$SYMBOL)) %>% view() # manual curation: IGKV1-17
fan_pro_temp[is.na(fan_pro_temp$SYMBOL),]$SYMBOL <- 'IGKV1-17'


# check NA
anyNA(fan_pro_temp$SYMBOL) # no NA
anyNA(fan_pro_temp$UNIPROT) # no NA


fan_pro_temp

Feature check

NA values

# check NAs per feature and group
anyNA(exp_pro_raw_temp) # there are no NAs in any of the pro data!
# data were imputed using MaxQuant during raw data analysis

peptides per prot

names(fan_pro_temp)

table(fan_pro_temp$Peptides)

fan_pro_temp$Peptides <- as.numeric(fan_pro_temp$Peptides)

p.peptides <- fan_pro_temp %>%
  dplyr::arrange(desc(Peptides)) %>%
  ggplot(aes(x= Peptides)) +
  geom_histogram(stat="count") +
  scale_x_continuous(breaks=seq(1,50,1), limits = c(0,50)) +
  ggtitle("Quantified peptides per protein") +
  theme(axis.text.x = element_text(angle = 35, vjust = 0.5, hjust=0.3))

p.peptides

# which are the single pep prots?
fidU_single_pep <- fan_pro_temp %>% filter(Peptides == 1) %>% pull(UNIPROT)
fidS_single_pep <- fan_pro_temp %>% filter(Peptides == 1) %>% pull(SYMBOL)

length(fidU_single_pep) # 329 proteins lost

# remove vom exp
exp_pro_raw_temp_valid <- exp_pro_raw_temp %>% filter(! UNIPROT %in% fidU_single_pep)

Histograms

Try histrograms of all data

# create colors by sample
colors_ids_pro <- san_pro_temp %>% select(id, population_general)
colors_ids_pro$color <- colors_stage[match(colors_ids_pro$population_general, names(colors_stage))]
colors_ids_pro$population_general <- NULL

col_ids_pro <- as.vector(colors_ids_pro$color)
names(col_ids_pro) <- colors_ids_pro$id



# Density
exp_pro_long <- exp_pro_raw_temp_valid %>% pivot_longer(!UNIPROT, values_to = 'value', names_to = 'id')

exp_pro_long$stage <- san_pro_temp$population_general[match(exp_pro_long$id, san_pro_temp$id)]


exp_pro_long %>%
  ggplot(aes(x = value, color = id)) +
    geom_density(na.rm = T, bw = 'SJ') +
    scale_x_continuous(trans='log2') +
    scale_color_manual(values = col_ids_pro) +
    theme(legend.position = "none") +
    ggtitle("Density proteome")

# one sample looks strange!

Boxplot

exp_pro_long %>%
  arrange(stage) %>%
  ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
  geom_boxplot() +
#theme(axis.text.x=element_text(angle=45, hjust=1)) +
#coord_flip() +
  ggtitle("Boxplot expression pro raw") +
  xlab('Sample')+
  ylab("Expression")+
  theme(axis.text.x=element_text(angle=45, hjust=1),
        axis.text=element_text(size=7)) +
  scale_fill_manual(values = colors_stage)

toRemove_pro_lowQ <- c("P2_eMC_C")

Contaminants

In these data there are no contaminants! Only in the non-imputed ones but we wont use them.

define

maxquant_contaminants <- read_csv(paste0(dir_moa, "data/anno/", "maxquant_contaminants.txt"), col_names = FALSE)
UNI_mq_cont <- maxquant_contaminants$X1 # UNIPROT list

# check keratins 
fan_pro_temp %>% filter(str_detect(fan_pro_temp$name_prot, 'Kerat',)) # none
fan_pro_temp %>% filter(str_detect(fan_pro_temp$SYMBOL, 'KRT',)) # none

# contaminants 
fan_pro_temp %>% filter(Reverse == "+") # none 

fan_pro_temp %>% filter(str_detect(fan_pro_temp$UNIPROT, 'REV_',)) # none
fan_pro_temp %>% filter(str_detect(fan_pro_temp$UNIPROT, 'CON_',)) # none


fan_pro_temp %>% filter(is.na(UNIPROT)) # bit stange that here is nothing in the contaminants. Unclear why it appeared later in the dataset but might have been a mixup. 

# fan_pro_temp %>% filter(is.na(UNIPROT)) %>% select(UNIPROT,SYMBOL,"Only identified by site", "Reverse", "Potential contaminant" ) %>%  view # all clean and no contaminants.

# check MQ list
intersect(UNI_mq_cont, exp_pro_raw_temp_valid$UNIPROT) # no interactions!

# this has probably been done during the imputation step!

exclude outlier

# create non (= log2 values) without outlier
exp_pro_log2 <- exp_pro_raw_temp_valid[, colnames(exp_pro_raw_temp_valid) != toRemove_pro_lowQ]
colnames(exp_pro_log2)

# also remove from san
san_pro_temp_noOut <- san_pro_temp %>% filter(id != toRemove_pro_lowQ)

Normalisation

eSet

Used for VSN

exp_pro_log2$fid <- exp_pro_log2$UNIPROT
exp_pro_log2$UNIPROT <- NULL

# copy exp
exp_pro_lg2_4eSet <- FIDtoRow(exp_pro_log2)
exp_pro_non_4eSet <- 2^exp_pro_lg2_4eSet

# VSN needs counts, not log2! Not sure about the format of the data. They look a bit different.

# match san order
setdiff(names(exp_pro_non_4eSet), san_pro_temp_noOut$id)
setdiff(san_pro_temp_noOut$id, names(exp_pro_non_4eSet))

san_pro_4eSet <- san_pro_temp_noOut[match(names(exp_pro_non_4eSet), san_pro_temp_noOut$id),]
san_pro_4eSet <-  as.data.frame(san_pro_4eSet)
rownames(san_pro_4eSet) <- san_pro_4eSet$id

# turn into matrix
mx_pro_non_4eSet <- df2m(exp_pro_non_4eSet)


# create pheno
pheno_eSet_pro <- new("AnnotatedDataFrame", data = san_pro_4eSet)

# compile
eSet_pro_non <- ExpressionSet (
      assayData = mx_pro_non_4eSet, #exprs
      phenoData = pheno_eSet_pro) #pData

VSN norm

https://www.bioconductor.org/packages/devel/bioc/vignettes/vsn/inst/doc/A-vsn.html#sec:miss

https://academic.oup.com/bioinformatics/article/18/suppl_1/S96/231881

https://www.embl.org/groups/huber/ (first author on VSN paper)

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2680204/

# normalize (uses eSet with raw data)
eSet_pro_vsn = justvsn(eSet_pro_non)

# # vector of generalized log ratios
# M_cou_non = exprs(eSet_cou_non)[,1] - exprs(eSet_cou_non)[,2]
# M_cou_vsn = exprs(eSet_cou_vsn)[,1] - exprs(eSet_cou_vsn)[,2]
# 
# M_tpm_non = exprs(eSet_tpm_non)[,1] - exprs(eSet_tpm_non)[,2]
# M_tpm_vsn = exprs(eSet_tpm_vsn)[,1] - exprs(eSet_tpm_vsn)[,2]

# # histogram of the generalized log-ratios
# hist(M_cou_non, breaks = 100, col = "#d95f0e") # data not well normalized
# hist(M_cou_vsn, breaks = 100, col = "#d95f0e") # very nice
# 
# hist(M_tpm_non, breaks = 100, col = "#d95f0e") 
# hist(M_tpm_vsn, breaks = 100, col = "#d95f0e") 

# graphs
# meanSdPlot(eSet_cou_non)
# meanSdPlot(eSet_cou_vsn) 
# 
# meanSdPlot(eSet_tpm_non)
# meanSdPlot(eSet_tpm_vsn) 



# extract
exp_pro_vsn <- exprs(eSet_pro_vsn)
exp_pro_vsn <- as.data.frame(exp_pro_vsn)
exp_pro_vsn$fid <- rownames(exp_pro_vsn)

QN (quantile normalisation)

https://davetang.org/muse/2014/07/07/quantile-normalisation-in-r/

using log2 values bc of the examples given

# 
# mx.exp_pro_lg2 <- as.matrix(exp_pro_lg2_4eSet)
# 
# exp_pro_qn  <-  normalize.quantiles(mx.exp_pro_lg2)
# exp_pro_qn <- as.data.frame(exp_pro_qn)
# rownames(exp_pro_qn) <- rownames(mx.exp_pro_lg2)
# colnames(exp_pro_qn) <- colnames(mx.exp_pro_lg2)
# exp_pro_qn$UNIPROT <- rownames(exp_pro_qn) 

MN (median normalisation)

Recommended by Gergely, uses minimal assumptions

# exp_pro_mn <- normalizeMedianValues(exp_pro_lg2_4eSet) # limma
# exp_pro_mn <- as.data.frame(exp_pro_mn)
# exp_pro_mn$UNIPROT <- rownames(exp_pro_mn)
# colnames(exp_pro_mn)

Analyze norm. effect

Histogram

plot_sample_density_log.pro <- function(exp, title) {
  exp_long <- exp %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
  exp_long$stage <- san_pro_temp$population_general[match(exp_long$id, san_pro_temp$id)]
  exp_long$stage <- factor(exp_long$stage, levels = levels_mrn)
  exp_long$donor <- san_pro_temp$donor[match(exp_long$id, san_pro_temp$id)]
  exp_long %>%
    #filter(donor == 'B') %>%
    ggplot(aes(x = value, color = id)) +
      geom_density(na.rm = T, bw = 'SJ') +
      scale_x_continuous(trans='log2') +
      scale_color_manual(values = col_ids_pro) +
      theme(legend.position = "none") +
      ggtitle(title) +
      scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),
              labels = trans_format("log10", math_format(10^.x))) 
}

plot_sample_density_noLog.pro <- function(exp, title) {
  exp_long <- exp %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
  exp_long$stage <- san_pro_temp$population_general[match(exp_long$id, san_pro_temp$id)]
  exp_long$stage <- factor(exp_long$stage, levels = levels_mrn)
  exp_long$donor <- san_pro_temp$donor[match(exp_long$id, san_pro_temp$id)]
  exp_long %>%
    #filter(donor == 'B') %>%
    ggplot(aes(x = value, color = id)) +
      geom_density(na.rm = T, bw = 'SJ') +
      #scale_x_continuous(trans='log2') +
      scale_color_manual(values = col_ids_pro) +
      theme(legend.position = "none") +
      ggtitle(title)
}


#p.hist.cou_non <- plot_sample_density_noLog(exp_cou_non, "Counts non")
p.hist.pro_log2 <- plot_sample_density_noLog.pro(exp_pro_log2, "Pro log2")
p.hist.pro_vsn <- plot_sample_density_noLog.pro(exp_pro_vsn, "Pro vsn")
#p.hist.pro_mn <- plot_sample_density_log.pro(exp_pro_mn, "Pro mn")
#p.hist.pro_qn <- plot_sample_density_noLog.pro(exp_pro_qn, "Pro qn")

fig_hist_pro_norm_pre <- ggarrange(
    p.hist.pro_log2,
    p.hist.pro_vsn,
    ncol = 1,
    nrow = 2)
fig_hist_pro_norm_pre

fig_hist_pro_norm <- annotate_figure(fig_hist_pro_norm_pre,
               top = text_grob("Normalisation effects on proteome", color = "black", face = "bold", size = 18))
fig_hist_pro_norm

Boxplots

pepExp_4box_pro <- function(exp, san){
  exp_long <- exp %>% pivot_longer(!fid, names_to = 'id', values_to = "value")
  exp_long$stage <- san$population_general[match(exp_long$id, san$id)]
  exp_long$stage <- factor(exp_long$stage , levels = c('MB', 'PM', 'MC', 'MM', 'B', 'S', "PMN"))
  return(exp_long)
}

exp_pro_log2_long <- pepExp_4box_pro(exp_pro_log2, san_pro_temp_noOut)
exp_pro_vsn_long <- pepExp_4box_pro(exp_pro_vsn, san_pro_temp_noOut)
#exp_pro_mn_long <- pepExp_4box_pro(exp_pro_mn, san_pro_temp)
#exp_pro_qn_long <- pepExp_4box_pro(exp_pro_qn, san_pro_temp)

plot_box_4normCheck <- function(exp_long, title){
  exp_long %>%
    arrange(stage) %>%
    ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
    geom_boxplot() +
  #theme(axis.text.x=element_text(angle=45, hjust=1)) +
  #coord_flip() +
    ggtitle(title) +
    xlab('Sample')+
    ylab("Expression")+
    theme(axis.text.x=element_text(angle=45, hjust=1),
          axis.text=element_text(size=7)) +
    scale_fill_manual(values = colors_stage)
}

plot_box_4normCheck_logY <- function(exp_long, title){
  exp_long %>%
    arrange(stage) %>%
    ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
    geom_boxplot() +
  #theme(axis.text.x=element_text(angle=45, hjust=1)) +
  #coord_flip() +
    ggtitle(title) +
    xlab('Sample')+
    ylab("Expression")+
    theme(axis.text.x=element_text(angle=45, hjust=1),
          axis.text=element_text(size=7)) +
    scale_fill_manual(values = colors_stage) +
    scale_y_continuous(trans='log2')
}

p.box.pro_log2 <- plot_box_4normCheck(exp_pro_log2_long, "pro log2")
p.box.pro_vsn <- plot_box_4normCheck(exp_pro_vsn_long, "pro vsn")
#p.box.pro_mn <- plot_box_4normCheck(exp_pro_mn_long, "pro mn")
#p.box.pro_qn <- plot_box_4normCheck(exp_pro_qn_long, "pro qn")


# COUNTS
fig_boxes_norm_pro_pre <- ggarrange(
  p.box.pro_log2, 
  p.box.pro_vsn,
  legend = 'none')
fig_boxes_norm_pro_pre

fig_boxes_norm_pro <- annotate_figure(fig_boxes_norm_pro_pre,
               top = text_grob("Boxplots of normalized pro values", color = "black", face = "bold", size = 20))
fig_boxes_norm_pro

replicate correlation

# identify techReps
san_pro_techResp <- san_pro_temp_noOut %>% filter(has_tech_rep == T)
san_pro_techResp <- droplevels(san_pro_techResp)

### groups of interest"
get_Rep_ids_pro <- function(DONORoi, STAGEoi){
  san_pro_techResp %>%
    dplyr::filter(DONOR == DONORoi) %>%
    dplyr::filter(population_general == STAGEoi) %>%
    pull(id)
}

donors <- unique(san_pro_techResp$DONOR)
stages <- unique(san_pro_techResp$population_general)
ids <- list()
for(i in donors){
  for(j in stages){
    id <- paste(i,j,sep = '_')
    ids[[id]] <- get_Rep_ids_pro(i,j)
  }
}

rep_ids_pro <- purrr::compact(ids)


# plots
plot_corr_TechReps <- function(samples, data){
  data %>% dplyr::select(all_of(samples)) %>%
    rename(Rep1 = 1, Rep2 = 2) %>%
    ggplot(aes(x = Rep1, y = Rep2)) +
    geom_point() +
    stat_cor(method = "spearman", size = 3, na.rm = T)
}

# somehow the first list onlu has 1 entry, Remove
rep_ids_pro <- rep_ids_pro[-1] # because rep was removed as technical outlier

# the correlation is normalisation independend!
pl.corr_pro <- purrr::map(rep_ids_pro, plot_corr_TechReps, exp_pro_vsn)

# Plots
fig_RepCor_pro_pre <- ggarrange(
  pl.corr_pro$C_S + ggtitle("Donor C, stage S"),
  pl.corr_pro$D_MC + ggtitle("Donor D, stage MC"), 
  pl.corr_pro$D_MM + ggtitle("Donor D, stage MM"), 
  pl.corr_pro$D_PMN + ggtitle("Donor D, stage PMN"),
  pl.corr_pro$E_MC + ggtitle("Donor E, stage MC"), 
  pl.corr_pro$E_MM + ggtitle("Donor E, stage MM"), 
  pl.corr_pro$E_PMN + ggtitle("Donor E, stage PMN"),
  pl.corr_pro$A_MB + ggtitle("Donor A, stage MB"), 
  pl.corr_pro$A_MM + ggtitle("Donor A, stage MM"), 
  ncol = 5,
  nrow = 2,
  legend = 'none')

fig_RepCor_pro <- annotate_figure(fig_RepCor_pro_pre,
               top = text_grob("Correlation of technical repeat samples pro vsn", color = "black", face = "bold", size = 20))
fig_RepCor_pro

# samples with multi reps
rep_ids_pro

triplets_pro <- list(
  D_eMC_P1.P1rep = c('P1_eMC_D', 'P1_eMC_D_repeat'),
  D_eMC_P1.P2 = c('P1_eMC_D', 'P2_eMC_D'),
  D_eMC_P2.P1rep = c('P2_eMC_D', 'P1_eMC_D_repeat'),
  A_MM_G22.G19 = c('G22_A', 'G19_A'),
  A_MM_G22.G20 = c('G22_A', 'G20_A'),
  A_MM_G22.G21 = c('G22_A', 'G21_A'))


pl.corr_pro_triplet <- purrr::map(triplets_pro, plot_corr_TechReps, exp_pro_vsn)

fig_RepCor_pro_trip_pre <- ggarrange(
  pl.corr_pro_triplet$D_eMC_P1.P1rep + ggtitle("Donor D, eMC, P1 P1rep"), 
  pl.corr_pro_triplet$D_eMC_P1.P2 + ggtitle("Donor D, eMC, P1 P2"), 
  pl.corr_pro_triplet$D_eMC_P2.P1rep + ggtitle("Donor D, eMC, P1 P2"), 
  pl.corr_pro_triplet$A_MM_G22.G19 + ggtitle("Donor A, MM, G22 G19"), 
  pl.corr_pro_triplet$A_MM_G22.G20 + ggtitle("Donor A, MM, G22 G20"), 
  pl.corr_pro_triplet$A_MM_G22.G21 + ggtitle("Donor A, MM, G22 G21"), 
  legend = 'none')
fig_RepCor_pro_trip_pre

# combine all corr
fig_CORR_pro_pre <- ggarrange(
  pl.corr_pro$C_S + ggtitle("Donor C, stage S"),
  pl.corr_pro$D_MC + ggtitle("Donor D, stage MC"), 
  pl.corr_pro$D_MM + ggtitle("Donor D, stage MM"), 
  pl.corr_pro$D_PMN + ggtitle("Donor D, stage PMN"),
  pl.corr_pro$E_MC + ggtitle("Donor E, stage MC"), 
  pl.corr_pro$E_MM + ggtitle("Donor E, stage MM"), 
  pl.corr_pro$E_PMN + ggtitle("Donor E, stage PMN"),
  pl.corr_pro$A_MB + ggtitle("Donor A, stage MB"), 
  pl.corr_pro$A_MM + ggtitle("Donor A, stage MM"), 
  pl.corr_pro_triplet$D_eMC_P1.P1rep + ggtitle("Donor D, eMC, P1 P1rep"), 
  pl.corr_pro_triplet$D_eMC_P1.P2 + ggtitle("Donor D, eMC, P1 P2"), 
  pl.corr_pro_triplet$D_eMC_P2.P1rep + ggtitle("Donor D, eMC, P1 P2"), 
  pl.corr_pro_triplet$A_MM_G22.G19 + ggtitle("Donor A, MM, G22 G19"), 
  pl.corr_pro_triplet$A_MM_G22.G20 + ggtitle("Donor A, MM, G22 G20"), 
  pl.corr_pro_triplet$A_MM_G22.G21 + ggtitle("Donor A, MM, G22 G21"), 
  ncol = 5,
  nrow = 3,
  legend = 'none')

fig_CORR_pro <- annotate_figure(fig_CORR_pro_pre,
               top = text_grob("Correlation of technical repeat samples pro vsn", color = "black", face = "bold", size = 20))
fig_CORR_pro

Outliers

PCA

# extract data
extract_PCs <- function(res.pca){
  dat.pc12 <- res.pca$x %>% as.data.frame() %>%
    rownames_to_column('id') %>% 
    dplyr::select(id, PC1, PC2, PC3, PC4)
}

exp = exp_pro_vsn 
san = san_pro_temp_noOut

prep4_pca_pro <- function(exp, san) {
  exp <- exp[complete.cases(exp),]
  fid <- exp$fid
  exp$fid <- NULL
  sid <- colnames(exp)
  expT <- t(exp)
  expT <- matrix(as.numeric(expT),    # Convert to numeric matrix
                  ncol = ncol(expT))
  expT <- scale(expT, center = T, scale = T)
  colnames(expT) <- fid
  rownames(expT) <- sid
  res.pca <- prcomp(expT)
  dat.pca <- extract_PCs(res.pca)
  dat.pca$stage <- san$population_general[match(dat.pca$id, san$id)]
  return(dat.pca)
}

dat.pca.pro_log2 <- prep4_pca_pro(exp_pro_log2, san_pro_temp_noOut)
dat.pca.pro_vsn <- prep4_pca_pro(exp_pro_vsn, san_pro_temp_noOut)
#dat.pca.pro_mn <- prep4_pca_pro(exp_pro_qn, san_pro_temp_noOut)
#dat.pca.pro_qn <- prep4_pca_pro(exp_pro_mn, san_pro_temp_noOut)


# plot_myPCA <- function(dat.pca){
#   dat.pca %>%
#     ggplot(aes(x= PC1, y = PC2, color = stage))+
#       geom_point(size = 3)
# }

plot_PC1.2_labeled <- function(dat.pca){
  dat.pca %>%
    ggplot(aes(x= PC1, y = PC2, color = stage, label = id))+
      geom_point(size = 3)+
      geom_text_repel(size =2) +
    scale_color_manual(values = colors_stage)
}

plot_PC3.4_labeled <- function(dat.pca){
  dat.pca %>%
    ggplot(aes(x= PC3, y = PC4, color = stage, label = id))+
      geom_point(size = 3)+
      geom_text_repel(size =2) +
    scale_color_manual(values = colors_stage)
}


plot_PC1.2_labeled(dat.pca.pro_vsn) +
  ggtitle("PCA 1/2 of pro, vsn")

plot_PC3.4_labeled(dat.pca.pro_vsn) +
  ggtitle("PCA 3/4 of pro, vsn")

# identified PMN.2 (= P2_PMN_D) as another outliers via PLS-DA
# gnp_meta$san_pro # 

toRemove_id_pro_out_PCA_clear <- c('P1_PMN_E', 'P2_PMN_E', 'P2_PMN_D', 'P1_PMN_D',  'P1_S_A', 'P1_S_C', 'P2_S_C', 'P1_MB_D')

PCA with >=2 peptides only

Dont bother. Even though it looks very similar, validity of 1 peptide prots is questionable

# exp_pro_vsn_2pep <- exp_pro_vsn[! rownames(exp_pro_vsn) %in% fidU_single_pep,]
# #exp_pro_vsn_1pep <- exp_pro_vsn[rownames(exp_pro_vsn) %in% fidU_single_pep,]
# 
# dat.pca.pro.vsn_2pep <- prep4_pca_pro(exp_pro_vsn_2pep, san_pro_temp_noOut)
# dat.pca.pro.vsn_1pep <- prep4_pca_pro(exp_pro_vsn_1pep, san_pro_temp_noOut)
# 
# 
# plot_PC1.2_labeled(dat.pca.pro.vsn_2pep) +
#   ggtitle("PCA 1/2 of pro, vsn, min2pep prots")
# 
# plot_PC1.2_labeled(dat.pca.pro.vsn_1pep) +
#   ggtitle("PCA 1/2 of pro, vsn, 1pep prots")

# the 1 peptide proteins deliver the same PCA results as the <-2 peptide proteins. Ok to keep!

stage correlation

WHOLE SECTION DISCARDED

dissimilarity matrix

# # check labels
# all_unique(san_pro_temp_noOut$label_general)
# san_pro_temp_noOut$label_general <- make.unique(san_pro_temp_noOut$label_general)
# 
# # fct to prep
# prep_4clust_pro <- function(exp, san){
#   exp_4clust <- exp
#   exp_4clust <- exp_4clust[complete.cases(exp_4clust),]
#   # UNIPROT to col
#   exp_4clust <- transpose_UNIPROTcols(exp_4clust)
#   # lab_gen as rownames
#   rownames(exp_4clust) <-
#     san$label_general[match(rownames(exp_4clust),c(san$id))]
#   # scale
#   exp_4clust <- as.data.frame(scale(exp_4clust))
#   # return
#   return(exp_4clust)
# }
# 
# colnames(exp_pro_vsn)
# # 
# exp_pro_vsn_4clust <- prep_4clust_pro(exp_pro_vsn, san_pro_temp_noOut)
# 
# dist_pro_vsn <- dist(exp_pro_vsn_4clust, method = "euclidean")
# plot_DistMat_vsn <- fviz_dist(dist_pro_vsn, 
#            lab_size = 8,
#            gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))+ ggtitle("Dissimilarity matrix pro vsn")
# plot_DistMat_vsn
# 
# 
# # excluded samples
# # P1_eMC_C = MC_c_1.1
# 
# san_pro_temp_noOut
# toRemove_ids_pro_Outlier <- c('P1_eMC_C')

correlation

# 
# # plots
# plot_corr_pro <- function(samples, data){
#   data %>% dplyr::select(all_of(samples)) %>%
#     rename(Rep1 = 1, Rep2 = 2) %>%
#     ggplot(aes(x = Rep1, y = Rep2)) +
#     geom_point() +
#     stat_cor(method = "spearman", size = 3, na.rm = T)
# }
# 
# # 
# corr4_samples_pro <- list(
#   D_eMC_P1.P1rep = c('P1_eMC_D', 'P1_eMC_D_repeat'),
#   D_eMC_P1.P2 = c('P1_eMC_D', 'P2_eMC_D'),
#   D_eMC_P2.P1rep = c('P2_eMC_D', 'P1_eMC_D_repeat'),
#   A_MM_G22.G19 = c('G22_A', 'G19_A'),
#   A_MM_G22.G20 = c('G22_A', 'G20_A'),
#   A_MM_G22.G21 = c('G22_A', 'G21_A'))
# 
# 
# # the correlation is normalisation independend!
# pl.corr_pro <- purrr::map(rep_ids_pro, plot_corr_TechReps, exp_pro_vsn)

Final prep

remove outliers

toRemPro <- c(toRemove_id_pro_out_PCA_clear, toRemove_pro_lowQ)
length(toRemPro) # 9 in total

# san
san_pro_clean <- san_pro_temp_noOut %>% filter(! id %in% toRemPro)
nrow(san_pro_clean) # 36 samples

## exp
exp_pro_log2_clean <- exp_pro_log2[! colnames(exp_pro_log2) %in% toRemPro]
ncol(exp_pro_log2_clean) # 36+fid

exp_pro_vsn_clean <- exp_pro_vsn[! colnames(exp_pro_vsn) %in% toRemPro]
ncol(exp_pro_vsn_clean) # 36+fid

combine & rename samples

# form medians via function
combine_samples_pro <- function(exp){
  exp_c <- exp
  ## A
  # MB
  exp_c <- exp_c %>%  
    rowwise() %>% 
    dplyr::mutate(MB_a1 = median(c(G4_A, G5_A), na.rm = T)) %>%
    select(- c(G4_A, G5_A))

  # MM 
  exp_c <- exp_c %>%  
    rowwise() %>%
    dplyr::mutate(MM_a1 = median(c(G22_A, G19_A, G20_A, G21_A), na.rm = T)) %>%
    select(- c(G22_A, G19_A, G20_A, G21_A))


   ## D
  # MC
   exp_c <- exp_c %>%  
    rowwise() %>%
    dplyr::mutate(eMC_d = median(c(P1_eMC_D, P1_eMC_D_repeat, P2_eMC_D), na.rm = T)) %>%
    select(- c(P1_eMC_D, P1_eMC_D_repeat, P2_eMC_D))
   # MM
   exp_c <- exp_c %>%  
    rowwise() %>%
    dplyr::mutate(MM_d = median(c(P1_MM_D, P2_MM_D), na.rm = T)) %>%
    select(- c(P1_MM_D, P2_MM_D))
   ## E
   #MC
   exp_c <- exp_c %>%  
    rowwise() %>%
    dplyr::mutate(eMC_e = median(c(P1_eMC_E, P2_eMC_E), na.rm = T)) %>%
    select(- c(P1_eMC_E, P2_eMC_E))
   #MM
   exp_c <- exp_c %>%  
    rowwise() %>%
    dplyr::mutate(MM_e = median(c(P1_MM_E, P2_MM_E), na.rm = T)) %>%
    select(- c(P1_MM_E, P2_MM_E))
  return(exp_c)
}


rename_cols_pro <- function(exp){
  exp_c <- exp
  # A
  exp_c %<>% rename(MB_a2 = P1_MB_A)
  exp_c %<>% rename(eMC_a = P1_eMC_A)
  exp_c %<>% rename(lMC_a = P1_lMC_A)
  exp_c %<>% rename(MC_a = G1_A) 
  exp_c %<>% rename(MM_a2 = P1_MM_A)
  exp_c %<>% rename(PM_a = G12_A)
  exp_c %<>% rename(PMN_a = P1_PMN_A)
  exp_c %<>% rename(B_a2 = P1_B_A)
  exp_c %<>% rename(B_a1 = G15_A)
  # C
  exp_c %<>% rename(MB_c = P1_MB_C) 
  exp_c %<>% rename(eMC_c = P1_eMC_C)
  exp_c %<>% rename(lMC_c = P1_lMC_C)
  exp_c %<>% rename(MM_c = P1_MM_C)
  exp_c %<>% rename(PM_c = P1_PM_C)
  exp_c %<>% rename(eB_c = P1_eB_C)
  exp_c %<>% rename(lB_c = P1_lB_C)
  exp_c %<>% rename(PMN_c = P1_PMN_C)
  # D
  exp_c %<>% rename(PM_d = P1_PM_D)
  exp_c %<>% rename(lB_d = P1_lB_D)
  # E
  exp_c %<>% rename(MB_e = P1_MB_E)
  exp_c %<>% rename(PM_e = P1_PM_E)
  return(exp_c)
}


### apply to data
# log2
exp_pro_log2_clean_com <- combine_samples_pro(exp_pro_log2_clean)
exp_pro_log2_clean_com <- rename_cols_pro(exp_pro_log2_clean_com)
sort(colnames(exp_pro_log2_clean_com)) # OK!

exp_pro_vsn_clean_com <- combine_samples_pro(exp_pro_vsn_clean)
exp_pro_vsn_clean_com <- rename_cols_pro(exp_pro_vsn_clean_com)
sort(colnames(exp_pro_vsn_clean_com)) # OK!

# make same order
order_colnames_pro <- sort(colnames(exp_pro_vsn_clean_com))
order_colnames_pro <- moveMe(order_colnames_pro, "fid first")

exp_pro_vsn_clean_com <- exp_pro_vsn_clean_com[order_colnames_pro]
exp_pro_log2_clean_com <- exp_pro_log2_clean_com[order_colnames_pro]

identical(colnames(exp_pro_vsn_clean_com), colnames(exp_pro_log2_clean_com)) # OK!
identical(rownames(exp_pro_vsn_clean_com), rownames(exp_pro_log2_clean_com)) # OK!

san

# copy to gSheets
# write_clip(san_pro_clean)

san_pro_pre <- read_csv("/home/isar/Dropbox/scn_moa/data/gnp/pro/san_pro_final - Sheet1.csv")

san_pro_pre %<>% filter(! is.na(label_final))

setdiff(colnames(exp_pro_vsn_clean_com), san_pro_pre$label_final) # OK
setdiff(san_pro_pre$label_final, colnames(exp_pro_vsn_clean_com)) # OK


# Finalize
san_pro_fin <- san_pro_pre %>% select(
   id = label_final,
   stage = population_general,
   stage_nr = population_hirachical_number,
   substage = population_specific,
   donor = donor,
   sex = sex,
   age = age
)

san_pro_fin <- san_pro_fin[order(match(san_pro_fin$id, colnames(exp_pro_vsn_clean_com)[-1])),]
identical(san_pro_fin$id, colnames(exp_pro_vsn_clean_com)[-1])

san_pro_fin$idI <- make_unique(san_pro_fin$stage, sep = "_", wrap_in_brackets = FALSE)

filter valid features: min2/group

identical(colnames(exp_pro_vsn_clean_com), colnames(exp_pro_log2_clean_com))

# No NA in the data!!!!!!! (imputed via MaxQuant in raw data production)

match mrn and pro IDs

fan_master

This is based on the fan master created previously (matching mrn, pro and scn)

# manually curated IDs to harmonize mRNA and proteins
fan_complete_pre <- read_csv(paste0(dir_moa, "/data/anno/", "fan_complete.csv") )

# check UNIPROT not included in fan
setdiff(exp_pro_vsn_clean_com$fid, fan_complete_pre$UNIPROT) # 23 missing in fan

# overwrite IDs that do not match between pro and scn
rename_Features_pro <- function(exp){
  exp_c <- exp
  exp_c$fid %<>% recode('P62158' = 'P0DP23') # old ID of CALM1
  exp_c$fid %<>% recode('Q5SNT6' = 'Q641Q2') # to match SCN
  exp_c$fid %<>% recode('O15320' = 'Q96PC5') # old ID of MIA2
  exp_c$fid %<>% recode('P0CW22' = 'P08708') # old ID of RPS17
  exp_c$fid %<>% recode('P0DMV9' = 'P0DMV8') # old ID of HSPA1A
  #exp_c$fid %<>% recode('P10316' = 'P04439') # old ID of HLA-A. This creates twice. Delete!
  exp_c$fid %<>% recode('P0CG05' = 'P0DOY2') # old ID of HLA-
  return(exp_c)
}

exp_pro_lg2_preF1 <- rename_Features_pro(exp_pro_log2_clean_com)
exp_pro_vsn_preF1 <- rename_Features_pro(exp_pro_vsn_clean_com)

# delete double features 
delete_features_pro_d2SCN <- c('O00241', 'P42167', 'P10316') # only P42167 (=TMPO) in exp pro

exp_pro_lg2_preF2 <- exp_pro_lg2_preF1 %>% filter(! fid %in% delete_features_pro_d2SCN)
exp_pro_vsn_preF2 <- exp_pro_vsn_preF1 %>% filter(! fid %in% delete_features_pro_d2SCN)

UNIpro_NotInFan <- data.frame(UNI = setdiff(exp_pro_lg2_preF2$fid, fan_complete_pre$UNIPROT)) 
UNIpro_NotInFan$SYMBOL <- fan_pro_temp$`Gene names`[match(UNIpro_NotInFan$UNI, fan_pro_temp$UNIPROT)]

nrow(UNIpro_NotInFan) # 16 left. None have SYMBOLS. Check manually

# write_clip(UNIpro_NotInFan$UNI)
# manual inspection shows all uncovered UNIPROTS to be double entries! Remove.

exp_pro_lg2_preF3 <- exp_pro_lg2_preF2 %>% filter(! fid %in% UNIpro_NotInFan$UNI)
exp_pro_vsn_preF3 <- exp_pro_vsn_preF2 %>% filter(! fid %in% UNIpro_NotInFan$UNI)

setdiff(exp_pro_lg2_preF3$fid, fan_complete_pre$UNIPROT) # all in fan!
setdiff(exp_pro_vsn_preF3$fid, fan_complete_pre$UNIPROT) # all in fan!

exp_pro_lg2_preF3$fid[duplicated(exp_pro_lg2_preF3$fid)] # no duplicates

Molecular numbers

Using proteomic ruler for copy number estimation (http://www.mcponline.org/content/early/2014/09/17/mcp.M113.037309.full.pdf+html)

Assuming 250g/L typic cell protein concentration (http://www.coxdocs.org/doku.php?id=perseus:user:plugins:proteomicruler)

Neutrophil volume 320um3 (bionumbers)

Total protein content in 1e6 cells ~~ 80ug which translates into 8e-11g (80pg) per cell

## exp: using the 
exp_pro_lg2_4MN <- exp_pro_lg2_preF3
exp_pro_vsn_4MN <- exp_pro_vsn_preF3

# san
san_pro <- san_pro_fin


## fan
fan_complete_pre <- read_csv(paste0(dir_moa, "/data/anno/", "fan_complete.csv") )

setdiff(exp_pro_lg2_4MN$fid, fan_complete_pre$UNIPROT) # OK!

fan_4MN <- fan_complete_pre

## get masses of proteins
load(file = paste0(dir_moa, "/data/anno/", "UNI_masses.RData"))

#UNI_masses <- read_excel("Downloads/uniprotkb_reviewed_true_AND_model_organ_2023_09_19.xlsx", col_types = c("text", "text", "text", "text", "numeric", "numeric"))

#save(UNI_masses, file = paste0(dir_moa, "/data/anno/", "UNI_masses.RData"))



# check
length(intersect(exp_pro_lg2_4MN$fid, UNI_masses$Entry)) # 3155. 1 missing
length(intersect(exp_pro_vsn_4MN$fid, UNI_masses$Entry)) # 3155. 1 missing
# same for both sets!

UNI_missing_lg2 <- setdiff(exp_pro_lg2_4MN$fid, UNI_masses$Entry)
UNI_missing_vsn <- setdiff(exp_pro_vsn_4MN$fid, UNI_masses$Entry)

# its a old UNIPROT entry in my data! Exchnage for current entry

exp_pro_lg2_4MN$fid <- recode(exp_pro_lg2_4MN$fid,
              P30042 = 'P0DPI2') # checked. ok

exp_pro_vsn_4MN$fid <- recode(exp_pro_vsn_4MN$fid,
              P30042 = 'P0DPI2') # checked. ok

# recheck
setdiff(exp_pro_lg2_4MN$fid, UNI_masses$Entry) # ok!
setdiff(exp_pro_vsn_4MN$fid, UNI_masses$Entry) # ok!

exp_pro = exp_pro_vsn_4MN

# turn pro to medians
make_long_pro <- function(exp_pro){
  # make long
  expL_pro <- exp_pro
  expL_pro %<>% pivot_longer(!fid, names_to = "id", values_to = "value")
  expL_pro$stage <- san_pro$stage[match(expL_pro$id, san_pro$id)]
  expL_pro$stage <- factor(expL_pro$stage, levels = levels_pro)
  return(expL_pro)
}

exp_pro_lg2_4MN_long <- make_long_pro(exp_pro_lg2_4MN)
exp_pro_vsn_4MN_long <- make_long_pro(exp_pro_vsn_4MN)

medians_pro_lg2 <- exp_pro_lg2_4MN_long %>%
  dplyr::select(fid, stage, value) %>%
  group_by(fid, stage) %>%
  dplyr::summarise(median_pro = median(value, na.rm = T))
# glimpse(medians_pro_lg2) # ok

medians_pro_vsn <- exp_pro_vsn_4MN_long %>%
  dplyr::select(fid, stage, value) %>%
  group_by(fid, stage) %>%
  dplyr::summarise(median_pro = median(value, na.rm = T))
# glimpse(medians_pro_vsn) # ok

### proteome ruler
san_pro_temp$cell_number_as_million # this is qute diverese. Not sure how to solve this...

# add mass, rev log2 and add SYMBOL
medians_pro_lg2$mass <- UNI_masses$Mass[match(medians_pro_lg2$fid, UNI_masses$Entry,)]
medians_pro_lg2$median_quant <- 2**medians_pro_lg2$median_pro
medians_pro_lg2$SYMBOL <- fan_4MN$SYMBOL[match(medians_pro_lg2$fid, fan_4MN$UNIPROT)]

medians_pro_vsn$mass <- UNI_masses$Mass[match(medians_pro_vsn$fid, UNI_masses$Entry,)]
medians_pro_vsn$median_quant <- 2**medians_pro_vsn$median_pro
medians_pro_vsn$SYMBOL <- fan_4MN$SYMBOL[match(medians_pro_vsn$fid, fan_4MN$UNIPROT)]


# define constants
avogadro = 6.022e23
protein_mass_per_cell = 8e-11
total_ms_signal_lg2 = sum(medians_pro_lg2$median_quant) # 6e12
total_ms_signal_vsn = sum(medians_pro_vsn$median_quant) # 7e12
# not sure of this difference is relevant

# calculate & round to integer
medians_pro_lg2 %<>% mutate(
  molNr = median_quant / total_ms_signal_lg2*avogadro/ mass *protein_mass_per_cell)
medians_pro_lg2$molNr <- round(medians_pro_lg2$molNr, 0)

medians_pro_vsn %<>% mutate(
  molNr = median_quant / total_ms_signal_lg2*avogadro/ mass *protein_mass_per_cell)
medians_pro_vsn$molNr <- round(medians_pro_vsn$molNr, 0)

# compare for some proteins
molNR_4comps_lg2 <- medians_pro_lg2 %>% filter(SYMBOL %in% c('S100A8', 'S100A12', 'ELANE', 'AZU1'))
molNR_4comps_lg2$type <- "lg2"
molNR_4comps_vsn <- medians_pro_vsn %>% filter(SYMBOL%in% c('S100A8', 'S100A12', 'ELANE', 'AZU1'))
molNR_4comps_vsn$type <- "vsn"

molNR_4comps <- bind_rows(molNR_4comps_lg2, molNR_4comps_vsn)

molNR_4comps %>%
  ggplot(aes(x = stage, y = molNr, fill = type)) +
  geom_bar(stat = "identity", position = "dodge") +
  #scale_y_continuous(trans='log2') +
  facet_grid(SYMBOL ~., scales= "free_y")

fan_pro_temp %>% filter()


# exp_pro_lg2_preF3 %>% filter(fid == 'P80511') %>% view() # S100A1: C is a problem!
# exp_pro_lg2_preF3 %>% filter(fid == 'P20160') %>% view() # AZU1


# check old data, Jiadong mentioned that he does not see this fall in s100A12?
#load(file = paste0(dir_moa, "/data/mixOmics/", "moxL_data.RData"))
#exp_moxL_pro %>% select(S100A12) #its also there. all good

# would it make a difference to use median cell numbers?


## save
medians_pro_vsn
medians_pro_lg2

molNr_pro <- medians_pro_vsn %>% select(UNIPROT = fid, SYMBOL, stage, molNr_vsn = molNr)

identical(medians_pro_lg2$fid, medians_pro_vsn$fid)
identical(medians_pro_lg2$stage, medians_pro_vsn$stage)

molNr_pro$molNr_lg2 <- medians_pro_lg2$molNr

Final format

# one uni has to be exchanged from exp and fan!
# P30042 = 'P0DPI2'

exp_pro_lg2_preF3$fid <- recode(exp_pro_lg2_preF3$fid,
              P30042 = 'P0DPI2') 
exp_pro_vsn_preF3$fid <- recode(exp_pro_vsn_preF3$fid,
              P30042 = 'P0DPI2') 

molNr_pro %>% filter(UNIPROT == 'P0DPI2') # already changed in molNr!

fan_pro_temp$UNIPROT <- recode(fan_pro_temp$UNIPROT,
              P30042 = 'P0DPI2')

fan_pro_sequencing_details <- fan_pro_temp

exp_pro_lg2_fin <- transpose_FIDcols(exp_pro_lg2_preF3) 
exp_pro_vsn_fin <- transpose_FIDcols(exp_pro_vsn_preF3)


# savety check
setdiff(san_pro_fin$id, rownames(exp_pro_lg2_fin)) # all IDs match
setdiff(rownames(exp_pro_lg2_fin), san_pro_fin$id)  # all IDs match

identical(rownames(exp_pro_lg2_fin), san_pro_fin$id) # OK!

colnames(exp_pro_lg2_fin)[duplicated(colnames(exp_pro_lg2_fin))] # no duplic


load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))
fan_fin <- gnp.mrn$fan

# fan_fin$UNIPROT <- recode(fan_fin$UNIPROT, P30042 = 'P0DPI2') # already done in fan of mRNA. OK!

setdiff(colnames(exp_pro_lg2_fin), fan_fin$UNIPROT) # ok!

Export

gnp.pro <- list(
  ## final data
  exp_pro_lg2 = exp_pro_lg2_fin,
  exp_pro_vsn = exp_pro_vsn_fin,
  exp_pro_molNr = molNr_pro,
  # meta data
  san_pro = san_pro_fin,
  levels_pro = levels_pro,
  fan = fan_fin, # SAME FOR mRNA, pro and scn!
  seq_inf_pro = fan_pro_sequencing_details,
  ## raw data
  exp_pro_raw = exp_pro_raw_temp,
  san_pro_raw = san_pro_temp)

save(
  gnp.pro,
  file = paste0(dir_moa, "/data/final/", "gnp.pro.RData"))