Last update: 23-10-13
Last markdown compiled: 23-10-13
ToDo:
knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE)
# dirs
## Sebastian @Linux
dir_moa <- "/home/isar/Dropbox/scn_moa/"
# data loaded below
## packages
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))
# data: not yet applicable here, will be created at end
# load required package collections
sapply(packages_general, require, character.only = TRUE)
#sapply(packages_annotation, require, character.only = TRUE)
#sapply(packages_pca, require, character.only = TRUE)
sapply(packages_normalisation, require, character.only = TRUE)
# MQ imputed values
exp_pro_raw <- read_delim(paste0(dir_moa,"/data/gnp/pro/original/proteinGroups_filtered.txt"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)
# unimputed: not important. Less IDs, no value!
# exp_pro_raw <- read_delim(paste0(dir_moa,"/data/gnp/pro/original/proteinGroups_nonImputed.txt"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)
# format
exp_pro_raw_temp <- exp_pro_raw[-(1:2),c(1:45,61)]
exp_pro_raw_temp$UNIPROT <- gsub(";.*","",exp_pro_raw_temp$`Majority protein IDs`)
exp_pro_raw_temp$`Majority protein IDs` <- NULL
names(exp_pro_raw_temp) <- gsub(" ", "_", names(exp_pro_raw_temp))
exp_pro_raw_temp[,1:45] <- sapply(exp_pro_raw_temp[,1:45], as.numeric) # make numeric
names(exp_pro_raw_temp) <- gsub("Intensity ", "", names(exp_pro_raw_temp))
names(exp_pro_raw_temp) <- gsub(" ", "_", names(exp_pro_raw_temp))
dim(exp_pro_raw_temp)
# import
san_pro_raw <- read_csv(paste0(dir_moa, "/data/gnp/pro/original/", "analyzed_proteome_samples_granulopoiesis - large_group.csv"))
# copy
san_pro_temp <- san_pro_raw
# create IDs
san_pro_temp$id <- san_pro_temp$name
san_pro_temp$id <- gsub(" ", "_", san_pro_temp$id)
# san for all exp?
setdiff(san_pro_temp$id, names(exp_pro_raw_temp)) #OK!
setdiff(names(exp_pro_raw_temp), san_pro_temp$id) # OK!
# donor details
donor_details <- read_csv(paste0(dir_moa, "data/gnp/pro/original/", "donor_details - Donors.csv"))
# names(san_pro_temp) #to check
san_pro_temp <- left_join(san_pro_temp, donor_details, by = c("donor"))
# feature annotation proteome
fan_pro_temp <- exp_pro_raw[-(1:2),46:63]
fan_pro_temp$UNIPROT <- gsub(";.*","",fan_pro_temp$`Majority protein IDs`)
fan_pro_temp$has_mult_UNIPROT <- ifelse(str_detect(fan_pro_temp$`Majority protein IDs`, ";"), TRUE, FALSE)
# clean protein names
fan_pro_temp$name_prot <- gsub(";.*","",fan_pro_temp$`Protein names`)
fan_pro_temp$SYMBOL <- gsub(";.*","",fan_pro_temp$`Gene names`) # has SYMBOL for every UNIPROT
# check for doubles
n_occur_UNIPROT_imp <- data.frame(table(fan_pro_temp$UNIPROT))
n_occur_UNIPROT_imp[n_occur_UNIPROT_imp$Freq > 1,] #no doubles!
n_occur_SYMBOL_imp <- data.frame(table(fan_pro_temp$SYMBOL))
n_occur_SYMBOL_imp[n_occur_SYMBOL_imp$Freq > 1,]
removed_UNIPROT_doubleSYMBOLS <- n_occur_SYMBOL_imp[n_occur_SYMBOL_imp$Freq > 1,] # doubles found in HLA genes
# remove doubles
fan_pro_temp %<>% distinct(SYMBOL, .keep_all = TRUE)
# check
sum(!(isUnique(fan_pro_temp$UNIPROT)))# all unique
sum(is.na(fan_pro_temp$UNIPROT)) # no NA
sum(!(isUnique(fan_pro_temp$SYMBOL)))# all unique
sum(is.na(fan_pro_temp$SYMBOL)) # one NA
#fan_imp %>% filter(is.na(fan_imp$SYMBOL)) %>% view() # manual curation: IGKV1-17
fan_pro_temp[is.na(fan_pro_temp$SYMBOL),]$SYMBOL <- 'IGKV1-17'
# check NA
anyNA(fan_pro_temp$SYMBOL) # no NA
anyNA(fan_pro_temp$UNIPROT) # no NA
fan_pro_temp
# check NAs per feature and group
anyNA(exp_pro_raw_temp) # there are no NAs in any of the pro data!
# data were imputed using MaxQuant during raw data analysis
names(fan_pro_temp)
table(fan_pro_temp$Peptides)
fan_pro_temp$Peptides <- as.numeric(fan_pro_temp$Peptides)
p.peptides <- fan_pro_temp %>%
dplyr::arrange(desc(Peptides)) %>%
ggplot(aes(x= Peptides)) +
geom_histogram(stat="count") +
scale_x_continuous(breaks=seq(1,50,1), limits = c(0,50)) +
ggtitle("Quantified peptides per protein") +
theme(axis.text.x = element_text(angle = 35, vjust = 0.5, hjust=0.3))
p.peptides
# which are the single pep prots?
fidU_single_pep <- fan_pro_temp %>% filter(Peptides == 1) %>% pull(UNIPROT)
fidS_single_pep <- fan_pro_temp %>% filter(Peptides == 1) %>% pull(SYMBOL)
length(fidU_single_pep) # 329 proteins lost
# remove vom exp
exp_pro_raw_temp_valid <- exp_pro_raw_temp %>% filter(! UNIPROT %in% fidU_single_pep)
Try histrograms of all data
# create colors by sample
colors_ids_pro <- san_pro_temp %>% select(id, population_general)
colors_ids_pro$color <- colors_stage[match(colors_ids_pro$population_general, names(colors_stage))]
colors_ids_pro$population_general <- NULL
col_ids_pro <- as.vector(colors_ids_pro$color)
names(col_ids_pro) <- colors_ids_pro$id
# Density
exp_pro_long <- exp_pro_raw_temp_valid %>% pivot_longer(!UNIPROT, values_to = 'value', names_to = 'id')
exp_pro_long$stage <- san_pro_temp$population_general[match(exp_pro_long$id, san_pro_temp$id)]
exp_pro_long %>%
ggplot(aes(x = value, color = id)) +
geom_density(na.rm = T, bw = 'SJ') +
scale_x_continuous(trans='log2') +
scale_color_manual(values = col_ids_pro) +
theme(legend.position = "none") +
ggtitle("Density proteome")
# one sample looks strange!
exp_pro_long %>%
arrange(stage) %>%
ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
geom_boxplot() +
#theme(axis.text.x=element_text(angle=45, hjust=1)) +
#coord_flip() +
ggtitle("Boxplot expression pro raw") +
xlab('Sample')+
ylab("Expression")+
theme(axis.text.x=element_text(angle=45, hjust=1),
axis.text=element_text(size=7)) +
scale_fill_manual(values = colors_stage)
toRemove_pro_lowQ <- c("P2_eMC_C")
In these data there are no contaminants! Only in the non-imputed ones but we wont use them.
maxquant_contaminants <- read_csv(paste0(dir_moa, "data/anno/", "maxquant_contaminants.txt"), col_names = FALSE)
UNI_mq_cont <- maxquant_contaminants$X1 # UNIPROT list
# check keratins
fan_pro_temp %>% filter(str_detect(fan_pro_temp$name_prot, 'Kerat',)) # none
fan_pro_temp %>% filter(str_detect(fan_pro_temp$SYMBOL, 'KRT',)) # none
# contaminants
fan_pro_temp %>% filter(Reverse == "+") # none
fan_pro_temp %>% filter(str_detect(fan_pro_temp$UNIPROT, 'REV_',)) # none
fan_pro_temp %>% filter(str_detect(fan_pro_temp$UNIPROT, 'CON_',)) # none
fan_pro_temp %>% filter(is.na(UNIPROT)) # bit stange that here is nothing in the contaminants. Unclear why it appeared later in the dataset but might have been a mixup.
# fan_pro_temp %>% filter(is.na(UNIPROT)) %>% select(UNIPROT,SYMBOL,"Only identified by site", "Reverse", "Potential contaminant" ) %>% view # all clean and no contaminants.
# check MQ list
intersect(UNI_mq_cont, exp_pro_raw_temp_valid$UNIPROT) # no interactions!
# this has probably been done during the imputation step!
# create non (= log2 values) without outlier
exp_pro_log2 <- exp_pro_raw_temp_valid[, colnames(exp_pro_raw_temp_valid) != toRemove_pro_lowQ]
colnames(exp_pro_log2)
# also remove from san
san_pro_temp_noOut <- san_pro_temp %>% filter(id != toRemove_pro_lowQ)
Used for VSN
exp_pro_log2$fid <- exp_pro_log2$UNIPROT
exp_pro_log2$UNIPROT <- NULL
# copy exp
exp_pro_lg2_4eSet <- FIDtoRow(exp_pro_log2)
exp_pro_non_4eSet <- 2^exp_pro_lg2_4eSet
# VSN needs counts, not log2! Not sure about the format of the data. They look a bit different.
# match san order
setdiff(names(exp_pro_non_4eSet), san_pro_temp_noOut$id)
setdiff(san_pro_temp_noOut$id, names(exp_pro_non_4eSet))
san_pro_4eSet <- san_pro_temp_noOut[match(names(exp_pro_non_4eSet), san_pro_temp_noOut$id),]
san_pro_4eSet <- as.data.frame(san_pro_4eSet)
rownames(san_pro_4eSet) <- san_pro_4eSet$id
# turn into matrix
mx_pro_non_4eSet <- df2m(exp_pro_non_4eSet)
# create pheno
pheno_eSet_pro <- new("AnnotatedDataFrame", data = san_pro_4eSet)
# compile
eSet_pro_non <- ExpressionSet (
assayData = mx_pro_non_4eSet, #exprs
phenoData = pheno_eSet_pro) #pData
https://www.bioconductor.org/packages/devel/bioc/vignettes/vsn/inst/doc/A-vsn.html#sec:miss
https://academic.oup.com/bioinformatics/article/18/suppl_1/S96/231881
https://www.embl.org/groups/huber/ (first author on VSN paper)
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2680204/
# normalize (uses eSet with raw data)
eSet_pro_vsn = justvsn(eSet_pro_non)
# # vector of generalized log ratios
# M_cou_non = exprs(eSet_cou_non)[,1] - exprs(eSet_cou_non)[,2]
# M_cou_vsn = exprs(eSet_cou_vsn)[,1] - exprs(eSet_cou_vsn)[,2]
#
# M_tpm_non = exprs(eSet_tpm_non)[,1] - exprs(eSet_tpm_non)[,2]
# M_tpm_vsn = exprs(eSet_tpm_vsn)[,1] - exprs(eSet_tpm_vsn)[,2]
# # histogram of the generalized log-ratios
# hist(M_cou_non, breaks = 100, col = "#d95f0e") # data not well normalized
# hist(M_cou_vsn, breaks = 100, col = "#d95f0e") # very nice
#
# hist(M_tpm_non, breaks = 100, col = "#d95f0e")
# hist(M_tpm_vsn, breaks = 100, col = "#d95f0e")
# graphs
# meanSdPlot(eSet_cou_non)
# meanSdPlot(eSet_cou_vsn)
#
# meanSdPlot(eSet_tpm_non)
# meanSdPlot(eSet_tpm_vsn)
# extract
exp_pro_vsn <- exprs(eSet_pro_vsn)
exp_pro_vsn <- as.data.frame(exp_pro_vsn)
exp_pro_vsn$fid <- rownames(exp_pro_vsn)
https://davetang.org/muse/2014/07/07/quantile-normalisation-in-r/
using log2 values bc of the examples given
#
# mx.exp_pro_lg2 <- as.matrix(exp_pro_lg2_4eSet)
#
# exp_pro_qn <- normalize.quantiles(mx.exp_pro_lg2)
# exp_pro_qn <- as.data.frame(exp_pro_qn)
# rownames(exp_pro_qn) <- rownames(mx.exp_pro_lg2)
# colnames(exp_pro_qn) <- colnames(mx.exp_pro_lg2)
# exp_pro_qn$UNIPROT <- rownames(exp_pro_qn)
Recommended by Gergely, uses minimal assumptions
# exp_pro_mn <- normalizeMedianValues(exp_pro_lg2_4eSet) # limma
# exp_pro_mn <- as.data.frame(exp_pro_mn)
# exp_pro_mn$UNIPROT <- rownames(exp_pro_mn)
# colnames(exp_pro_mn)
plot_sample_density_log.pro <- function(exp, title) {
exp_long <- exp %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
exp_long$stage <- san_pro_temp$population_general[match(exp_long$id, san_pro_temp$id)]
exp_long$stage <- factor(exp_long$stage, levels = levels_mrn)
exp_long$donor <- san_pro_temp$donor[match(exp_long$id, san_pro_temp$id)]
exp_long %>%
#filter(donor == 'B') %>%
ggplot(aes(x = value, color = id)) +
geom_density(na.rm = T, bw = 'SJ') +
scale_x_continuous(trans='log2') +
scale_color_manual(values = col_ids_pro) +
theme(legend.position = "none") +
ggtitle(title) +
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
}
plot_sample_density_noLog.pro <- function(exp, title) {
exp_long <- exp %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
exp_long$stage <- san_pro_temp$population_general[match(exp_long$id, san_pro_temp$id)]
exp_long$stage <- factor(exp_long$stage, levels = levels_mrn)
exp_long$donor <- san_pro_temp$donor[match(exp_long$id, san_pro_temp$id)]
exp_long %>%
#filter(donor == 'B') %>%
ggplot(aes(x = value, color = id)) +
geom_density(na.rm = T, bw = 'SJ') +
#scale_x_continuous(trans='log2') +
scale_color_manual(values = col_ids_pro) +
theme(legend.position = "none") +
ggtitle(title)
}
#p.hist.cou_non <- plot_sample_density_noLog(exp_cou_non, "Counts non")
p.hist.pro_log2 <- plot_sample_density_noLog.pro(exp_pro_log2, "Pro log2")
p.hist.pro_vsn <- plot_sample_density_noLog.pro(exp_pro_vsn, "Pro vsn")
#p.hist.pro_mn <- plot_sample_density_log.pro(exp_pro_mn, "Pro mn")
#p.hist.pro_qn <- plot_sample_density_noLog.pro(exp_pro_qn, "Pro qn")
fig_hist_pro_norm_pre <- ggarrange(
p.hist.pro_log2,
p.hist.pro_vsn,
ncol = 1,
nrow = 2)
fig_hist_pro_norm_pre
fig_hist_pro_norm <- annotate_figure(fig_hist_pro_norm_pre,
top = text_grob("Normalisation effects on proteome", color = "black", face = "bold", size = 18))
fig_hist_pro_norm
pepExp_4box_pro <- function(exp, san){
exp_long <- exp %>% pivot_longer(!fid, names_to = 'id', values_to = "value")
exp_long$stage <- san$population_general[match(exp_long$id, san$id)]
exp_long$stage <- factor(exp_long$stage , levels = c('MB', 'PM', 'MC', 'MM', 'B', 'S', "PMN"))
return(exp_long)
}
exp_pro_log2_long <- pepExp_4box_pro(exp_pro_log2, san_pro_temp_noOut)
exp_pro_vsn_long <- pepExp_4box_pro(exp_pro_vsn, san_pro_temp_noOut)
#exp_pro_mn_long <- pepExp_4box_pro(exp_pro_mn, san_pro_temp)
#exp_pro_qn_long <- pepExp_4box_pro(exp_pro_qn, san_pro_temp)
plot_box_4normCheck <- function(exp_long, title){
exp_long %>%
arrange(stage) %>%
ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
geom_boxplot() +
#theme(axis.text.x=element_text(angle=45, hjust=1)) +
#coord_flip() +
ggtitle(title) +
xlab('Sample')+
ylab("Expression")+
theme(axis.text.x=element_text(angle=45, hjust=1),
axis.text=element_text(size=7)) +
scale_fill_manual(values = colors_stage)
}
plot_box_4normCheck_logY <- function(exp_long, title){
exp_long %>%
arrange(stage) %>%
ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
geom_boxplot() +
#theme(axis.text.x=element_text(angle=45, hjust=1)) +
#coord_flip() +
ggtitle(title) +
xlab('Sample')+
ylab("Expression")+
theme(axis.text.x=element_text(angle=45, hjust=1),
axis.text=element_text(size=7)) +
scale_fill_manual(values = colors_stage) +
scale_y_continuous(trans='log2')
}
p.box.pro_log2 <- plot_box_4normCheck(exp_pro_log2_long, "pro log2")
p.box.pro_vsn <- plot_box_4normCheck(exp_pro_vsn_long, "pro vsn")
#p.box.pro_mn <- plot_box_4normCheck(exp_pro_mn_long, "pro mn")
#p.box.pro_qn <- plot_box_4normCheck(exp_pro_qn_long, "pro qn")
# COUNTS
fig_boxes_norm_pro_pre <- ggarrange(
p.box.pro_log2,
p.box.pro_vsn,
legend = 'none')
fig_boxes_norm_pro_pre
fig_boxes_norm_pro <- annotate_figure(fig_boxes_norm_pro_pre,
top = text_grob("Boxplots of normalized pro values", color = "black", face = "bold", size = 20))
fig_boxes_norm_pro
# identify techReps
san_pro_techResp <- san_pro_temp_noOut %>% filter(has_tech_rep == T)
san_pro_techResp <- droplevels(san_pro_techResp)
### groups of interest"
get_Rep_ids_pro <- function(DONORoi, STAGEoi){
san_pro_techResp %>%
dplyr::filter(DONOR == DONORoi) %>%
dplyr::filter(population_general == STAGEoi) %>%
pull(id)
}
donors <- unique(san_pro_techResp$DONOR)
stages <- unique(san_pro_techResp$population_general)
ids <- list()
for(i in donors){
for(j in stages){
id <- paste(i,j,sep = '_')
ids[[id]] <- get_Rep_ids_pro(i,j)
}
}
rep_ids_pro <- purrr::compact(ids)
# plots
plot_corr_TechReps <- function(samples, data){
data %>% dplyr::select(all_of(samples)) %>%
rename(Rep1 = 1, Rep2 = 2) %>%
ggplot(aes(x = Rep1, y = Rep2)) +
geom_point() +
stat_cor(method = "spearman", size = 3, na.rm = T)
}
# somehow the first list onlu has 1 entry, Remove
rep_ids_pro <- rep_ids_pro[-1] # because rep was removed as technical outlier
# the correlation is normalisation independend!
pl.corr_pro <- purrr::map(rep_ids_pro, plot_corr_TechReps, exp_pro_vsn)
# Plots
fig_RepCor_pro_pre <- ggarrange(
pl.corr_pro$C_S + ggtitle("Donor C, stage S"),
pl.corr_pro$D_MC + ggtitle("Donor D, stage MC"),
pl.corr_pro$D_MM + ggtitle("Donor D, stage MM"),
pl.corr_pro$D_PMN + ggtitle("Donor D, stage PMN"),
pl.corr_pro$E_MC + ggtitle("Donor E, stage MC"),
pl.corr_pro$E_MM + ggtitle("Donor E, stage MM"),
pl.corr_pro$E_PMN + ggtitle("Donor E, stage PMN"),
pl.corr_pro$A_MB + ggtitle("Donor A, stage MB"),
pl.corr_pro$A_MM + ggtitle("Donor A, stage MM"),
ncol = 5,
nrow = 2,
legend = 'none')
fig_RepCor_pro <- annotate_figure(fig_RepCor_pro_pre,
top = text_grob("Correlation of technical repeat samples pro vsn", color = "black", face = "bold", size = 20))
fig_RepCor_pro
# samples with multi reps
rep_ids_pro
triplets_pro <- list(
D_eMC_P1.P1rep = c('P1_eMC_D', 'P1_eMC_D_repeat'),
D_eMC_P1.P2 = c('P1_eMC_D', 'P2_eMC_D'),
D_eMC_P2.P1rep = c('P2_eMC_D', 'P1_eMC_D_repeat'),
A_MM_G22.G19 = c('G22_A', 'G19_A'),
A_MM_G22.G20 = c('G22_A', 'G20_A'),
A_MM_G22.G21 = c('G22_A', 'G21_A'))
pl.corr_pro_triplet <- purrr::map(triplets_pro, plot_corr_TechReps, exp_pro_vsn)
fig_RepCor_pro_trip_pre <- ggarrange(
pl.corr_pro_triplet$D_eMC_P1.P1rep + ggtitle("Donor D, eMC, P1 P1rep"),
pl.corr_pro_triplet$D_eMC_P1.P2 + ggtitle("Donor D, eMC, P1 P2"),
pl.corr_pro_triplet$D_eMC_P2.P1rep + ggtitle("Donor D, eMC, P1 P2"),
pl.corr_pro_triplet$A_MM_G22.G19 + ggtitle("Donor A, MM, G22 G19"),
pl.corr_pro_triplet$A_MM_G22.G20 + ggtitle("Donor A, MM, G22 G20"),
pl.corr_pro_triplet$A_MM_G22.G21 + ggtitle("Donor A, MM, G22 G21"),
legend = 'none')
fig_RepCor_pro_trip_pre
# combine all corr
fig_CORR_pro_pre <- ggarrange(
pl.corr_pro$C_S + ggtitle("Donor C, stage S"),
pl.corr_pro$D_MC + ggtitle("Donor D, stage MC"),
pl.corr_pro$D_MM + ggtitle("Donor D, stage MM"),
pl.corr_pro$D_PMN + ggtitle("Donor D, stage PMN"),
pl.corr_pro$E_MC + ggtitle("Donor E, stage MC"),
pl.corr_pro$E_MM + ggtitle("Donor E, stage MM"),
pl.corr_pro$E_PMN + ggtitle("Donor E, stage PMN"),
pl.corr_pro$A_MB + ggtitle("Donor A, stage MB"),
pl.corr_pro$A_MM + ggtitle("Donor A, stage MM"),
pl.corr_pro_triplet$D_eMC_P1.P1rep + ggtitle("Donor D, eMC, P1 P1rep"),
pl.corr_pro_triplet$D_eMC_P1.P2 + ggtitle("Donor D, eMC, P1 P2"),
pl.corr_pro_triplet$D_eMC_P2.P1rep + ggtitle("Donor D, eMC, P1 P2"),
pl.corr_pro_triplet$A_MM_G22.G19 + ggtitle("Donor A, MM, G22 G19"),
pl.corr_pro_triplet$A_MM_G22.G20 + ggtitle("Donor A, MM, G22 G20"),
pl.corr_pro_triplet$A_MM_G22.G21 + ggtitle("Donor A, MM, G22 G21"),
ncol = 5,
nrow = 3,
legend = 'none')
fig_CORR_pro <- annotate_figure(fig_CORR_pro_pre,
top = text_grob("Correlation of technical repeat samples pro vsn", color = "black", face = "bold", size = 20))
fig_CORR_pro
# extract data
extract_PCs <- function(res.pca){
dat.pc12 <- res.pca$x %>% as.data.frame() %>%
rownames_to_column('id') %>%
dplyr::select(id, PC1, PC2, PC3, PC4)
}
exp = exp_pro_vsn
san = san_pro_temp_noOut
prep4_pca_pro <- function(exp, san) {
exp <- exp[complete.cases(exp),]
fid <- exp$fid
exp$fid <- NULL
sid <- colnames(exp)
expT <- t(exp)
expT <- matrix(as.numeric(expT), # Convert to numeric matrix
ncol = ncol(expT))
expT <- scale(expT, center = T, scale = T)
colnames(expT) <- fid
rownames(expT) <- sid
res.pca <- prcomp(expT)
dat.pca <- extract_PCs(res.pca)
dat.pca$stage <- san$population_general[match(dat.pca$id, san$id)]
return(dat.pca)
}
dat.pca.pro_log2 <- prep4_pca_pro(exp_pro_log2, san_pro_temp_noOut)
dat.pca.pro_vsn <- prep4_pca_pro(exp_pro_vsn, san_pro_temp_noOut)
#dat.pca.pro_mn <- prep4_pca_pro(exp_pro_qn, san_pro_temp_noOut)
#dat.pca.pro_qn <- prep4_pca_pro(exp_pro_mn, san_pro_temp_noOut)
# plot_myPCA <- function(dat.pca){
# dat.pca %>%
# ggplot(aes(x= PC1, y = PC2, color = stage))+
# geom_point(size = 3)
# }
plot_PC1.2_labeled <- function(dat.pca){
dat.pca %>%
ggplot(aes(x= PC1, y = PC2, color = stage, label = id))+
geom_point(size = 3)+
geom_text_repel(size =2) +
scale_color_manual(values = colors_stage)
}
plot_PC3.4_labeled <- function(dat.pca){
dat.pca %>%
ggplot(aes(x= PC3, y = PC4, color = stage, label = id))+
geom_point(size = 3)+
geom_text_repel(size =2) +
scale_color_manual(values = colors_stage)
}
plot_PC1.2_labeled(dat.pca.pro_vsn) +
ggtitle("PCA 1/2 of pro, vsn")
plot_PC3.4_labeled(dat.pca.pro_vsn) +
ggtitle("PCA 3/4 of pro, vsn")
# identified PMN.2 (= P2_PMN_D) as another outliers via PLS-DA
# gnp_meta$san_pro #
toRemove_id_pro_out_PCA_clear <- c('P1_PMN_E', 'P2_PMN_E', 'P2_PMN_D', 'P1_PMN_D', 'P1_S_A', 'P1_S_C', 'P2_S_C', 'P1_MB_D')
Dont bother. Even though it looks very similar, validity of 1 peptide prots is questionable
# exp_pro_vsn_2pep <- exp_pro_vsn[! rownames(exp_pro_vsn) %in% fidU_single_pep,]
# #exp_pro_vsn_1pep <- exp_pro_vsn[rownames(exp_pro_vsn) %in% fidU_single_pep,]
#
# dat.pca.pro.vsn_2pep <- prep4_pca_pro(exp_pro_vsn_2pep, san_pro_temp_noOut)
# dat.pca.pro.vsn_1pep <- prep4_pca_pro(exp_pro_vsn_1pep, san_pro_temp_noOut)
#
#
# plot_PC1.2_labeled(dat.pca.pro.vsn_2pep) +
# ggtitle("PCA 1/2 of pro, vsn, min2pep prots")
#
# plot_PC1.2_labeled(dat.pca.pro.vsn_1pep) +
# ggtitle("PCA 1/2 of pro, vsn, 1pep prots")
# the 1 peptide proteins deliver the same PCA results as the <-2 peptide proteins. Ok to keep!
WHOLE SECTION DISCARDED
# # check labels
# all_unique(san_pro_temp_noOut$label_general)
# san_pro_temp_noOut$label_general <- make.unique(san_pro_temp_noOut$label_general)
#
# # fct to prep
# prep_4clust_pro <- function(exp, san){
# exp_4clust <- exp
# exp_4clust <- exp_4clust[complete.cases(exp_4clust),]
# # UNIPROT to col
# exp_4clust <- transpose_UNIPROTcols(exp_4clust)
# # lab_gen as rownames
# rownames(exp_4clust) <-
# san$label_general[match(rownames(exp_4clust),c(san$id))]
# # scale
# exp_4clust <- as.data.frame(scale(exp_4clust))
# # return
# return(exp_4clust)
# }
#
# colnames(exp_pro_vsn)
# #
# exp_pro_vsn_4clust <- prep_4clust_pro(exp_pro_vsn, san_pro_temp_noOut)
#
# dist_pro_vsn <- dist(exp_pro_vsn_4clust, method = "euclidean")
# plot_DistMat_vsn <- fviz_dist(dist_pro_vsn,
# lab_size = 8,
# gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))+ ggtitle("Dissimilarity matrix pro vsn")
# plot_DistMat_vsn
#
#
# # excluded samples
# # P1_eMC_C = MC_c_1.1
#
# san_pro_temp_noOut
# toRemove_ids_pro_Outlier <- c('P1_eMC_C')
#
# # plots
# plot_corr_pro <- function(samples, data){
# data %>% dplyr::select(all_of(samples)) %>%
# rename(Rep1 = 1, Rep2 = 2) %>%
# ggplot(aes(x = Rep1, y = Rep2)) +
# geom_point() +
# stat_cor(method = "spearman", size = 3, na.rm = T)
# }
#
# #
# corr4_samples_pro <- list(
# D_eMC_P1.P1rep = c('P1_eMC_D', 'P1_eMC_D_repeat'),
# D_eMC_P1.P2 = c('P1_eMC_D', 'P2_eMC_D'),
# D_eMC_P2.P1rep = c('P2_eMC_D', 'P1_eMC_D_repeat'),
# A_MM_G22.G19 = c('G22_A', 'G19_A'),
# A_MM_G22.G20 = c('G22_A', 'G20_A'),
# A_MM_G22.G21 = c('G22_A', 'G21_A'))
#
#
# # the correlation is normalisation independend!
# pl.corr_pro <- purrr::map(rep_ids_pro, plot_corr_TechReps, exp_pro_vsn)
toRemPro <- c(toRemove_id_pro_out_PCA_clear, toRemove_pro_lowQ)
length(toRemPro) # 9 in total
# san
san_pro_clean <- san_pro_temp_noOut %>% filter(! id %in% toRemPro)
nrow(san_pro_clean) # 36 samples
## exp
exp_pro_log2_clean <- exp_pro_log2[! colnames(exp_pro_log2) %in% toRemPro]
ncol(exp_pro_log2_clean) # 36+fid
exp_pro_vsn_clean <- exp_pro_vsn[! colnames(exp_pro_vsn) %in% toRemPro]
ncol(exp_pro_vsn_clean) # 36+fid
# form medians via function
combine_samples_pro <- function(exp){
exp_c <- exp
## A
# MB
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(MB_a1 = median(c(G4_A, G5_A), na.rm = T)) %>%
select(- c(G4_A, G5_A))
# MM
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(MM_a1 = median(c(G22_A, G19_A, G20_A, G21_A), na.rm = T)) %>%
select(- c(G22_A, G19_A, G20_A, G21_A))
## D
# MC
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(eMC_d = median(c(P1_eMC_D, P1_eMC_D_repeat, P2_eMC_D), na.rm = T)) %>%
select(- c(P1_eMC_D, P1_eMC_D_repeat, P2_eMC_D))
# MM
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(MM_d = median(c(P1_MM_D, P2_MM_D), na.rm = T)) %>%
select(- c(P1_MM_D, P2_MM_D))
## E
#MC
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(eMC_e = median(c(P1_eMC_E, P2_eMC_E), na.rm = T)) %>%
select(- c(P1_eMC_E, P2_eMC_E))
#MM
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(MM_e = median(c(P1_MM_E, P2_MM_E), na.rm = T)) %>%
select(- c(P1_MM_E, P2_MM_E))
return(exp_c)
}
rename_cols_pro <- function(exp){
exp_c <- exp
# A
exp_c %<>% rename(MB_a2 = P1_MB_A)
exp_c %<>% rename(eMC_a = P1_eMC_A)
exp_c %<>% rename(lMC_a = P1_lMC_A)
exp_c %<>% rename(MC_a = G1_A)
exp_c %<>% rename(MM_a2 = P1_MM_A)
exp_c %<>% rename(PM_a = G12_A)
exp_c %<>% rename(PMN_a = P1_PMN_A)
exp_c %<>% rename(B_a2 = P1_B_A)
exp_c %<>% rename(B_a1 = G15_A)
# C
exp_c %<>% rename(MB_c = P1_MB_C)
exp_c %<>% rename(eMC_c = P1_eMC_C)
exp_c %<>% rename(lMC_c = P1_lMC_C)
exp_c %<>% rename(MM_c = P1_MM_C)
exp_c %<>% rename(PM_c = P1_PM_C)
exp_c %<>% rename(eB_c = P1_eB_C)
exp_c %<>% rename(lB_c = P1_lB_C)
exp_c %<>% rename(PMN_c = P1_PMN_C)
# D
exp_c %<>% rename(PM_d = P1_PM_D)
exp_c %<>% rename(lB_d = P1_lB_D)
# E
exp_c %<>% rename(MB_e = P1_MB_E)
exp_c %<>% rename(PM_e = P1_PM_E)
return(exp_c)
}
### apply to data
# log2
exp_pro_log2_clean_com <- combine_samples_pro(exp_pro_log2_clean)
exp_pro_log2_clean_com <- rename_cols_pro(exp_pro_log2_clean_com)
sort(colnames(exp_pro_log2_clean_com)) # OK!
exp_pro_vsn_clean_com <- combine_samples_pro(exp_pro_vsn_clean)
exp_pro_vsn_clean_com <- rename_cols_pro(exp_pro_vsn_clean_com)
sort(colnames(exp_pro_vsn_clean_com)) # OK!
# make same order
order_colnames_pro <- sort(colnames(exp_pro_vsn_clean_com))
order_colnames_pro <- moveMe(order_colnames_pro, "fid first")
exp_pro_vsn_clean_com <- exp_pro_vsn_clean_com[order_colnames_pro]
exp_pro_log2_clean_com <- exp_pro_log2_clean_com[order_colnames_pro]
identical(colnames(exp_pro_vsn_clean_com), colnames(exp_pro_log2_clean_com)) # OK!
identical(rownames(exp_pro_vsn_clean_com), rownames(exp_pro_log2_clean_com)) # OK!
# copy to gSheets
# write_clip(san_pro_clean)
san_pro_pre <- read_csv("/home/isar/Dropbox/scn_moa/data/gnp/pro/san_pro_final - Sheet1.csv")
san_pro_pre %<>% filter(! is.na(label_final))
setdiff(colnames(exp_pro_vsn_clean_com), san_pro_pre$label_final) # OK
setdiff(san_pro_pre$label_final, colnames(exp_pro_vsn_clean_com)) # OK
# Finalize
san_pro_fin <- san_pro_pre %>% select(
id = label_final,
stage = population_general,
stage_nr = population_hirachical_number,
substage = population_specific,
donor = donor,
sex = sex,
age = age
)
san_pro_fin <- san_pro_fin[order(match(san_pro_fin$id, colnames(exp_pro_vsn_clean_com)[-1])),]
identical(san_pro_fin$id, colnames(exp_pro_vsn_clean_com)[-1])
san_pro_fin$idI <- make_unique(san_pro_fin$stage, sep = "_", wrap_in_brackets = FALSE)
identical(colnames(exp_pro_vsn_clean_com), colnames(exp_pro_log2_clean_com))
# No NA in the data!!!!!!! (imputed via MaxQuant in raw data production)
This is based on the fan master created previously (matching mrn, pro and scn)
# manually curated IDs to harmonize mRNA and proteins
fan_complete_pre <- read_csv(paste0(dir_moa, "/data/anno/", "fan_complete.csv") )
# check UNIPROT not included in fan
setdiff(exp_pro_vsn_clean_com$fid, fan_complete_pre$UNIPROT) # 23 missing in fan
# overwrite IDs that do not match between pro and scn
rename_Features_pro <- function(exp){
exp_c <- exp
exp_c$fid %<>% recode('P62158' = 'P0DP23') # old ID of CALM1
exp_c$fid %<>% recode('Q5SNT6' = 'Q641Q2') # to match SCN
exp_c$fid %<>% recode('O15320' = 'Q96PC5') # old ID of MIA2
exp_c$fid %<>% recode('P0CW22' = 'P08708') # old ID of RPS17
exp_c$fid %<>% recode('P0DMV9' = 'P0DMV8') # old ID of HSPA1A
#exp_c$fid %<>% recode('P10316' = 'P04439') # old ID of HLA-A. This creates twice. Delete!
exp_c$fid %<>% recode('P0CG05' = 'P0DOY2') # old ID of HLA-
return(exp_c)
}
exp_pro_lg2_preF1 <- rename_Features_pro(exp_pro_log2_clean_com)
exp_pro_vsn_preF1 <- rename_Features_pro(exp_pro_vsn_clean_com)
# delete double features
delete_features_pro_d2SCN <- c('O00241', 'P42167', 'P10316') # only P42167 (=TMPO) in exp pro
exp_pro_lg2_preF2 <- exp_pro_lg2_preF1 %>% filter(! fid %in% delete_features_pro_d2SCN)
exp_pro_vsn_preF2 <- exp_pro_vsn_preF1 %>% filter(! fid %in% delete_features_pro_d2SCN)
UNIpro_NotInFan <- data.frame(UNI = setdiff(exp_pro_lg2_preF2$fid, fan_complete_pre$UNIPROT))
UNIpro_NotInFan$SYMBOL <- fan_pro_temp$`Gene names`[match(UNIpro_NotInFan$UNI, fan_pro_temp$UNIPROT)]
nrow(UNIpro_NotInFan) # 16 left. None have SYMBOLS. Check manually
# write_clip(UNIpro_NotInFan$UNI)
# manual inspection shows all uncovered UNIPROTS to be double entries! Remove.
exp_pro_lg2_preF3 <- exp_pro_lg2_preF2 %>% filter(! fid %in% UNIpro_NotInFan$UNI)
exp_pro_vsn_preF3 <- exp_pro_vsn_preF2 %>% filter(! fid %in% UNIpro_NotInFan$UNI)
setdiff(exp_pro_lg2_preF3$fid, fan_complete_pre$UNIPROT) # all in fan!
setdiff(exp_pro_vsn_preF3$fid, fan_complete_pre$UNIPROT) # all in fan!
exp_pro_lg2_preF3$fid[duplicated(exp_pro_lg2_preF3$fid)] # no duplicates
Using proteomic ruler for copy number estimation (http://www.mcponline.org/content/early/2014/09/17/mcp.M113.037309.full.pdf+html)
Assuming 250g/L typic cell protein concentration (http://www.coxdocs.org/doku.php?id=perseus:user:plugins:proteomicruler)
Neutrophil volume 320um3 (bionumbers)
Total protein content in 1e6 cells ~~ 80ug which translates into 8e-11g (80pg) per cell
## exp: using the
exp_pro_lg2_4MN <- exp_pro_lg2_preF3
exp_pro_vsn_4MN <- exp_pro_vsn_preF3
# san
san_pro <- san_pro_fin
## fan
fan_complete_pre <- read_csv(paste0(dir_moa, "/data/anno/", "fan_complete.csv") )
setdiff(exp_pro_lg2_4MN$fid, fan_complete_pre$UNIPROT) # OK!
fan_4MN <- fan_complete_pre
## get masses of proteins
load(file = paste0(dir_moa, "/data/anno/", "UNI_masses.RData"))
#UNI_masses <- read_excel("Downloads/uniprotkb_reviewed_true_AND_model_organ_2023_09_19.xlsx", col_types = c("text", "text", "text", "text", "numeric", "numeric"))
#save(UNI_masses, file = paste0(dir_moa, "/data/anno/", "UNI_masses.RData"))
# check
length(intersect(exp_pro_lg2_4MN$fid, UNI_masses$Entry)) # 3155. 1 missing
length(intersect(exp_pro_vsn_4MN$fid, UNI_masses$Entry)) # 3155. 1 missing
# same for both sets!
UNI_missing_lg2 <- setdiff(exp_pro_lg2_4MN$fid, UNI_masses$Entry)
UNI_missing_vsn <- setdiff(exp_pro_vsn_4MN$fid, UNI_masses$Entry)
# its a old UNIPROT entry in my data! Exchnage for current entry
exp_pro_lg2_4MN$fid <- recode(exp_pro_lg2_4MN$fid,
P30042 = 'P0DPI2') # checked. ok
exp_pro_vsn_4MN$fid <- recode(exp_pro_vsn_4MN$fid,
P30042 = 'P0DPI2') # checked. ok
# recheck
setdiff(exp_pro_lg2_4MN$fid, UNI_masses$Entry) # ok!
setdiff(exp_pro_vsn_4MN$fid, UNI_masses$Entry) # ok!
exp_pro = exp_pro_vsn_4MN
# turn pro to medians
make_long_pro <- function(exp_pro){
# make long
expL_pro <- exp_pro
expL_pro %<>% pivot_longer(!fid, names_to = "id", values_to = "value")
expL_pro$stage <- san_pro$stage[match(expL_pro$id, san_pro$id)]
expL_pro$stage <- factor(expL_pro$stage, levels = levels_pro)
return(expL_pro)
}
exp_pro_lg2_4MN_long <- make_long_pro(exp_pro_lg2_4MN)
exp_pro_vsn_4MN_long <- make_long_pro(exp_pro_vsn_4MN)
medians_pro_lg2 <- exp_pro_lg2_4MN_long %>%
dplyr::select(fid, stage, value) %>%
group_by(fid, stage) %>%
dplyr::summarise(median_pro = median(value, na.rm = T))
# glimpse(medians_pro_lg2) # ok
medians_pro_vsn <- exp_pro_vsn_4MN_long %>%
dplyr::select(fid, stage, value) %>%
group_by(fid, stage) %>%
dplyr::summarise(median_pro = median(value, na.rm = T))
# glimpse(medians_pro_vsn) # ok
### proteome ruler
san_pro_temp$cell_number_as_million # this is qute diverese. Not sure how to solve this...
# add mass, rev log2 and add SYMBOL
medians_pro_lg2$mass <- UNI_masses$Mass[match(medians_pro_lg2$fid, UNI_masses$Entry,)]
medians_pro_lg2$median_quant <- 2**medians_pro_lg2$median_pro
medians_pro_lg2$SYMBOL <- fan_4MN$SYMBOL[match(medians_pro_lg2$fid, fan_4MN$UNIPROT)]
medians_pro_vsn$mass <- UNI_masses$Mass[match(medians_pro_vsn$fid, UNI_masses$Entry,)]
medians_pro_vsn$median_quant <- 2**medians_pro_vsn$median_pro
medians_pro_vsn$SYMBOL <- fan_4MN$SYMBOL[match(medians_pro_vsn$fid, fan_4MN$UNIPROT)]
# define constants
avogadro = 6.022e23
protein_mass_per_cell = 8e-11
total_ms_signal_lg2 = sum(medians_pro_lg2$median_quant) # 6e12
total_ms_signal_vsn = sum(medians_pro_vsn$median_quant) # 7e12
# not sure of this difference is relevant
# calculate & round to integer
medians_pro_lg2 %<>% mutate(
molNr = median_quant / total_ms_signal_lg2*avogadro/ mass *protein_mass_per_cell)
medians_pro_lg2$molNr <- round(medians_pro_lg2$molNr, 0)
medians_pro_vsn %<>% mutate(
molNr = median_quant / total_ms_signal_lg2*avogadro/ mass *protein_mass_per_cell)
medians_pro_vsn$molNr <- round(medians_pro_vsn$molNr, 0)
# compare for some proteins
molNR_4comps_lg2 <- medians_pro_lg2 %>% filter(SYMBOL %in% c('S100A8', 'S100A12', 'ELANE', 'AZU1'))
molNR_4comps_lg2$type <- "lg2"
molNR_4comps_vsn <- medians_pro_vsn %>% filter(SYMBOL%in% c('S100A8', 'S100A12', 'ELANE', 'AZU1'))
molNR_4comps_vsn$type <- "vsn"
molNR_4comps <- bind_rows(molNR_4comps_lg2, molNR_4comps_vsn)
molNR_4comps %>%
ggplot(aes(x = stage, y = molNr, fill = type)) +
geom_bar(stat = "identity", position = "dodge") +
#scale_y_continuous(trans='log2') +
facet_grid(SYMBOL ~., scales= "free_y")
fan_pro_temp %>% filter()
# exp_pro_lg2_preF3 %>% filter(fid == 'P80511') %>% view() # S100A1: C is a problem!
# exp_pro_lg2_preF3 %>% filter(fid == 'P20160') %>% view() # AZU1
# check old data, Jiadong mentioned that he does not see this fall in s100A12?
#load(file = paste0(dir_moa, "/data/mixOmics/", "moxL_data.RData"))
#exp_moxL_pro %>% select(S100A12) #its also there. all good
# would it make a difference to use median cell numbers?
## save
medians_pro_vsn
medians_pro_lg2
molNr_pro <- medians_pro_vsn %>% select(UNIPROT = fid, SYMBOL, stage, molNr_vsn = molNr)
identical(medians_pro_lg2$fid, medians_pro_vsn$fid)
identical(medians_pro_lg2$stage, medians_pro_vsn$stage)
molNr_pro$molNr_lg2 <- medians_pro_lg2$molNr
# one uni has to be exchanged from exp and fan!
# P30042 = 'P0DPI2'
exp_pro_lg2_preF3$fid <- recode(exp_pro_lg2_preF3$fid,
P30042 = 'P0DPI2')
exp_pro_vsn_preF3$fid <- recode(exp_pro_vsn_preF3$fid,
P30042 = 'P0DPI2')
molNr_pro %>% filter(UNIPROT == 'P0DPI2') # already changed in molNr!
fan_pro_temp$UNIPROT <- recode(fan_pro_temp$UNIPROT,
P30042 = 'P0DPI2')
fan_pro_sequencing_details <- fan_pro_temp
exp_pro_lg2_fin <- transpose_FIDcols(exp_pro_lg2_preF3)
exp_pro_vsn_fin <- transpose_FIDcols(exp_pro_vsn_preF3)
# savety check
setdiff(san_pro_fin$id, rownames(exp_pro_lg2_fin)) # all IDs match
setdiff(rownames(exp_pro_lg2_fin), san_pro_fin$id) # all IDs match
identical(rownames(exp_pro_lg2_fin), san_pro_fin$id) # OK!
colnames(exp_pro_lg2_fin)[duplicated(colnames(exp_pro_lg2_fin))] # no duplic
load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))
fan_fin <- gnp.mrn$fan
# fan_fin$UNIPROT <- recode(fan_fin$UNIPROT, P30042 = 'P0DPI2') # already done in fan of mRNA. OK!
setdiff(colnames(exp_pro_lg2_fin), fan_fin$UNIPROT) # ok!
gnp.pro <- list(
## final data
exp_pro_lg2 = exp_pro_lg2_fin,
exp_pro_vsn = exp_pro_vsn_fin,
exp_pro_molNr = molNr_pro,
# meta data
san_pro = san_pro_fin,
levels_pro = levels_pro,
fan = fan_fin, # SAME FOR mRNA, pro and scn!
seq_inf_pro = fan_pro_sequencing_details,
## raw data
exp_pro_raw = exp_pro_raw_temp,
san_pro_raw = san_pro_temp)
save(
gnp.pro,
file = paste0(dir_moa, "/data/final/", "gnp.pro.RData"))