Last update: 03-04-24

Goal of this analysis is to find a suitable absolute quantification method for our granulopoiesis proteome data in the form of molecules per cell. As the cellular proteome has to undergo dramatic shifts during granulopoiesis (eg loss of mitochondria and ribosomes, gain of granule proteins) we want to visulaize this on a cell compartment level. Before applying the histone/proteome ruler method, we test if the assumption of stably expressed histones is true in our cell. The histone ruler method assumes a fixed correlation between DNA and histones that allows absolute quantification of proteins from proteome intensities.

# version: 22.03.24
knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE) 

# dirs
## Sebastian @Linux
dir_moa <- "/home/isar/Dropbox/scn_moa/"
## Sebastian @MAC
#dir_moa <- "/Users/home_sh/Library/CloudStorage/Dropbox/scn_moa"


# data, revised 2024.01.09
# exp
load(file = paste0(dir_moa, "/data/final/", "gnp.pro.RData")) # check fid name
load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData")) # rename fid = ENSG!
load(file = paste0(dir_moa, "/data/final/", "gnp.mic.RData")) # rename fid = UNIPROT!

# results
load(file = paste0(dir_moa, "/data/final/", "dex.all.RData")) # remove SCN at some point
load(file = paste0(dir_moa, "/data/final/", "timeomics.singles.RData"))
load(file = paste0(dir_moa, "/data/final/", "res.wgcna.RData"))
load(file = paste0(dir_moa, "/data/final/", "anno.RData"))


## packages
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))
# load packagea
sapply(packages_general, require, character.only = TRUE)
sapply(packages_enrichment, require, character.only = TRUE)

library("viridis")


## graphics

# size
global_size = 21

thema_nix = theme(axis.title.x=element_blank(),
                  axis.title.y=element_blank())
thema_noY = theme(axis.title.y=element_blank())
thema_noX = theme(axis.title.x=element_blank())

# shapes (mrn & pro)
shps <- c(1, 16)

base data

medians

use the non normalized values

# fan
fan <- gnp.mrn$fan

## Proteome

# single values in long format
exp_proL <- as.data.frame(t(gnp.pro$exp_pro_lg2)) # previously: gnp.pro$exp_pro_vsn(but normalized values dont make sense here)
exp_proL$UNIPROT <- rownames(exp_proL)
exp_proL <- exp_proL %>% pivot_longer(!UNIPROT, names_to = "id", values_to = "value")
exp_proL$stage <- gnp.pro$san_pro$stage[match(exp_proL$id, gnp.pro$san_pro$id)]
exp_proL$SYMBOL <- fan$SYMBOL[match(exp_proL$UNIPROT, fan$UNIPROT)]
exp_proL$ENSG <- fan$ENSG[match(exp_proL$UNIPROT, fan$UNIPROT)]
exp_proL$stage <- factor(exp_proL$stage, levels = c('MB', 'PM', 'MC', 'MM', 'B', 'PMN'))

plotL.SYMBOL(exp_proL, 'ELANE') # why is ELANE so high in PM?

# is this really in the data or did I mess up someting?

# check the raw data
exp_pro.raw <- gnp.pro$exp_pro_raw
expL_pro.raw <- exp_pro.raw %>% pivot_longer(!UNIPROT)
expL_pro.raw <- add.fan2UNIPROT(expL_pro.raw)
expL_pro.raw$stage <- gnp.pro$san_pro_raw$population_general[match(expL_pro.raw$name, gnp.pro$san_pro_raw$id)]
expL_pro.raw$stage <- factor(expL_pro.raw$stage, levels = c('MB', 'PM', 'MC', 'MM', 'B', "S", 'PMN'))

# means
exp_proLc <- exp_proL %>%
  select(ENSG, stage, value) %>%
  group_by(ENSG, stage) %>%
  dplyr::summarise(mean= mean(value), median= median(value))
exp_proLc <- add.fan2ENSG(exp_proLc)
exp_proLc$stage <- factor(exp_proLc$stage, levels = c('MB', 'PM', 'MC', 'MM', 'B', 'PMN'))

# wide format for direction calculations
exp_proWc <- exp_proLc %>% select(SYMBOL, stage, median) %>%
  pivot_wider(names_from = stage, values_from = median)


## Transcriptome

# single values
exp_mrnL <- as.data.frame(t(gnp.mrn$exp_mrn_tpm_non))
exp_mrnL$ENSG <- rownames(exp_mrnL)
exp_mrnL <- exp_mrnL %>% pivot_longer(!ENSG, names_to = "id", values_to = "value")
exp_mrnL$stage <- gnp.mrn$san_mrn$stage[match(exp_mrnL$id, gnp.mrn$san_mrn$id)]
exp_mrnL$stage <- factor(exp_mrnL$stage, levels = c('MB', 'PM', 'MC', 'MM', 'B', 'S', 'PMN'))
exp_mrnL$SYMBOL <- fan$SYMBOL[match(exp_mrnL$ENSG, fan$ENSG)]

# plotL.SYMBOL(exp_mrnL, 'ELANE') # seems that ELANE is actually quite PM specific

# means
exp_mrnLc <- exp_mrnL %>%
  select(ENSG, stage, value) %>%
  group_by(ENSG, stage) %>%
  dplyr::summarise(median= median(value, na.rm = T))
exp_mrnLc <- add.fan2ENSG(exp_mrnLc)
exp_mrnLc$stage <- factor(exp_mrnLc$stage, levels = c('MB', 'PM', 'MC', 'MM', 'B', 'S', 'PMN'))

# wide format for direction calculations
exp_mrnWc <- exp_mrnLc %>% select(SYMBOL, stage, median) %>%
  pivot_wider(names_from = stage, values_from = median)


# example for directions

#exp_micWc$direction.MBtoMC <- ifelse(exp_micWc$MB > exp_micWc$MC, 'down', 'up')
#exp_micWc$direction <- ifelse(exp_micWc$MB > exp_micWc$PMN, 'down', 'up')
# transfer
#exp_micLc$direction.MBtoMC <- exp_micWc$direction.MBtoMC[match(exp_micLc$SYMBOL, exp_micWc$SYMBOL)]

Molecular numbers proteome

Using proteomic ruler for copy number estimation (http://www.mcponline.org/content/early/2014/09/17/mcp.M113.037309.full.pdf+html)

Assuming 250g/L typic cell protein concentration (http://www.coxdocs.org/doku.php?id=perseus:user:plugins:proteomicruler)

Neutrophil volume 320um3 (bionumbers)

Total protein content in 1e6 cells ~~ 80ug which translates into 8e-11g (80pg) per cell

base data

Shall we use the original values without normalisation or the vsn values? Will work with vsn values

## base data 

# san = sample annotation
san_pro <- gnp.pro$san_pro

# exp = "raw" expression values
exp_pro <- gnp.pro$exp_pro_lg2

# fan = feature annotation
fan_pro <- gnp.pro$fan

# masses of proteins from uniprot db
load(file = paste0(dir_moa, "/data/anno/", "UNI_masses.RData"))

# check
length(intersect(colnames(exp_pro), UNI_masses$Entry)) # 3156
length(colnames(exp_pro)) # 3156
setdiff(colnames(exp_pro), UNI_masses$Entry) # masses for all
# ok!


### calculate 

# use median values
exp_proLc <- add.fan2ENSG(exp_proLc)

# add mass from uniprot db
exp_proLc$mass <- UNI_masses$Mass[match(exp_proLc$UNIPROT, UNI_masses$Entry,)]
# reverse log2
exp_proLc$quant <- 2**exp_proLc$median # 

# export for JD to use proteome ruler in python
exp.pro_medianQuant <- exp_proLc %>% ungroup() %>% select(SYMBOL, quant, stage) %>%
  pivot_wider(names_from = stage, values_from = quant)

exp.pro_medianQuant <- add.fan2SYMBOL(exp.pro_medianQuant)
exp.pro_medianQuant$Mass <- UNI_masses$Mass[match(exp.pro_medianQuant$UNIPROT, UNI_masses$Entry,)]

exp.pro_medianQuant %<>% rename("Protein IDs" = UNIPROT)
exp.pro_medianQuant$SYMBOL <- NULL
exp.pro_medianQuant$ENSG <- NULL

# write_csv(exp.pro_medianQuant, paste0(dir_moa, "/JD_R_code/proteome_ruler/", "exp.pro_medianQuant", ".csv"))

### cell numbers: not sure if needed but probably relevant?
cell.numbers_pro <- gnp.pro$san_pro_raw
cell.numbers_pro$cell_number_as_million # ok

cell.numbers_pro.mean <- cell.numbers_pro %>% group_by(population_general) %>%
  dplyr::summarise(mean.cellNum = mean(cell_number_as_million, na.rm = T))

# add to median numers
exp_proLc$cellNum <- cell.numbers_pro.mean$mean.cellNum[match(exp_proLc$stage, cell.numbers_pro.mean$population_general)]

Histones

Histone protein intensity during maturation

### human histones
# source: El Kennani, S., Adrait, A., Shaytan, A. K., Khochbin, S., Bruley, C., Panchenko, A. R., Landsman, D., Pflieger, D. & Govin, J. MS_HistoneDB, a manually curated resource for proteomic analysis of human and mouse histones. Epigenetics Chromatin 10, 2 (2017).

# histoneDB <- read_excel(path = "Dropbox/scn_moa/data/anno/13072_2016_109_MOESM5_ESM.xlsx") # knitr has problem with file

histoneDB <- read_excel(path = paste0(dir_moa, "/data/anno/", "13072_2016_109_MOESM5_ESM.xlsx")) 

# copy.number = avogadro * (DNAweight2n * MSsignal(i) / MSsignal.hist * molWeight)
histoneSignal.pro <- exp_proLc %>% 
  #filter(stage == 'MB') %>% 
  group_by(stage) %>%
  filter(UNIPROT %in% histoneDB$UniprotKB) %>%
  dplyr::summarise(histInt = sum(quant, na.rm = T))

histoneSignal.pro %>%
  ggplot(aes(x= stage, y = histInt, fill = stage)) +
  geom_bar(stat = 'identity') +
  ggtitle("Histone intensity on protein level") +
  scale_fill_manual(values = colors_stage)

Total protein intensity of histones rise with maturation -> is it normal that histone intensity increases with maturation? I was expecting this to be constant?

Histone transcript intensity during maturation

# are histones also induced on transcriptome level?
histoneSignal.mrn <- exp_mrnLc %>% 
  #filter(stage == 'MB') %>% 
  group_by(stage) %>%
  filter(UNIPROT %in% histoneDB$UniprotKB) %>%
  dplyr::summarise(histInt = sum(median, na.rm = T))

histoneSignal.mrn %>%
  ggplot(aes(x= stage, y = histInt, fill = stage)) +
  geom_bar(stat = 'identity') +
  ggtitle("Histone intensity on mRNA level") +
  scale_fill_manual(values = colors_stage)

On transcript level, histone intensity appears rather stable

Differential expression of histones

Proteome level

during progression

NB: pysiological progression of granulopoiesis: MB->PM->MC->MM->B->S->PMN

# total histones in dataset
fid_pro.HISTONES <- data.frame(UNIPROT = unique(dex.all$dex.pro$dex_stage_pro_lg2$fid))
fid_pro.HISTONES %<>% filter(UNIPROT %in% histoneDB$UniprotKB) # 27

dex.histones.pro <- dex.all$dex.pro$dex_stage_pro_lg2 %>% filter(fid %in% histoneDB$UniprotKB) %>%
  #filter(comparison %in% c(comp_progression_pro_to)) %>%
  group_by(comparison) %>%
  dplyr::summarize(
    n_down = -sum(adj.P.Val < 0.05 & logFC <= -1),
    n_up = sum(adj.P.Val < 0.05 & logFC >= 1))  %>% 
  pivot_longer(-comparison, names_to = "direction", values_to = "n")

# check in progression data
dex.histones.pro.progression <- dex.histones.pro %>%
  filter(comparison %in% comp_progression_pro_to)
dex.histones.pro.progression$comparison <- factor(dex.histones.pro.progression$comparison, levels = c(comp_progression_pro_to))

dex.histones.pro.progression %>%
  ggplot(aes(x = comparison, y = n, fill = direction)) +
  geom_bar(stat="identity") +
  geom_text(aes(y = n + 1 * sign(n), label=n)) +
  ggtitle("DEX histone proteins: in maturation", subtitle = "Total histones in proteome = 27") + # 
  xlab("Development stage") +
  ylab("Number of dex proteins") +
  #ylim(-1600, 1600) +
  theme(axis.text.x=element_text(angle=30, hjust=1),
        legend.position = "none",
        axis.title.x = element_blank())+
  scale_fill_manual(values = colors_UpDown_n)

Histones are differentially upregulated (adj.p < 0.05, abs(logFc) > 1) in PM->MC

MB vs others

Check how the initial stage (MB) compares to the following maturation stages

# check in MB vs others
dex.histones.pro.MB <- dex.histones.pro %>%
  filter(comparison %in% c('MBtoPM','MBtoMC', 'MBtoMM', 'MBtoB', 'MBtoPMN'))
dex.histones.pro.MB$comparison <- factor(dex.histones.pro.MB$comparison, levels = c('MBtoPM', 'MBtoMC', 'MBtoMM', 'MBtoB', 'MBtoPMN'))

dex.histones.pro.MB %>%
  ggplot(aes(x = comparison, y = n, fill = direction)) +
  geom_bar(stat="identity") +
  geom_text(aes(y = n + 1 * sign(n), label=n)) +
  ggtitle("DEX histone proteins: MB vs mature", subtitle = "Total histones in proteome = 27") + # 
  xlab("Development stage") +
  ylab("Number of dex proteins") +
  #ylim(-1600, 1600) +
  theme(axis.text.x=element_text(angle=30, hjust=1),
        legend.position = "none",
        axis.title.x = element_blank())+
  scale_fill_manual(values = colors_UpDown_n)

In comparison to the initial stage (MB), all following maturation stages show upregulated histones on protein level

Transcriptome level

progression

## transcriptome

# total histones in dataset
fid_mrn.HISTONES <- data.frame(ENSG = unique(dex.all$dex.mrn$dex_stage_mrn_vsn$fid))
fid_mrn.HISTONES <- add.fan2ENSG(fid_mrn.HISTONES)
fid_mrn.HISTONES %<>% filter(UNIPROT %in% histoneDB$UniprotKB) # 17 # 10 less than on protein level ????

dex.histones.mrn <- dex.all$dex.mrn$dex_stage_mrn_vsn %>% filter(fid %in% fid_mrn.HISTONES$ENSG) %>%
  #filter(comparison %in% c(comp_progression_pro_to)) %>%
  group_by(comparison) %>%
  dplyr::summarize(
    n_down = -sum(adj.P.Val < 0.05 & logFC <= -1),
    n_up = sum(adj.P.Val < 0.05 & logFC >= 1))  %>% 
  pivot_longer(-comparison, names_to = "direction", values_to = "n")

# check in progression data
dex.histones.mrn.progression <- dex.histones.mrn %>%
  filter(comparison %in% comp_progression_mrn_to)
dex.histones.mrn.progression$comparison <- factor(dex.histones.mrn.progression$comparison, levels = c(comp_progression_mrn_to))

dex.histones.mrn.progression %>%
  ggplot(aes(x = comparison, y = n, fill = direction)) +
  geom_bar(stat="identity") +
  geom_text(aes(y = n + 1 * sign(n), label=n)) +
  ggtitle("DEX histone transcripts: in maturation", subtitle = "Total histones in transcriptome = 17") + # 
  xlab("Development stage") +
  ylab("Number of dex transcripts") +
  #ylim(-1600, 1600) +
  theme(axis.text.x=element_text(angle=30, hjust=1),
        legend.position = "none",
        axis.title.x = element_blank())+
  scale_fill_manual(values = colors_UpDown_n)

On transciptome level, histones are downregulated intermediatly (MC->MM) but upregulated in MM->B and B->S

MB vs others

# check in MB vs others
dex.histones.mrn.MB <- dex.histones.mrn %>%
  filter(comparison %in% c('MBtoPM','MBtoMC', 'MBtoMM', 'MBtoB', 'MBtoS', 'MBtoPMN'))
dex.histones.mrn.MB$comparison <- factor(dex.histones.mrn.MB$comparison, levels = c('MBtoPM', 'MBtoMC', 'MBtoMM', 'MBtoB', 'MBtoS', 'MBtoPMN'))

dex.histones.mrn.MB %>%
  ggplot(aes(x = comparison, y = n, fill = direction)) +
  geom_bar(stat="identity") +
  geom_text(aes(y = n + 2 * sign(n), label=n)) +
  ggtitle("DEX histone transcripts: MB vs mature", subtitle = "Total histones in proteome = 17") + # 
  xlab("Development stage") +
  ylab("Number of dex transcripts") +
  #ylim(-1600, 1600) +
  theme(axis.text.x=element_text(angle=30, hjust=1),
        legend.position = "none",
        axis.title.x = element_blank())+
  scale_fill_manual(values = colors_UpDown_n)

Compared with the initial stage (MB), histones are first downregulated in MM and later upregulated in S and PMN

Is it valid to use the histone ruler with these findings????

Rulers

constants

avogadro = 6.022e23 * 1e-12 # from gram converted to pg

DNAweight2n = 6.5 # pg. Should this number become cell number adjusted?

Histone ruler

Does the histone ruler make sense in a cellular development study where histone proteins are upregulated?

Uncover code to check calcuation. I think there might be a error becuase my numbers are off by several magnitudes! Currently using this formula: molNr = quant * (avogadro / mass) * (DNAweight2n / histInt)

# get histone intensities per stage
histInt.MB <- histoneSignal.pro %>% filter(stage == 'MB') %>% pull(histInt)
histInt.PM <- histoneSignal.pro %>% filter(stage == 'PM') %>% pull(histInt)
histInt.MC <- histoneSignal.pro %>% filter(stage == 'MC') %>% pull(histInt)
histInt.MM <- histoneSignal.pro %>% filter(stage == 'MM') %>% pull(histInt)
histInt.B <- histoneSignal.pro %>% filter(stage == 'B') %>% pull(histInt)
histInt.PMN <- histoneSignal.pro %>% filter(stage == 'PMN') %>% pull(histInt)

# protein intensities per stage 
expLc_pro.MB <- exp_proLc %>% filter(stage == 'MB')
expLc_pro.PM <- exp_proLc %>% filter(stage == 'PM')
expLc_pro.MC <- exp_proLc %>% filter(stage == 'MC')
expLc_pro.MM <- exp_proLc %>% filter(stage == 'MM')
expLc_pro.B <- exp_proLc %>% filter(stage == 'B')
expLc_pro.PMN <- exp_proLc %>% filter(stage == 'PMN')

## molecular numbers 

# fct
# use.histRuler <- function(df, histInt){
#   molNr <- df %>% mutate(molNr = avogadro * (DNAweight2n * quant / histInt * mass))
#   molNr <- ungroup(molNr)
#   molNr %<>% select(SYMBOL, molNr)
#   return(molNr)
# }

use.histRuler2 <- function(df, histInt){
  molNr <- df %>% mutate(molNr =  quant * (avogadro / mass) * (DNAweight2n / histInt))
  molNr <- ungroup(molNr)
  molNr %<>% select(SYMBOL, molNr)
  return(molNr)
}



# apply
molNrHIST.MB <- use.histRuler2(expLc_pro.MB, histInt.MB)
molNrHIST.PM <- use.histRuler2(expLc_pro.PM, histInt.PM) 
molNrHIST.MC <- use.histRuler2(expLc_pro.MC, histInt.MC)
molNrHIST.MM <- use.histRuler2(expLc_pro.MM, histInt.MM)
molNrHIST.B <- use.histRuler2(expLc_pro.B, histInt.B)
molNrHIST.PMN <- use.histRuler2(expLc_pro.PMN, histInt.PMN)

# rename (shabby shabby coding...)
names(molNrHIST.MB)[2] <- 'MB'
names(molNrHIST.PM)[2] <- 'PM'
names(molNrHIST.MC)[2] <- 'MC'
names(molNrHIST.MM)[2] <- 'MM'
names(molNrHIST.B)[2] <- 'B'
names(molNrHIST.PMN)[2] <- 'PMN'

molNrHIST <- molNrHIST.MB %>%
              left_join(molNrHIST.PM, by='SYMBOL') %>%
              left_join(molNrHIST.MC, by='SYMBOL') %>%
              left_join(molNrHIST.MM, by='SYMBOL') %>%
              left_join(molNrHIST.B, by='SYMBOL') %>%
              left_join(molNrHIST.PMN, by='SYMBOL')

molNrHIST.l <- molNrHIST %>% pivot_longer(!SYMBOL)
molNrHIST.l$name <- factor(molNrHIST.l$name, levels = c('MB', 'PM', 'MC', 'MM', 'B', 'PMN' ))
  
# check mol number of ELANE: published: 3x10^6 per cell. CAVE: value of ELANE does NOT make sense
molNrHIST.l %>% filter(SYMBOL == 'ELANE') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity')+
  labs(title = "MolNr of ELANE by histone ruler",
              subtitle = "Published = 3x10^6 per cell",
              caption = "PM spike followed up below. Seems ELANE (primary granule protein) is translated in PM and partially lost with further maturation. This fits well with the expression profile")

molNrHIST.l %>% filter(SYMBOL == 'AZU1') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity')+
  labs(title = "MolNr of AZU1 by histone ruler",
              subtitle = "Published = 40x10^6 per cell",
              caption = "Numbers smaller than in other publications. Again a strange spike in PM")

molNrHIST.l %>% filter(SYMBOL == 'S100A8') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity')+
  labs(title = "MolNr of S100A8 by histone ruler",
              subtitle = "Published = 300x10^6 per cell",
              caption = "Numbers are comparable with other publications")

The profiles look reasonable but I am not sure what to do about the changing histone levels. Maybe we need to find a group of proteins that remain stable during the whole expression course? But we would not have a weight correlation like DNA.

find stable protein group

After discussing this approach we decided not to follow it up because even if we find a stable group of proteins it would not be correlated by weight with the DNA, which is the point of the histone ruler…

## check pValues
dex.pro.lg2 <- dex.all$dex.pro$dex_stage_pro_lg2 
dex.pro.lg2 %<>% mutate(sig = ifelse(P.Value < 0.05, "sigP", "noSigP"))
sigP.pro.lg2 <- table(dex.pro.lg2$sig)

dex.pro.lg2 %>%
  ggplot(aes(x = P.Value, fill = sig)) +
  geom_histogram(bins = 80) +
  geom_vline(xintercept= 0.05, colour="black")

## find stable protein group
dex.pro.lg2 %>% 
  filter(adj.P.Val < 0.05) %>%
  filter(abs(logFC) < 0.5) # 20 proteins but in various groups

dex.pro.lg2 %>% 
  filter(comparison %in% comp_progression_pro_to) %>%
  #filter(adj.P.Val < 0.05) %>%
  filter(abs(logFC) < 0.5)   # many many 


dex = dex.pro.lg2

get.stable.fid <- function(dex, COMP){
  fid <- dex %>% filter(comparison == COMP) %>% 
    #filter(P.Value < 0.05) %>%
    filter(abs(logFC) < 0.5) %>% pull(fid)
  return(fid)
}

l.stable.fid <- list(
  MBtoPM = get.stable.fid(dex.pro.lg2, 'MBtoPM'),
  PMtoMC = get.stable.fid(dex.pro.lg2, 'PMtoMC'),
  MCtoMM = get.stable.fid(dex.pro.lg2, 'MCtoMM'),
  MMtoB = get.stable.fid(dex.pro.lg2, 'MMtoB'),
  BtoPMN = get.stable.fid(dex.pro.lg2, 'BtoPMN')
)

# draw upset
library(UpSetR)

upset(fromList(l.stable.fid), order.by = "degree")

# 130 proteins that do not change in any of the progression steps... but they would not be relatable to the weight of DNA like histones...
# damn it

unchanging.proteins <- data.frame(UNIPROT = Reduce(intersect, l.stable.fid))
unchanging.proteins <- add.fan2UNIPROT(unchanging.proteins)

# enrich (load enrichment data & fct)
# rs.GO.unchanging.prots <- enrichGO(gene = unchanging.proteins$ENSG,
#                 universe      = universe_pro.ENSG,
#                 OrgDb         = org.Hs.eg.db,
#                 ont           = "ALL",
#                 pAdjustMethod = "BH",
#                 keyType = "ENSEMBL",
#                 pvalueCutoff  = 0.05,
#                 qvalueCutoff  = 0.05,
#                 readable      = TRUE)
# 
# save(rs.GO.unchanging.prots, file = paste0(dir_moa, "/data/final/preps/enrichment/", "rs.GO.unchanging.prots.RData"))

load(file = paste0(dir_moa,  "/data/final/preps/enrichment/", "rs.GO.unchanging.prots.RData"))


barplot(rs.GO.unchanging.prots, showCategory=20)

# dotplot(rs.GO.unchanging.prots, showCategory=30)

Intensity ruler

Another way to calculate molecular numbers is using the total protein intensity. I am not 100% sure if we should use the total intensity over all stages or do this in a stage-specific way. Currently I am using the following formulas:

option 1: stage specific MS intensity molNr = (median_quant / stageIntensity) * (avogadro / mass) * protein_mass_per_cell

option 2: total intensity over all stages molNr = median_quant / total_ms_signal_lg2 * avogadro / mass * protein_mass_per_cell

option3: without intensity molNr = avogadro * protein_mass_per_PMN * median_quant

The formula relies on the total mass per cell which is only available for PMN (last stage) and, suposedly, it is changing over the development course?

total intensity

protein_mass_per_PMN = 60  # pg 
# source: Sollberger, G., Brenes, A. J., Warner, J., Arthur, J. S. C. & Howden, A. J. M. Quantitative proteomics reveals tissue-specific, infection-induced and species-specific neutrophil protein signatures. Sci. Rep. 14, 5966 (2024).
# total prot content neutrophil = 60 pg (can we trust this number?)


## total MS signals
total.ms.signal_MB <- sum(expLc_pro.MB$median)
total.ms.signal_PM <- sum(expLc_pro.PM$median)
total.ms.signal_MC <- sum(expLc_pro.MC$median)
total.ms.signal_MM <- sum(expLc_pro.MM$median)
total.ms.signal_B <- sum(expLc_pro.B$median)
total.ms.signal_PMN <- sum(expLc_pro.PMN$median)

total.ms.signal_all <- sum(c(total.ms.signal_MB, total.ms.signal_PM, total.ms.signal_MC, total.ms.signal_MM, total.ms.signal_B, total.ms.signal_PMN))

# plot
ms.signals <- data.frame(
  stage = c('MB', 'PM', 'MC', 'MM', 'B', 'PMN'),
  signal = c(total.ms.signal_MB, total.ms.signal_PM, total.ms.signal_MC, total.ms.signal_MM, total.ms.signal_B, total.ms.signal_PMN)
)

ms.signals %>%
  ggplot(aes(x= stage, y = signal))+
  geom_bar(stat = 'identity')+
  labs(title = "Total MS signal per stage",
              subtitle = "non-normalized log2 values",
              caption = "Quite similar for all")

option 1

using the stage specific MS intensity molNr = (median / stageIntensity) * (avogadro / mass) * protein_mass_per_cell

# calculate
calc.molNr.Intesity.o1 <- function(stageMedians, stageIntensity){
  stageMedians %<>% mutate(
    molNr = (median / stageIntensity) * (avogadro / mass) * protein_mass_per_PMN
  )
  stageMedians$molNr <- round(stageMedians$molNr, 0)
  
  rs <- stageMedians %>% ungroup() %>% select(SYMBOL, molNr)
  return(rs)
}

## intensity by stage. Should we also correc for cell number???
molNrINT.o1_pro.MB <- calc.molNr.Intesity.o1(expLc_pro.MB, total.ms.signal_MB) 
molNrINT.o1_pro.PM <- calc.molNr.Intesity.o1(expLc_pro.PM, total.ms.signal_PM)
molNrINT.o1_pro.MC <- calc.molNr.Intesity.o1(expLc_pro.MC, total.ms.signal_MC)
molNrINT.o1_pro.MM <- calc.molNr.Intesity.o1(expLc_pro.MM, total.ms.signal_MM)
molNrINT.o1_pro.B <- calc.molNr.Intesity.o1(expLc_pro.B, total.ms.signal_B)
molNrINT.o1_pro.PMN <- calc.molNr.Intesity.o1(expLc_pro.PMN, total.ms.signal_PMN)

names(molNrINT.o1_pro.MB)[2] <- 'MB'
names(molNrINT.o1_pro.PM)[2] <- 'PM'
names(molNrINT.o1_pro.MC)[2] <- 'MC'
names(molNrINT.o1_pro.MM)[2] <- 'MM'
names(molNrINT.o1_pro.B)[2] <- 'B'
names(molNrINT.o1_pro.PMN)[2] <- 'PMN'

molNrINT.o1 <- molNrINT.o1_pro.MB %>%
              left_join(molNrINT.o1_pro.PM, by='SYMBOL') %>%
              left_join(molNrINT.o1_pro.MC, by='SYMBOL') %>%
              left_join(molNrINT.o1_pro.MM, by='SYMBOL') %>%
              left_join(molNrINT.o1_pro.B, by='SYMBOL') %>%
              left_join(molNrINT.o1_pro.PMN, by='SYMBOL')

molNrINT.o1.l <- molNrINT.o1 %>% pivot_longer(!SYMBOL)
molNrINT.o1.l$name <- factor(molNrINT.o1.l$name, levels = c('MB', 'PM', 'MC', 'MM', 'B', 'PMN' ))


# check mol number of ELANE: published: 3x10^6 per cell
molNrINT.o1.l %>% filter(SYMBOL == 'ELANE') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity') +
  labs(title = "ELANE, intenisty ruler option 1",
              subtitle = "Published = 3x10^6 per cell",
              caption = "PM spike seems strange but is found on mRNA level as well!")

molNrINT.o1.l %>% filter(SYMBOL == 'S100A8') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity') +
  labs(title = "S100A8, intenisty ruler option 1",
              subtitle = "",
              caption = "")

molNrINT.o1.l %>% filter(SYMBOL == 'AZU1') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity') +
  labs(title = "AZU1, intenisty ruler option 1",
              subtitle = "",
              caption = "")

molNrINT.o1.l %>% filter(SYMBOL == 'RPL18') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity') +
  labs(title = "RPL18, intenisty ruler option 1",
              subtitle = "",
              caption = "")

Values seem all a bit too similar with this approach? Probably because the intensities we use for normalization were all very similar.

option 2

using the total intensity of all stages molNr = median_quant / total_ms_signal_lg2 * avogadro / mass * protein_mass_per_cell

# calculate
calc.molNr.Intesity.o2 <- function(stageMedians){
  stageMedians %<>% mutate(
    molNr = (median / total.ms.signal_all) * (avogadro / mass) * protein_mass_per_PMN
  )
  stageMedians$molNr <- round(stageMedians$molNr, 0)
  
  rs <- stageMedians %>% ungroup() %>% select(SYMBOL, molNr)
  return(rs)
}



## intensity by stage. Should we also correc for cell number???
molNrINT.o2_pro.MB <- calc.molNr.Intesity.o2(expLc_pro.MB) 
molNrINT.o2_pro.PM <- calc.molNr.Intesity.o2(expLc_pro.PM)
molNrINT.o2_pro.MC <- calc.molNr.Intesity.o2(expLc_pro.MC)
molNrINT.o2_pro.MM <- calc.molNr.Intesity.o2(expLc_pro.MM)
molNrINT.o2_pro.B <- calc.molNr.Intesity.o2(expLc_pro.B)
molNrINT.o2_pro.PMN <- calc.molNr.Intesity.o2(expLc_pro.PMN)

names(molNrINT.o2_pro.MB)[2] <- 'MB'
names(molNrINT.o2_pro.PM)[2] <- 'PM'
names(molNrINT.o2_pro.MC)[2] <- 'MC'
names(molNrINT.o2_pro.MM)[2] <- 'MM'
names(molNrINT.o2_pro.B)[2] <- 'B'
names(molNrINT.o2_pro.PMN)[2] <- 'PMN'

molNrINT.o2 <- molNrINT.o2_pro.MB %>%
              left_join(molNrINT.o2_pro.PM, by='SYMBOL') %>%
              left_join(molNrINT.o2_pro.MC, by='SYMBOL') %>%
              left_join(molNrINT.o2_pro.MM, by='SYMBOL') %>%
              left_join(molNrINT.o2_pro.B, by='SYMBOL') %>%
              left_join(molNrINT.o2_pro.PMN, by='SYMBOL')

molNrINT.o2.l <- molNrINT.o2 %>% pivot_longer(!SYMBOL)
molNrINT.o2.l$name <- factor(molNrINT.o2.l$name, levels = c('MB', 'PM', 'MC', 'MM', 'B', 'PMN' ))


# check mol number of ELANE: published: 3x10^6 per cell
molNrINT.o2.l %>% filter(SYMBOL == 'ELANE') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity') +
  labs(title = "ELANE, intenisty ruler option 1",
              subtitle = "Published = 3x10^6 per cell",
              caption = "all very similar and too small")

molNrINT.o2.l %>% filter(SYMBOL == 'S100A8') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity') +
  labs(title = "S100A8, intenisty ruler option 2",
              subtitle = "",
              caption = "")

molNrINT.o2.l %>% filter(SYMBOL == 'AZU1') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity') +
  labs(title = "AZU1, intenisty ruler option 2",
              subtitle = "",
              caption = "")

molNrINT.o2.l %>% filter(SYMBOL == 'RPL18') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity') +
  labs(title = "RPL18, intenisty ruler option 2",
              subtitle = "",
              caption = "")

Values are even more similar, unsurprisingly so because normalized by same intensity…

option 3

without intensity molNr = avogadro * protein_mass_per_PMN * median_quant

# calculate
calc.molNr.Intesity.o3 <- function(stageMedians, stageIntensity){
  stageMedians %<>% mutate(
    molNr = avogadro * protein_mass_per_PMN * median 
  )
  stageMedians$molNr <- round(stageMedians$molNr, 0)
  
  rs <- stageMedians %>% ungroup() %>% select(SYMBOL, molNr)
  return(rs)
}

## intensity by stage. Should we also correc for cell number???
molNrINT.o3_pro.MB <- calc.molNr.Intesity.o3(expLc_pro.MB, total.ms.signal_MB) 
molNrINT.o3_pro.PM <- calc.molNr.Intesity.o3(expLc_pro.PM, total.ms.signal_PM)
molNrINT.o3_pro.MC <- calc.molNr.Intesity.o3(expLc_pro.MC, total.ms.signal_MC)
molNrINT.o3_pro.MM <- calc.molNr.Intesity.o3(expLc_pro.MM, total.ms.signal_MM)
molNrINT.o3_pro.B <- calc.molNr.Intesity.o3(expLc_pro.B, total.ms.signal_B)
molNrINT.o3_pro.PMN <- calc.molNr.Intesity.o3(expLc_pro.PMN, total.ms.signal_PMN)

names(molNrINT.o3_pro.MB)[2] <- 'MB'
names(molNrINT.o3_pro.PM)[2] <- 'PM'
names(molNrINT.o3_pro.MC)[2] <- 'MC'
names(molNrINT.o3_pro.MM)[2] <- 'MM'
names(molNrINT.o3_pro.B)[2] <- 'B'
names(molNrINT.o3_pro.PMN)[2] <- 'PMN'

molNrINT.o3 <- molNrINT.o3_pro.MB %>%
              left_join(molNrINT.o3_pro.PM, by='SYMBOL') %>%
              left_join(molNrINT.o3_pro.MC, by='SYMBOL') %>%
              left_join(molNrINT.o3_pro.MM, by='SYMBOL') %>%
              left_join(molNrINT.o3_pro.B, by='SYMBOL') %>%
              left_join(molNrINT.o3_pro.PMN, by='SYMBOL')

molNrINT.o3.l <- molNrINT.o3 %>% pivot_longer(!SYMBOL)
molNrINT.o3.l$name <- factor(molNrINT.o3.l$name, levels = c('MB', 'PM', 'MC', 'MM', 'B', 'PMN' ))


# check mol number of ELANE: published: 3x10^6 per cell
molNrINT.o3.l %>% filter(SYMBOL == 'ELANE') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity') +
  labs(title = "ELANE, intenisty ruler option 1",
              subtitle = "Published = 3x10^6 per cell",
              caption = "Numer is totally off")

molNrINT.o3.l %>% filter(SYMBOL == 'S100A8') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity') +
  labs(title = "S100A8, intenisty ruler option 1",
              subtitle = "",
              caption = "")

molNrINT.o3.l %>% filter(SYMBOL == 'AZU1') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity') +
  labs(title = "AZU1, intenisty ruler option 1",
              subtitle = "",
              caption = "")

molNrINT.o3.l %>% filter(SYMBOL == 'RPL18') %>%
  # filter(name != 'PM') %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = 'identity') +
  labs(title = "RPL18, intenisty ruler option 1",
              subtitle = "",
              caption = "")

Values are completely off and dont differ much between the stages

Protein ranks

In our first paper, we found that 50% of the neutrophils total proteome mass is made up of <10 different proteins. Other leucocytes showed a more heterogenous proteome (eg 50 monocytes, 100 in Tcells).

We hypothesize that this proteome shift has to happen during granulopoiesis and that MB (first stage) likely show a higher degree of heterogeneity than PMN.

Will also try just the intensity instead of the molNr

calc_CumSum_per_stage <- function(dat, STAGE){
  # isolate
  dat_stage <- dat %>% filter(name == STAGE)
  # total_Mol_stage = sum(dat_stage$value) # this seems unneccesary
  dat_stage <- dat_stage %>% arrange(desc(value))
  # calc cumMol
  dat_stage$cum_Mol <- cumsum(dat_stage$value)/sum(dat_stage$value)
  # rank
  dat_stage %<>% mutate(rank = c(1:3156))
  return(dat_stage)
}

## apply stage wise
apply_CumSum <- function(moNr.l){
  dat.pro_MB <- calc_CumSum_per_stage(moNr.l, 'MB')
  dat.pro_PM <- calc_CumSum_per_stage(moNr.l, 'PM')
  dat.pro_MC <- calc_CumSum_per_stage(moNr.l, 'MC')
  dat.pro_MM <- calc_CumSum_per_stage(moNr.l, 'MM')
  dat.pro_B <- calc_CumSum_per_stage(moNr.l, 'B')
  dat.pro_PMN <- calc_CumSum_per_stage(moNr.l, 'PMN')
  # combine and format
  rs <- rbind(dat.pro_MB, dat.pro_PM, dat.pro_MC, dat.pro_MM, dat.pro_B, dat.pro_PMN)
  rs %<>% ungroup()
  return(rs)
}

# also prep the medians to be used with the formula
exp_proL.4rank <- exp_proLc %>% select(SYMBOL = ENSG, name = stage, value = quant)
exp_proL.4rank$value <- round(exp_proL.4rank$value, 0)
exp_proL.4rank<- ungroup(exp_proL.4rank)

ranks.HIST <- apply_CumSum(molNrHIST.l)
ranks.INT1 <- apply_CumSum(molNrINT.o1.l)
ranks.INT2 <- apply_CumSum(molNrINT.o2.l)
ranks.INT3 <- apply_CumSum(molNrINT.o3.l)
ranks.log2 <- apply_CumSum(exp_proL.4rank)


# print fct
plot.protein.ranks <- function(ranksDF, type4title, CAPTION){
  p <- ranksDF %>% ggplot(aes(x= rank, y= cum_Mol, color = name)) +
    geom_point() +
    geom_hline(yintercept = 0.5, linetype="dashed", alpha=0.7) +
    xlab("Protein rank by abundance") + 
    ylab("Cumulative protein abundance") +
    theme(legend.position = "none") +
    xlim(0, 50) +
    scale_color_manual(values = colors_stage) +
    labs(title = paste0("Cumulative abundance:", type4title),
         caption = CAPTION)
  return(p)
}

### unlabeled
plot.protein.ranks(ranks.HIST, ' histones', "Looks like the expected curve")

plot.protein.ranks(ranks.INT1, ' Intensity o1', "Not the expected curve")

plot.protein.ranks(ranks.INT2, ' Intensity o2', "Not the expected curve")

plot.protein.ranks(ranks.INT3, ' Intensity o3', "Not the expected curve")

plot.protein.ranks(ranks.log2, ' Just the intensity', "Maybe this is the solution?")

# # with labeled proteins
# dat.pro.mols %>% 
#   ggplot(aes(x= x, y= cum_Mol, color = stage)) +
#   geom_point() +
#   geom_hline(yintercept = 0.5, linetype="dashed", alpha=0.7) +
#   xlab("Protein rank by abundance") + 
#   ylab("Cumulative protein abundance") +
#   theme(legend.position = "none") +
#   xlim(0, 300) +
#   geom_text_repel(data = . %>% filter(SYMBOL == 'S100A8'), aes(label = SYMBOL), color = "black", size =3, nudge_x = 50 ) + # nudge_y = 0.1, 
#   #geom_point(data = . %>% filter(SYMBOL == 'S100A8'), color = "black") +
#   scale_color_manual(values = colors_stage)+
#   ggtitle("Cumulative abundance in proteome")

From the ranks perspective, the intensity based results do not make sense at all and should be discarded. The result form histones looks like expected but im not sure if its valid. Just using the intensity values actually gives a pretty good result as well! Lets follow up on that one!

knitr::knit_exit()

Molecular numbers in granulopoiesis

Sebastian Hesse, contact: sebastian.hesse@mail.de

2024-03-13

base data

medians

Molecular numbers proteome

base data

Histones

Histone protein intensity during maturation

Histone transcript intensity during maturation

Differential expression of histones

Proteome level

during progression

MB vs others

Transcriptome level

progression

MB vs others

Rulers

constants

Histone ruler

find stable protein group

Intensity ruler

total intensity

option 1

option 2

option 3

Protein ranks