Last date of work: 13.03.24

ToDo:

knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE) 

# dirs
dir_moa <- "/home/isar/Dropbox/scn_moa/" #  Sebastian @Linux
# dir_moa <- "/Users/home_sh/Library/CloudStorage/Dropbox/scn_moa"  # Sebastian @Mac


# data, revised 2024.01.09
# exp
load(file = paste0(dir_moa, "/data/final/", "gnp.pro.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.mic.RData"))
#load(file = paste0(dir_moa, "/data/final/", "scn.pro.RData")) # discarded prom project
# results
load(file = paste0(dir_moa, "/data/final/", "dex.all.RData"))
load(file = paste0(dir_moa, "/data/final/", "timeomics.singles.RData"))
load(file = paste0(dir_moa, "/data/final/", "res.wgcna.RData"))


## packages
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))
# load packages
sapply(packages_general, require, character.only = TRUE)
sapply(packages_annotation, require, character.only = TRUE)
# sapply(packages_WGCNA, require, character.only = TRUE)

library("viridis")

# graphs
global_size = 21

fan <- gnp.mrn$fan

Localisation

Here we collect protein localisation information from 4 databases: + Human protein atlas (HPA) + Thul et al + Uniprot + LocDB

Additionally we collect information about neutrophil granule proteins by 3 databases: + GO + Lominadze et al + Rovig et al

HPA

#downloaded from https://www.proteinatlas.org/about/download at 14/03/24 as "Subcellular location data"
localisations_HPA <- read_delim("/home/isar/Dropbox/scn_moa/data/anno/subcellular_location.tsv", 
    delim = "\t", escape_double = FALSE, 
    trim_ws = TRUE)

# use ENSG. Some proteins simply have no loc in HPA
loc_HPA <- data.frame(
  ENSG = fan$ENSG,
  UNIPROT = fan$UNIPROT)

loc_HPA$loc_HPA_main <- localisations_HPA$`Main location`[match(loc_HPA$ENSG, localisations_HPA$Gene)]
loc_HPA$loc_HPA_other <- localisations_HPA$`Additional location`[match(loc_HPA$ENSG, localisations_HPA$Gene)]


sum(is.na(loc_HPA$loc_HPA_main)) # 3118


unlocalized_prots <- tibble(
  db = c("HPA", 'UNIPROT', 'LocDB', 'Thul', "HyLo"),
  unlocalized = c(3118, NA, NA, NA, NA)
)

UNIPROT

# downloaded 13/03/24 from uniprot website: all human proteins

anno_uni.raw <- read_delim("/home/isar/Dropbox/scn_moa/data/anno/uniprotkb_AND_model_organism_9606_2024_03_13.tsv", 
    delim = "\t", escape_double = FALSE, 
    trim_ws = TRUE)

anno_uni <- anno_uni.raw %>% filter(anno_uni.raw$Entry %in% fan$UNIPROT)

setdiff(fan$UNIPROT, anno_uni$Entry) # 389 without annotation

names(anno_uni)
anno_uni %<>% dplyr::rename(
  UNIPROT = 'Entry', 
  SWISS_ID = "Entry Name", 
  proteinName_exd ='Protein names',
  SYMBOL_mult = "Gene Names",
  interactors_UNI = "Interacts with",
  #GO_BP = "Gene Ontology (biological process)",
  #GO_CC = "Gene Ontology (cellular component)",
  #GO_MF = "Gene Ontology (molecular function)",
  loc_UNI = "Subcellular location [CC]", 
  subunit = "Subunit structure",
  domain_intramem_UNI = "Intramembrane", 
  domain_tramsmem_UNI = "Transmembrane",
  domain_topo_UNI = "Topological domain")

## process strings

#split the location-strings, using "Note=" as split character
l <- data.table::tstrsplit(anno_uni$loc_UNI, "Note=", fixed = FALSE )

#now, get the locations by splitting the location-strings
#first, strip the `SUBCELLULAR LOCATION:`
l <- lapply( l, function(x) gsub( "^SUBCELLULAR LOCATION: ", "", x ) )
#and get rid of all the stuff within { ... }
l <- lapply( l, function(x) gsub( "\\{.*?\\}", "", x ) )
#now split the locations on . and ;, and trim whitespace
loc_UNI <- lapply( strsplit( l[[1]], "[.;]", fixed = FALSE ), trimws ) #ok
#remove eventual empty locations
loc_UNI <- lapply(loc_UNI, function(x) subset(x, nchar(x) > 0) )
#paste locations together
loc_UNI <- lapply(loc_UNI, paste0, collapse = ";") #ok
#isolate the notes about localisations
loc_UNI_notes <- l[[2]]
#4 now we build the final data.table
dt <- data.table( UNIPROT  = anno_uni$UNIPROT )
#get the maximum number of locations
max_loc <- length( tstrsplit( loc_UNI,";" ) )
#input the locations
dt[, paste0("location_", 1:max_loc) := tstrsplit( loc_UNI, ";" ) ]
#add the note
dt[, note := loc_UNI_notes ]

#final result
loc_UNI_final <- dt

#change "fake" NA to real ones
loc_UNI_final[loc_UNI_final == "NA"] <- NA
# unite
dim(loc_UNI_final)

loc_UNI <- unite(loc_UNI_final, loc_UNI, c(location_1:location_22), remove = T, na.rm = T, sep = ";")

loc_UNI %<>% rename(loc_UNI_note = note)

# replace empty cells with NA
loc_UNI %<>% mutate_all(na_if,"")

sum(is.na(loc_UNI$loc_UNI)) # 1356 without location

unlocalized_prots[unlocalized_prots$db== "UNIPROT", "unlocalized"] <- 1356

Granule Proteins

For granule protein annotation we use GO annotation and two papers

GO

fan <- gnp.mrn$fan

GO_terms <- AnnotationDbi::select(org.Hs.eg.db, keys= fan$ENSG, columns= c("GOALL"), keytype="ENSEMBL")
names(GO_terms)[1] <- 'ENSG'
head(GO_terms)

#table(org.HS_GO$EVIDENCEALL)
#table(org.HS_GO$EVIDENCEALL[org.HS_GO$EVIDENCEALL %in% c("TAS", "EXP")])

GO_long <- GO_terms[, 1:2]
GO_wide <- group_by(GO_long, ENSG) %>% summarise_all(funs(trimws(paste(., collapse = ';'))))

# convert GO codes into granule annotation
GO_long$granule_GO <- case_when(
  str_detect(GO_long$GOALL, "GO:0042582") ~ "primary",
  str_detect(GO_long$GOALL, "GO:0042581") ~ "secondary",
  str_detect(GO_long$GOALL, "GO:0070820") ~ "tertiary",
  str_detect(GO_long$GOALL, "GO:0101002") ~ "ficolin-1"
  #str_detect(GO_long$GOALL, "GO:0030141") ~ "secretory" # this is not a neutrophils specific term!
 )

# are there proteins localized in multiple granules?
GO.granules <- GO_long %>% filter(!is.na(GO_long$granule_GO)) %>% select(ENSG, granule_GO)
GO.granules <- group_by(GO.granules, ENSG) %>% summarise_all(funs(trimws(paste(., collapse = ';'))))

GO.granules$nr_GO <- str_count(GO.granules$granule_GO, ';')
GO.granules$nr_GO <- GO.granules$nr_GO +1

GO.granules %>% filter(nr_GO > 1) # 187 in multiple!
table(GO.granules$granule_GO) # some have random repeat of the same granule
table(GO.granules$nr_GO) 

# seperate. But actually this inst that useful Its only 35 proteins at the end
GO.granules.multi <- GO.granules %>% filter(nr_GO > 1) %>% pull(ENSG)
GO.granules.mono <- GO.granules %>% filter(nr_GO == 1)
# alternative: just use the first mentioned granule
GO.granules$granule_GO.fin <- gsub(";.*", "", GO.granules$granule_GO)

GO.granules

length(GO.granules.multi) + nrow(GO.granules.mono)


granule_prots <- data_frame(
  db = c("GO", 'ROVIG', 'TOTAL'),
  annotated = c(428, NA, NA)
)

Rovig

Rørvig, S., Østergaard, O., Heegaard, N. H. H. & Borregaard, N. Proteome profiling of human neutrophil granule subsets, secretory vesicles, and cell membrane: correlation with transcriptome profiling of neutrophil precursors. J. Leukoc. Biol. 94, 711–721 (2013).

https://paperpile.com/app/p/4e77514f-d7f3-0ce0-a233-0a6d5d98e0b5

#granule prots from neutrophil proteom paper in JLB 
rovig_raw <- read_csv("/home/isar/Dropbox/scn_moa/data/anno/Neutrophil_protein_locations - Table 1.csv")
rovig_raw %<>% dplyr::rename(UNIPROT = `ID\nUniprot`, granule_rovig = `Peak in\nFraction`)
table(rovig_raw$granule_rovig)
#unique(jlb$loc)

rovig_mutanizer <- data.frame(
  granule_rovig =  c("AG", "SG", "GG",  "FG", "SV", "CM"),
  long = c("primary", "secondary", "tertiary", "ficolin-1", "secretory", "Plasma membrane"))

loc_rovig.pre <- plyr::join(rovig_raw, rovig_mutanizer, by = "granule_rovig", type="left", match="first")

loc_rovig.granu <- loc_rovig.pre %>% filter (granule_rovig != "CM")
nrow(loc_rovig.granu)
loc_rovig.membrane <- loc_rovig.pre %>% filter (granule_rovig == "CM")
nrow(loc_rovig.membrane)

loc_ROV <- data.frame(
  ENSG = fan$ENSG,
  UNIPROT = fan$UNIPROT)

loc_ROV$granule_ROV <- loc_rovig.granu$long[match(loc_ROV$UNIPROT, loc_rovig.granu$UNIPROT)]
loc_ROV$membrane_ROV <- loc_rovig.membrane$long[match(loc_ROV$UNIPROT, loc_rovig.membrane$UNIPROT)]

sum(!is.na(loc_ROV$granule_ROV)) # 720
sum(!is.na(loc_ROV$membrane_ROV)) # 294

granule_prots[granule_prots$db== "ROVIG", "annotated"] <- 720

table(loc_ROV$granule_ROV)

Unify locs

Granules

loc.gran_pre <- fan

loc.gran_pre$granule_ROV <- loc_ROV$granule_ROV[match(loc.gran_pre$ENSG, loc_ROV$ENSG)]
loc.gran_pre$granule_GO <- GO.granules$granule_GO.fin[match(loc.gran_pre$ENSG, GO.granules.mono$ENSG)]

table(loc.gran_pre$granule_ROV)
table(loc.gran_pre$granule_GO)

# IDs where all agree
granules_allAgree <- loc.gran_pre %>% 
  filter(if_all(granule_ROV, ~ granule_GO == .x))
nrow(granules_allAgree) # 181 agree. WHY IS THIS 46 now???? ... 


loc.gran <- loc.gran_pre[!with(loc.gran_pre, is.na(granule_ROV)& is.na(granule_GO)),]


granules.notInROV <- loc.gran %>% filter(is.na(granule_ROV)) 
nrow(granules.notInROV) # 87 additional in GO that are not in ROV
table(granules.notInROV$granule_GO)


######## finalize
loc.gran$granule <- loc.gran$granule_ROV # rovig is first
loc.gran$granule <- ifelse(is.na(loc.gran$granule), loc.gran$granule_GO, loc.gran$granule)

table(loc.gran$granule)

HPA

# HPA
# loc_HPA.fin <- loc_HPA %>% filter(ENSG %in% locs.left1$ENSG)
# nrow(loc_HPA.fin) # 9659 left

head(loc_HPA)
sort(table(loc_HPA$loc_HPA_main))

# LVL1
loc_HPA$loc.HPA_lvl1 <- NA

# cytoskeleton
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Centrosome'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1) 
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Cell Junctions'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)  
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Centriolar satellite'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)  
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Microtubules'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Focal adhesion sites'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Intermediate filaments'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Actin filaments'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)            
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Cytokinetic bridge'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)         
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Mitotic spindle'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)                 
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Midbody'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Microtubule ends'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)     
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Midbody ring'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
# vesicles
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Vesicles'), 'Vesicle', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Endosomes'), 'Vesicle', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Lysosomes'), 'Vesicle', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Peroxisomes'), 'Vesicle', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Lipid droplets'), 'Vesicle', loc_HPA$loc.HPA_lvl1)
# cytosol
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Rods & Rings'), 'Cytosol', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Cytosol'), 'Cytosol', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Cytoplasmic bodies'), 'Cytosol', loc_HPA$loc.HPA_lvl1)
# nucleus
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nucleoplasm'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nuclear membrane'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nuclear speckles'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nucleoli'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nucleoli fibrillar center'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nuclear bodies'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nucleoli rim'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Mitotic chromosome'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
# organels
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Mitochondria'), 'Mitochondrion', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Golgi apparatus'), 'Golgi', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Endoplasmic reticulum'), 'ER', loc_HPA$loc.HPA_lvl1)
# membrane
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Plasma membrane'), 'Plasma membrane', loc_HPA$loc.HPA_lvl1)

# LVL2
loc_HPA$loc.HPA_lvl2 <- NA
table(loc_HPA$loc.HPA_lvl1)

# Plasma Membrane
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Plasma membrane'), 'Plasma membrane', loc_HPA$loc.HPA_lvl2)
# Nucleus
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Nucleus'), 'Nucleus', loc_HPA$loc.HPA_lvl2)
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Mitochondrion'), 'Mitochondrion', loc_HPA$loc.HPA_lvl2)

# Cytosol
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Cytoskeleton'), 'Cytosol', loc_HPA$loc.HPA_lvl2)
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Cytosol'), 'Cytosol', loc_HPA$loc.HPA_lvl2)
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Vesicle'), 'Cytosol', loc_HPA$loc.HPA_lvl2)


# ER & Golgi
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'ER'), 'ER u. Golgi', loc_HPA$loc.HPA_lvl2)
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Golgi'), 'ER u. Golgi', loc_HPA$loc.HPA_lvl2)


# check
loc_HPA %>% filter(is.na(loc.HPA_lvl1)) %>% nrow() # 3118 without
loc_HPA %>% filter(is.na(loc.HPA_lvl2)) %>% nrow() # same -> ok
loc_HPA %>% filter(is.na(loc.HPA_lvl1)) %>% filter(!is.na(UNIPROT)) %>% nrow() # 1883 proteins without anno

#loc_HPA$loc.HPA_lvl1 <- ifelse(is.na(loc_HPA$loc.HPA_lvl1) , 'Unknown', loc_HPA$loc.HPA_lvl1)
#loc_HPA$loc.HPA_lvl2 <- ifelse(is.na(loc_HPA$loc.HPA_lvl2) , 'Unknown', loc_HPA$loc.HPA_lvl2)

table(loc_HPA$loc.HPA_lvl1)
table(loc_HPA$loc.HPA_lvl2)

# ##### select and check with UNIPROT
# UNIPROT_noLoc.HPA <- loc_HPA.fin %>% filter(is.na(loc.lvl1)) %>% pull(UNIPROT)
# 
# 
# table(loc_HPA.fin$loc.lvl2)

UNIPROT

loc_UNI

sort(table(loc_UNI$loc_UNI)) # chaotic mess
 
# simplify with str_detect
loc_UNI$loc_UNI.lvl2 <- NA


loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Membrane'), 'Plasma membrane', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Secreted'), 'Cytosol', loc_UNI$loc_UNI.lvl2)

loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'endosome'), 'Cytosol', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Lysosome'), 'Cytosol', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Peroxisome'), 'Cytosol', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Cytolytic granule'), 'Cytosol', loc_UNI$loc_UNI.lvl2)

loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Cell junction'), 'Cytosol', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'centromere'), 'Cytosol', loc_UNI$loc_UNI.lvl2)

loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Cytoplasm'), 'Cytosol', loc_UNI$loc_UNI.lvl2)

loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'cilium'), 'Plasma membrane', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Cell membrane'), 'Plasma membrane', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Apical cell membranee'), 'Plasma membrane', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Cell projection'), 'Plasma membrane', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Basolateral cell membrane'), 'Plasma membrane', loc_UNI$loc_UNI.lvl2)

loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Mitochondrion'), 'Mitochondrion', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Endoplasmic reticulum'), 'ER u. Golgi', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'endoplasmic reticulum'), 'ER u. Golgi', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Golgi'), 'ER u. Golgi', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Nucleus'), 'Nucleus', loc_UNI$loc_UNI.lvl2)

# check
loc_UNI %>% filter(is.na(loc_UNI.lvl2)) %>% nrow()   #  # 1405 left
loc_UNI %>% filter(is.na(loc_UNI.lvl2)) %>% select(UNIPROT, loc_UNI.lvl2, loc_UNI)

# put unknown
#loc_UNI$loc.lvl2 <- ifelse(is.na(loc_UNI$loc_UNI.lvl2) , 'Unknown', loc_UNI$loc_UNI.lvl2)

table(loc_UNI$loc_UNI.lvl2)
table(loc_UNI$loc_UNI)

finalize

loc.gran # granule subsets
loc_ROV # plasma membrane proteins
loc_HPA # HPA locs
loc_UNI # uniprot locs

fan.locs <- data.frame(
  ENSG = fan$ENSG,
  UNIPROT = fan$UNIPROT)

# Granule subsets
fan.locs$granule <- loc.gran$granule[match(fan.locs$ENSG, loc.gran$ENSG)]
fan.locs$granule <- str_to_sentence(fan.locs$granule)
# plasam membrane Rovig
fan.locs$plasma_membrane <- loc_ROV$membrane_ROV[match(fan.locs$ENSG, loc_ROV$ENSG)]
# HPA
fan.locs$loc_HPA.lv1 <- loc_HPA$loc.HPA_lvl1[match(fan.locs$ENSG, loc_HPA$ENSG)]
fan.locs$loc_HPA.lv2 <- loc_HPA$loc.HPA_lvl2[match(fan.locs$ENSG, loc_HPA$ENSG)]
# Uniprot
fan.locs$loc_UNI.lv1 <- loc_UNI$loc_UNI[match(fan.locs$ENSG, loc_UNI$ENSG)]
fan.locs$loc_UNI.lv2 <- loc_UNI$loc_UNI.lvl2[match(fan.locs$ENSG, loc_UNI$ENSG)]

# check
colSums(is.na(fan.locs))


fan.locs$loc.combined <- NA
fan.locs$loc.combined <- ifelse(is.na(fan.locs$loc.combined), fan.locs$granule, NA)
fan.locs$loc.combined <- ifelse(is.na(fan.locs$loc.combined), fan.locs$plasma_membrane, fan.locs$loc.combined)
fan.locs$loc.combined <- ifelse(is.na(fan.locs$loc.combined), fan.locs$loc_HPA.lv2, fan.locs$loc.combined)
fan.locs$loc.combined <- ifelse(is.na(fan.locs$loc.combined), fan.locs$loc_UNI.lv2, fan.locs$loc.combined)
table(fan.locs$loc.combined)
fan.locs$loc.combined <- ifelse(is.na(fan.locs$loc.combined), 'Unknown', fan.locs$loc.combined)

table(fan.locs$loc.combined)
count(fan.locs$loc.combined)

# library(treemapify)
# 
# 
# fan.locs %>%
#   group_by(loc.combined) %>%
#   summarise (n=n()) %>%
#   mutate(rel.freq = as.integer(paste0(round(100 * n/sum(n), 0)))) %>%
#   ggplot(aes(area = rel.freq, fill = loc.combined)) +
#   geom_treemap() +
#   geom_treemap_text(aes(label = paste(loc.combined, paste0(rel.freq, "%"), sep = "\n")), colour = "white", min.size = 8) +
#   theme(legend.position = "none") +
#   ggtitle("Subcellular distribution of all granulopoiesis features: \ntranscriptome and proteome")

Export

anno <- list(
  fan.locs = fan.locs,
  anno_uni = anno_uni,
  GO_long = GO_long,
  GO_wide = GO_wide,
  GO_terms = GO_terms,
  loc_ROV = loc_ROV)

save(
  anno,
  file = paste0(dir_moa, "/data/final/", "anno.RData"))

Gene code transcript annotations

Gene Code

import

gff_raw <- read.delim('/home/isar/Dropbox/scn_moa/data/final/ext/gencode.v46.annotation.gtf', header = FALSE, sep = '\t', skip = 3)

gc_raw <-rtracklayer::readGFF('/home/isar/Dropbox/scn_moa/data/final/ext/gencode.v46.annotation.gtf')
is.data.frame(gc_raw)

gc_tmp <- gc_raw
gc_tmp <- as.tibble(gc_tmp)
gc_tmp$ENSG <- gsub("\\..*","", gc_tmp$gene_id)   

gc_fan <- gc_tmp %>% filter(ENSG %in% fan$ENSG)
gc_fan %<>% distinct(ENSG, .keep_all = T)

# write_csv(gc_fan, paste0(dir_moa, "/home/isar/Dropbox/scn_moa/data/final/anno/", "gene_code", ".csv"))

overview

# table(gc_fan$gene_type)
# 
# library(gtsummary)
# 
# gc_fan %>% tbl_summary(include = c(gene_type))
# 
# gc_fan$mrn <- ifelse(gc_fan$ENSG %in% exp_mrnL$ENSG, T, F)
# gc_fan$pro <- ifelse(gc_fan$ENSG %in% exp_proL$ENSG, T, F)
# 
# # lnc
# foi_lncRNA <- gc_fan %>% filter(gene_type == 'lncRNA') %>% pull(ENSG) # make heatmap
# 
# gc_fan %>% filter(gene_type == 'miRNA')