Last date of work: 13.03.24
ToDo:
knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE)
# dirs
dir_moa <- "/home/isar/Dropbox/scn_moa/" # Sebastian @Linux
# dir_moa <- "/Users/home_sh/Library/CloudStorage/Dropbox/scn_moa" # Sebastian @Mac
# data, revised 2024.01.09
# exp
load(file = paste0(dir_moa, "/data/final/", "gnp.pro.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))
load(file = paste0(dir_moa, "/data/final/", "gnp.mic.RData"))
#load(file = paste0(dir_moa, "/data/final/", "scn.pro.RData")) # discarded prom project
# results
load(file = paste0(dir_moa, "/data/final/", "dex.all.RData"))
load(file = paste0(dir_moa, "/data/final/", "timeomics.singles.RData"))
load(file = paste0(dir_moa, "/data/final/", "res.wgcna.RData"))
## packages
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))
# load packages
sapply(packages_general, require, character.only = TRUE)
sapply(packages_annotation, require, character.only = TRUE)
# sapply(packages_WGCNA, require, character.only = TRUE)
library("viridis")
# graphs
global_size = 21
fan <- gnp.mrn$fan
Here we collect protein localisation information from 4 databases: + Human protein atlas (HPA) + Thul et al + Uniprot + LocDB
Additionally we collect information about neutrophil granule proteins by 3 databases: + GO + Lominadze et al + Rovig et al
#downloaded from https://www.proteinatlas.org/about/download at 14/03/24 as "Subcellular location data"
localisations_HPA <- read_delim("/home/isar/Dropbox/scn_moa/data/anno/subcellular_location.tsv",
delim = "\t", escape_double = FALSE,
trim_ws = TRUE)
# use ENSG. Some proteins simply have no loc in HPA
loc_HPA <- data.frame(
ENSG = fan$ENSG,
UNIPROT = fan$UNIPROT)
loc_HPA$loc_HPA_main <- localisations_HPA$`Main location`[match(loc_HPA$ENSG, localisations_HPA$Gene)]
loc_HPA$loc_HPA_other <- localisations_HPA$`Additional location`[match(loc_HPA$ENSG, localisations_HPA$Gene)]
sum(is.na(loc_HPA$loc_HPA_main)) # 3118
unlocalized_prots <- tibble(
db = c("HPA", 'UNIPROT', 'LocDB', 'Thul', "HyLo"),
unlocalized = c(3118, NA, NA, NA, NA)
)
# downloaded 13/03/24 from uniprot website: all human proteins
anno_uni.raw <- read_delim("/home/isar/Dropbox/scn_moa/data/anno/uniprotkb_AND_model_organism_9606_2024_03_13.tsv",
delim = "\t", escape_double = FALSE,
trim_ws = TRUE)
anno_uni <- anno_uni.raw %>% filter(anno_uni.raw$Entry %in% fan$UNIPROT)
setdiff(fan$UNIPROT, anno_uni$Entry) # 389 without annotation
names(anno_uni)
anno_uni %<>% dplyr::rename(
UNIPROT = 'Entry',
SWISS_ID = "Entry Name",
proteinName_exd ='Protein names',
SYMBOL_mult = "Gene Names",
interactors_UNI = "Interacts with",
#GO_BP = "Gene Ontology (biological process)",
#GO_CC = "Gene Ontology (cellular component)",
#GO_MF = "Gene Ontology (molecular function)",
loc_UNI = "Subcellular location [CC]",
subunit = "Subunit structure",
domain_intramem_UNI = "Intramembrane",
domain_tramsmem_UNI = "Transmembrane",
domain_topo_UNI = "Topological domain")
## process strings
#split the location-strings, using "Note=" as split character
l <- data.table::tstrsplit(anno_uni$loc_UNI, "Note=", fixed = FALSE )
#now, get the locations by splitting the location-strings
#first, strip the `SUBCELLULAR LOCATION:`
l <- lapply( l, function(x) gsub( "^SUBCELLULAR LOCATION: ", "", x ) )
#and get rid of all the stuff within { ... }
l <- lapply( l, function(x) gsub( "\\{.*?\\}", "", x ) )
#now split the locations on . and ;, and trim whitespace
loc_UNI <- lapply( strsplit( l[[1]], "[.;]", fixed = FALSE ), trimws ) #ok
#remove eventual empty locations
loc_UNI <- lapply(loc_UNI, function(x) subset(x, nchar(x) > 0) )
#paste locations together
loc_UNI <- lapply(loc_UNI, paste0, collapse = ";") #ok
#isolate the notes about localisations
loc_UNI_notes <- l[[2]]
#4 now we build the final data.table
dt <- data.table( UNIPROT = anno_uni$UNIPROT )
#get the maximum number of locations
max_loc <- length( tstrsplit( loc_UNI,";" ) )
#input the locations
dt[, paste0("location_", 1:max_loc) := tstrsplit( loc_UNI, ";" ) ]
#add the note
dt[, note := loc_UNI_notes ]
#final result
loc_UNI_final <- dt
#change "fake" NA to real ones
loc_UNI_final[loc_UNI_final == "NA"] <- NA
# unite
dim(loc_UNI_final)
loc_UNI <- unite(loc_UNI_final, loc_UNI, c(location_1:location_22), remove = T, na.rm = T, sep = ";")
loc_UNI %<>% rename(loc_UNI_note = note)
# replace empty cells with NA
loc_UNI %<>% mutate_all(na_if,"")
sum(is.na(loc_UNI$loc_UNI)) # 1356 without location
unlocalized_prots[unlocalized_prots$db== "UNIPROT", "unlocalized"] <- 1356
For granule protein annotation we use GO annotation and two papers
fan <- gnp.mrn$fan
GO_terms <- AnnotationDbi::select(org.Hs.eg.db, keys= fan$ENSG, columns= c("GOALL"), keytype="ENSEMBL")
names(GO_terms)[1] <- 'ENSG'
head(GO_terms)
#table(org.HS_GO$EVIDENCEALL)
#table(org.HS_GO$EVIDENCEALL[org.HS_GO$EVIDENCEALL %in% c("TAS", "EXP")])
GO_long <- GO_terms[, 1:2]
GO_wide <- group_by(GO_long, ENSG) %>% summarise_all(funs(trimws(paste(., collapse = ';'))))
# convert GO codes into granule annotation
GO_long$granule_GO <- case_when(
str_detect(GO_long$GOALL, "GO:0042582") ~ "primary",
str_detect(GO_long$GOALL, "GO:0042581") ~ "secondary",
str_detect(GO_long$GOALL, "GO:0070820") ~ "tertiary",
str_detect(GO_long$GOALL, "GO:0101002") ~ "ficolin-1"
#str_detect(GO_long$GOALL, "GO:0030141") ~ "secretory" # this is not a neutrophils specific term!
)
# are there proteins localized in multiple granules?
GO.granules <- GO_long %>% filter(!is.na(GO_long$granule_GO)) %>% select(ENSG, granule_GO)
GO.granules <- group_by(GO.granules, ENSG) %>% summarise_all(funs(trimws(paste(., collapse = ';'))))
GO.granules$nr_GO <- str_count(GO.granules$granule_GO, ';')
GO.granules$nr_GO <- GO.granules$nr_GO +1
GO.granules %>% filter(nr_GO > 1) # 187 in multiple!
table(GO.granules$granule_GO) # some have random repeat of the same granule
table(GO.granules$nr_GO)
# seperate. But actually this inst that useful Its only 35 proteins at the end
GO.granules.multi <- GO.granules %>% filter(nr_GO > 1) %>% pull(ENSG)
GO.granules.mono <- GO.granules %>% filter(nr_GO == 1)
# alternative: just use the first mentioned granule
GO.granules$granule_GO.fin <- gsub(";.*", "", GO.granules$granule_GO)
GO.granules
length(GO.granules.multi) + nrow(GO.granules.mono)
granule_prots <- data_frame(
db = c("GO", 'ROVIG', 'TOTAL'),
annotated = c(428, NA, NA)
)
Rørvig, S., Østergaard, O., Heegaard, N. H. H. & Borregaard, N. Proteome profiling of human neutrophil granule subsets, secretory vesicles, and cell membrane: correlation with transcriptome profiling of neutrophil precursors. J. Leukoc. Biol. 94, 711–721 (2013).
https://paperpile.com/app/p/4e77514f-d7f3-0ce0-a233-0a6d5d98e0b5
#granule prots from neutrophil proteom paper in JLB
rovig_raw <- read_csv("/home/isar/Dropbox/scn_moa/data/anno/Neutrophil_protein_locations - Table 1.csv")
rovig_raw %<>% dplyr::rename(UNIPROT = `ID\nUniprot`, granule_rovig = `Peak in\nFraction`)
table(rovig_raw$granule_rovig)
#unique(jlb$loc)
rovig_mutanizer <- data.frame(
granule_rovig = c("AG", "SG", "GG", "FG", "SV", "CM"),
long = c("primary", "secondary", "tertiary", "ficolin-1", "secretory", "Plasma membrane"))
loc_rovig.pre <- plyr::join(rovig_raw, rovig_mutanizer, by = "granule_rovig", type="left", match="first")
loc_rovig.granu <- loc_rovig.pre %>% filter (granule_rovig != "CM")
nrow(loc_rovig.granu)
loc_rovig.membrane <- loc_rovig.pre %>% filter (granule_rovig == "CM")
nrow(loc_rovig.membrane)
loc_ROV <- data.frame(
ENSG = fan$ENSG,
UNIPROT = fan$UNIPROT)
loc_ROV$granule_ROV <- loc_rovig.granu$long[match(loc_ROV$UNIPROT, loc_rovig.granu$UNIPROT)]
loc_ROV$membrane_ROV <- loc_rovig.membrane$long[match(loc_ROV$UNIPROT, loc_rovig.membrane$UNIPROT)]
sum(!is.na(loc_ROV$granule_ROV)) # 720
sum(!is.na(loc_ROV$membrane_ROV)) # 294
granule_prots[granule_prots$db== "ROVIG", "annotated"] <- 720
table(loc_ROV$granule_ROV)
loc.gran_pre <- fan
loc.gran_pre$granule_ROV <- loc_ROV$granule_ROV[match(loc.gran_pre$ENSG, loc_ROV$ENSG)]
loc.gran_pre$granule_GO <- GO.granules$granule_GO.fin[match(loc.gran_pre$ENSG, GO.granules.mono$ENSG)]
table(loc.gran_pre$granule_ROV)
table(loc.gran_pre$granule_GO)
# IDs where all agree
granules_allAgree <- loc.gran_pre %>%
filter(if_all(granule_ROV, ~ granule_GO == .x))
nrow(granules_allAgree) # 181 agree. WHY IS THIS 46 now???? ...
loc.gran <- loc.gran_pre[!with(loc.gran_pre, is.na(granule_ROV)& is.na(granule_GO)),]
granules.notInROV <- loc.gran %>% filter(is.na(granule_ROV))
nrow(granules.notInROV) # 87 additional in GO that are not in ROV
table(granules.notInROV$granule_GO)
######## finalize
loc.gran$granule <- loc.gran$granule_ROV # rovig is first
loc.gran$granule <- ifelse(is.na(loc.gran$granule), loc.gran$granule_GO, loc.gran$granule)
table(loc.gran$granule)
# HPA
# loc_HPA.fin <- loc_HPA %>% filter(ENSG %in% locs.left1$ENSG)
# nrow(loc_HPA.fin) # 9659 left
head(loc_HPA)
sort(table(loc_HPA$loc_HPA_main))
# LVL1
loc_HPA$loc.HPA_lvl1 <- NA
# cytoskeleton
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Centrosome'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Cell Junctions'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Centriolar satellite'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Microtubules'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Focal adhesion sites'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Intermediate filaments'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Actin filaments'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Cytokinetic bridge'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Mitotic spindle'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Midbody'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Microtubule ends'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Midbody ring'), 'Cytoskeleton', loc_HPA$loc.HPA_lvl1)
# vesicles
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Vesicles'), 'Vesicle', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Endosomes'), 'Vesicle', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Lysosomes'), 'Vesicle', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Peroxisomes'), 'Vesicle', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Lipid droplets'), 'Vesicle', loc_HPA$loc.HPA_lvl1)
# cytosol
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Rods & Rings'), 'Cytosol', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Cytosol'), 'Cytosol', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Cytoplasmic bodies'), 'Cytosol', loc_HPA$loc.HPA_lvl1)
# nucleus
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nucleoplasm'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nuclear membrane'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nuclear speckles'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nucleoli'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nucleoli fibrillar center'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nuclear bodies'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Nucleoli rim'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Mitotic chromosome'), 'Nucleus', loc_HPA$loc.HPA_lvl1)
# organels
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Mitochondria'), 'Mitochondrion', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Golgi apparatus'), 'Golgi', loc_HPA$loc.HPA_lvl1)
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Endoplasmic reticulum'), 'ER', loc_HPA$loc.HPA_lvl1)
# membrane
loc_HPA$loc.HPA_lvl1 <- ifelse(str_detect(loc_HPA$loc_HPA_main, 'Plasma membrane'), 'Plasma membrane', loc_HPA$loc.HPA_lvl1)
# LVL2
loc_HPA$loc.HPA_lvl2 <- NA
table(loc_HPA$loc.HPA_lvl1)
# Plasma Membrane
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Plasma membrane'), 'Plasma membrane', loc_HPA$loc.HPA_lvl2)
# Nucleus
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Nucleus'), 'Nucleus', loc_HPA$loc.HPA_lvl2)
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Mitochondrion'), 'Mitochondrion', loc_HPA$loc.HPA_lvl2)
# Cytosol
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Cytoskeleton'), 'Cytosol', loc_HPA$loc.HPA_lvl2)
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Cytosol'), 'Cytosol', loc_HPA$loc.HPA_lvl2)
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Vesicle'), 'Cytosol', loc_HPA$loc.HPA_lvl2)
# ER & Golgi
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'ER'), 'ER u. Golgi', loc_HPA$loc.HPA_lvl2)
loc_HPA$loc.HPA_lvl2 <- ifelse(str_detect(loc_HPA$loc.HPA_lvl1, 'Golgi'), 'ER u. Golgi', loc_HPA$loc.HPA_lvl2)
# check
loc_HPA %>% filter(is.na(loc.HPA_lvl1)) %>% nrow() # 3118 without
loc_HPA %>% filter(is.na(loc.HPA_lvl2)) %>% nrow() # same -> ok
loc_HPA %>% filter(is.na(loc.HPA_lvl1)) %>% filter(!is.na(UNIPROT)) %>% nrow() # 1883 proteins without anno
#loc_HPA$loc.HPA_lvl1 <- ifelse(is.na(loc_HPA$loc.HPA_lvl1) , 'Unknown', loc_HPA$loc.HPA_lvl1)
#loc_HPA$loc.HPA_lvl2 <- ifelse(is.na(loc_HPA$loc.HPA_lvl2) , 'Unknown', loc_HPA$loc.HPA_lvl2)
table(loc_HPA$loc.HPA_lvl1)
table(loc_HPA$loc.HPA_lvl2)
# ##### select and check with UNIPROT
# UNIPROT_noLoc.HPA <- loc_HPA.fin %>% filter(is.na(loc.lvl1)) %>% pull(UNIPROT)
#
#
# table(loc_HPA.fin$loc.lvl2)
loc_UNI
sort(table(loc_UNI$loc_UNI)) # chaotic mess
# simplify with str_detect
loc_UNI$loc_UNI.lvl2 <- NA
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Membrane'), 'Plasma membrane', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Secreted'), 'Cytosol', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'endosome'), 'Cytosol', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Lysosome'), 'Cytosol', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Peroxisome'), 'Cytosol', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Cytolytic granule'), 'Cytosol', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Cell junction'), 'Cytosol', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'centromere'), 'Cytosol', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Cytoplasm'), 'Cytosol', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'cilium'), 'Plasma membrane', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Cell membrane'), 'Plasma membrane', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Apical cell membranee'), 'Plasma membrane', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Cell projection'), 'Plasma membrane', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Basolateral cell membrane'), 'Plasma membrane', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Mitochondrion'), 'Mitochondrion', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Endoplasmic reticulum'), 'ER u. Golgi', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'endoplasmic reticulum'), 'ER u. Golgi', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Golgi'), 'ER u. Golgi', loc_UNI$loc_UNI.lvl2)
loc_UNI$loc_UNI.lvl2 <- ifelse(str_detect(loc_UNI$loc_UNI, 'Nucleus'), 'Nucleus', loc_UNI$loc_UNI.lvl2)
# check
loc_UNI %>% filter(is.na(loc_UNI.lvl2)) %>% nrow() # # 1405 left
loc_UNI %>% filter(is.na(loc_UNI.lvl2)) %>% select(UNIPROT, loc_UNI.lvl2, loc_UNI)
# put unknown
#loc_UNI$loc.lvl2 <- ifelse(is.na(loc_UNI$loc_UNI.lvl2) , 'Unknown', loc_UNI$loc_UNI.lvl2)
table(loc_UNI$loc_UNI.lvl2)
table(loc_UNI$loc_UNI)
loc.gran # granule subsets
loc_ROV # plasma membrane proteins
loc_HPA # HPA locs
loc_UNI # uniprot locs
fan.locs <- data.frame(
ENSG = fan$ENSG,
UNIPROT = fan$UNIPROT)
# Granule subsets
fan.locs$granule <- loc.gran$granule[match(fan.locs$ENSG, loc.gran$ENSG)]
fan.locs$granule <- str_to_sentence(fan.locs$granule)
# plasam membrane Rovig
fan.locs$plasma_membrane <- loc_ROV$membrane_ROV[match(fan.locs$ENSG, loc_ROV$ENSG)]
# HPA
fan.locs$loc_HPA.lv1 <- loc_HPA$loc.HPA_lvl1[match(fan.locs$ENSG, loc_HPA$ENSG)]
fan.locs$loc_HPA.lv2 <- loc_HPA$loc.HPA_lvl2[match(fan.locs$ENSG, loc_HPA$ENSG)]
# Uniprot
fan.locs$loc_UNI.lv1 <- loc_UNI$loc_UNI[match(fan.locs$ENSG, loc_UNI$ENSG)]
fan.locs$loc_UNI.lv2 <- loc_UNI$loc_UNI.lvl2[match(fan.locs$ENSG, loc_UNI$ENSG)]
# check
colSums(is.na(fan.locs))
fan.locs$loc.combined <- NA
fan.locs$loc.combined <- ifelse(is.na(fan.locs$loc.combined), fan.locs$granule, NA)
fan.locs$loc.combined <- ifelse(is.na(fan.locs$loc.combined), fan.locs$plasma_membrane, fan.locs$loc.combined)
fan.locs$loc.combined <- ifelse(is.na(fan.locs$loc.combined), fan.locs$loc_HPA.lv2, fan.locs$loc.combined)
fan.locs$loc.combined <- ifelse(is.na(fan.locs$loc.combined), fan.locs$loc_UNI.lv2, fan.locs$loc.combined)
table(fan.locs$loc.combined)
fan.locs$loc.combined <- ifelse(is.na(fan.locs$loc.combined), 'Unknown', fan.locs$loc.combined)
table(fan.locs$loc.combined)
count(fan.locs$loc.combined)
# library(treemapify)
#
#
# fan.locs %>%
# group_by(loc.combined) %>%
# summarise (n=n()) %>%
# mutate(rel.freq = as.integer(paste0(round(100 * n/sum(n), 0)))) %>%
# ggplot(aes(area = rel.freq, fill = loc.combined)) +
# geom_treemap() +
# geom_treemap_text(aes(label = paste(loc.combined, paste0(rel.freq, "%"), sep = "\n")), colour = "white", min.size = 8) +
# theme(legend.position = "none") +
# ggtitle("Subcellular distribution of all granulopoiesis features: \ntranscriptome and proteome")
anno <- list(
fan.locs = fan.locs,
anno_uni = anno_uni,
GO_long = GO_long,
GO_wide = GO_wide,
GO_terms = GO_terms,
loc_ROV = loc_ROV)
save(
anno,
file = paste0(dir_moa, "/data/final/", "anno.RData"))
gff_raw <- read.delim('/home/isar/Dropbox/scn_moa/data/final/ext/gencode.v46.annotation.gtf', header = FALSE, sep = '\t', skip = 3)
gc_raw <-rtracklayer::readGFF('/home/isar/Dropbox/scn_moa/data/final/ext/gencode.v46.annotation.gtf')
is.data.frame(gc_raw)
gc_tmp <- gc_raw
gc_tmp <- as.tibble(gc_tmp)
gc_tmp$ENSG <- gsub("\\..*","", gc_tmp$gene_id)
gc_fan <- gc_tmp %>% filter(ENSG %in% fan$ENSG)
gc_fan %<>% distinct(ENSG, .keep_all = T)
# write_csv(gc_fan, paste0(dir_moa, "/home/isar/Dropbox/scn_moa/data/final/anno/", "gene_code", ".csv"))
# table(gc_fan$gene_type)
#
# library(gtsummary)
#
# gc_fan %>% tbl_summary(include = c(gene_type))
#
# gc_fan$mrn <- ifelse(gc_fan$ENSG %in% exp_mrnL$ENSG, T, F)
# gc_fan$pro <- ifelse(gc_fan$ENSG %in% exp_proL$ENSG, T, F)
#
# # lnc
# foi_lncRNA <- gc_fan %>% filter(gene_type == 'lncRNA') %>% pull(ENSG) # make heatmap
#
# gc_fan %>% filter(gene_type == 'miRNA')