Last update: 23-10-13
Last markdown compiled: 23-10-13
ToDo: - format nicely
knitr::opts_chunk$set(warning = FALSE, message = FALSE, results = FALSE)
# dirs
## Sebastian @Linux
dir_moa <- "/home/isar/Dropbox/scn_moa/"
# data loaded below
## packages
source(paste0(dir_moa, "/scripts/", "scn_moa_fct.R"))
# data: not yet applicable here, will be created at end
# load required package collections
sapply(packages_general, require, character.only = TRUE)
sapply(packages_annotation, require, character.only = TRUE)
sapply(packages_pca, require, character.only = TRUE)
sapply(packages_normalisation, require, character.only = TRUE)
# import: raw counts and tpn
exp_mic_counts_raw <- read_delim(paste0(dir_moa, "/data/gnp/microRNA/original/", "featureCounts.miRNA"), delim = "\t", escape_double = FALSE, trim_ws = TRUE, skip = 1)
info_mic_counts <- read_delim(paste0(dir_moa, "/data/gnp/microRNA/original/", "featureCounts.miRNA.summary"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)
san_mic_raw <- read_csv(paste0(dir_moa, "/data/gnp/microRNA/original/", "san_miRNA_raw.csv") )
san_mic_raw %<>% filter(type == "miRNA")
san_mic_raw$id <- gsub("-", "_", san_mic_raw$name)
# divide data by content
pan_mic_raw <- exp_mic_counts_raw %>% select('fid' = Geneid, Chr, Start, End, Strand, Length)
exp_mic_temp <- exp_mic_counts_raw %>% select('fid' = Geneid, 7:35)
# format ids
names(exp_mic_temp) <- gsub("-", "_", names(exp_mic_temp))
names(exp_mic_temp) <- gsub(".bam", "", names(exp_mic_temp))
# make numeric
dim(exp_mic_temp)
exp_mic_temp[,2:30] <- sapply(exp_mic_temp[,2:30], as.numeric)
names(exp_mic_temp)
# # transpose (not sure if needed)
# #transpose_mic <- function(df){
# fid <- df$HSA
# df$HSA <- NULL
# df_t <- as.data.frame(t(df))
# colnames(df_t) <- fid
# return(df_t)
# }
# #exp_mic_temp_t <- transpose_mic(exp_mic_temp)
san_mic_temp <- as.data.frame(san_mic_raw)
san_mic_temp %<>% filter(id %in% colnames(exp_mic_temp)[-1])
san_mic_temp <- san_mic_temp %>% dplyr::arrange(factor(san_mic_temp$id, levels = colnames(exp_mic_temp)[-1]))
# colors of stages
colors_stage <- rev(c(
"#c8a600",
"#ffac40",
"#ff7763",
"#ca0068",
"#a80092",
"#a043ec",
"#0000e3"))
names(colors_stage) <- c( "MB", "PM", "MC", "MM", "B", "S", "PMN")
# levels
levels_mic = c('MB', 'MC', 'MM', 'B', 'S', 'PMN')
exclude low counts and <3 values/group
# copy
exp_mic_10 <- exp_mic_temp
# set all values below threshold (10) NA
exp_mic_10[exp_mic_10<10] = NA
# check NAs per feature and group
exp_mic_10_long <- exp_mic_10 %>%
pivot_longer(!fid, names_to = 'id', values_to = "value")
exp_mic_10_long$pop_gen <- san_mic_temp$population_general[match(exp_mic_10_long$id, san_mic_temp$id)]
# not NA/group
notNAperGeneANDgroup_mic <- exp_mic_10_long %>%
group_by(fid, pop_gen) %>%
summarise(sum_notNA = sum(!is.na(value)))
# create filter
summary_notNA_mic <- notNAperGeneANDgroup_mic %>% group_by(fid) %>%
filter(sum(sum_notNA>1)>1, sum_notNA > 1) %>%
summarise(tot_valid = n(), valid_groups = str_c(pop_gen, collapse = ';'))
table(summary_notNA_mic$tot_valid)
# min2 in all 7 groups = good for DEX
fid_min2INallGroups_mic <- summary_notNA_mic %>%
filter(tot_valid == 6) %>% pull(fid)
length(fid_min2INallGroups_mic) # 285
#
exp_mic_all <- exp_mic_temp # all features
nrow(exp_mic_all)
exp_mic_filtered <- exp_mic_10 %>% filter(fid %in% fid_min2INallGroups_mic)
# nrow(exp_mic_filtered) # 285
Try histrograms of all data
colors_ids_mic <- san_mic_temp %>% select(id, population_general)
colors_ids_mic$color <- colors_stage[match(colors_ids_mic$population_general, names(colors_stage))]
colors_ids_mic$population_general <- NULL
col_ids_mic <- as.vector(colors_ids_mic$color)
names(col_ids_mic) <- colors_ids_mic$id
# prep long
exp_mic_long <- exp_mic_filtered %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
exp_mic_long$stage <- san_mic_temp$population_general[match(exp_mic_long$id, san_mic_temp$id)]
exp_mic_long %>%
ggplot(aes(x = value, color = id)) +
geom_density(na.rm = T, bw = 'SJ') +
scale_x_continuous(trans='log2') +
scale_color_manual(values = col_ids_mic) +
theme(legend.position = "none") +
ggtitle("Density microRNA")
names(san_mic_temp)
dat.qc.mic <- san_mic_temp %>% dplyr::select(id, stage = population_general, reads, mbases_yield, mean_quality, bases_over_30)
dat.qc.mic$stage <- factor(dat.qc.mic$stage, levels = levels_mic)
dat.qc.mic$stageNUM <- as.numeric(dat.qc.mic$stage)
# reads
dat.qc.mic %>%
ggplot(aes(x = reorder(id, stageNUM), y = reads, fill = stage)) +
geom_bar(stat = 'identity') +
scale_fill_manual(values = colors_stage) +
theme(axis.text.x = element_text(angle=30, hjust=1)) +
ggtitle("Total reads per sample in microRNA")
# MB yield
dat.qc.mic %>%
ggplot(aes(x = reorder(id, stageNUM), y = mbases_yield, fill = stage)) +
geom_bar(stat = 'identity') +
scale_fill_manual(values = colors_stage) +
theme(axis.text.x = element_text(angle=30, hjust=1)) +
ggtitle("MByields per sample in microRNA")
# Mean quality
dat.qc.mic %>%
ggplot(aes(x = reorder(id, stageNUM), y = mean_quality, fill = stage)) +
geom_bar(stat = 'identity') +
scale_fill_manual(values = colors_stage) +
theme(axis.text.x = element_text(angle=30, hjust=1)) +
ggtitle("Mean quality per sample in microRNA")
# Bases over 30
dat.qc.mic %>%
ggplot(aes(x = reorder(id, stageNUM), y = bases_over_30, fill = stage)) +
geom_bar(stat = 'identity') +
scale_fill_manual(values = colors_stage) +
theme(axis.text.x = element_text(angle=30, hjust=1)) +
ggtitle("Bases over 30 per sample in microRNA")
Used for VSN
# create non
exp_mic_non <- exp_mic_filtered
# copy exp
FIDtoRow <- function(df){
df <- as.data.frame(df)
rownames(df) <- df$fid
df$fid <- NULL
return(df)
}
exp_4eSet_mic <- FIDtoRow(exp_mic_non)
# VSN needs counts, not log2! Not sure about the format of the data. They look a bit different.
# match san order
san_mic_4eSet <- san_mic_temp[match(names(exp_4eSet_mic), san_mic_temp$id),]
san_mic_4eSet <- as.data.frame(san_mic_4eSet)
rownames(san_mic_4eSet) <- san_mic_4eSet$id
# turn into matrix
df2m <- function(X) {
if (!methods::is(X, "matrix")) {
m <- as.matrix(X[, which(vapply(X, is.numeric, logical(1)))])
}
else {
m <- X
}
m
}
mx_4eSet_mic_non <- df2m(exp_4eSet_mic)
mx_4eSet_mic_log2 <- log2(mx_4eSet_mic_non)
# create pheno
pheno_mic <- new("AnnotatedDataFrame", data = san_mic_4eSet)
# compile
eSet_mic_non <- ExpressionSet (
assayData = mx_4eSet_mic_non, #exprs
phenoData = pheno_mic) #pData
https://www.bioconductor.org/packages/devel/bioc/vignettes/vsn/inst/doc/A-vsn.html#sec:miss
https://academic.oup.com/bioinformatics/article/18/suppl_1/S96/231881
https://www.embl.org/groups/huber/ (first author on VSN paper)
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2680204/
# normalize (uses eSet with raw data)
eSet_mic_vsn = justvsn(eSet_mic_non)
# extract
exp_mic_vsn <- exprs(eSet_mic_vsn)
exp_mic_vsn <- as.data.frame(exp_mic_vsn)
exp_mic_vsn$fid <- rownames(exp_mic_vsn)
https://davetang.org/muse/2014/07/07/quantile-normalisation-in-r/
DISCARDED!
# # counts
# exp_mic_qn <- normalize.quantiles(exp_4eSet_mic)
# exp_mic_qn <- as.data.frame(exp_mic_qn)
# rownames(exp_mic_qn) <- rownames(exp_4eSet_mic)
# colnames(exp_mic_qn) <- colnames(exp_4eSet_mic)
# exp_mic_qn$fid <- rownames(exp_mic_qn)
DISCARDED
#
# exp_mic_mn <- normalizeMedianValues(exp_4eSet_mic) # limma
# exp_mic_mn <- as.data.frame(exp_mic_mn)
# exp_mic_mn$fid <- rownames(exp_mic_mn)
# colnames(exp_mic_mn)
exp_mic_log2 <- mx_4eSet_mic_log2
exp_mic_log2 <- as.data.frame(exp_mic_log2)
exp_mic_log2$fid <- rownames(exp_mic_log2)
san_mic_temp
plot_sample_density_log <- function(exp, san, LEVELS, col_ids, title) {
exp_long <- exp %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
exp_long$stage <- san$population_general[match(exp_long$id, san$id)]
exp_long$stage <- factor(exp_long$stage, levels = LEVELS)
exp_long$donor <- san$donor[match(exp_long$id, san$id)]
exp_long %>%
#filter(donor == 'B') %>%
ggplot(aes(x = value, color = id)) +
geom_density(na.rm = T, bw = 'SJ') +
scale_x_continuous(trans='log2') +
scale_color_manual(values = col_ids) +
theme(legend.position = "none") +
ggtitle(title) +
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)))
}
plot_sample_density_NOlog <- function(exp, san, LEVELS, col_ids, title) {
exp_long <- exp %>% pivot_longer(!fid, values_to = 'value', names_to = 'id')
exp_long$stage <- san$population_general[match(exp_long$id, san$id)]
exp_long$stage <- factor(exp_long$stage, levels = LEVELS)
exp_long$donor <- san$donor[match(exp_long$id, san$id)]
exp_long %>%
#filter(donor == 'B') %>%
ggplot(aes(x = value, color = id)) +
geom_density(na.rm = T, bw = 'SJ') +
#scale_x_continuous(trans='log2') +
scale_color_manual(values = col_ids) +
theme(legend.position = "none") +
ggtitle(title)
}
p.density.mic_log2 <- plot_sample_density_NOlog(exp_mic_log2, san_mic_temp, levels_mic, col_ids_mic, "Density mic log2")
p.density.mic_vsn <- plot_sample_density_NOlog(exp_mic_vsn, san_mic_temp, levels_mic, col_ids_mic, "Density mic vsn")
fig_hist_mic_norm_pre <- ggarrange(
p.density.mic_log2,
p.density.mic_vsn,
ncol = 1,
nrow = 2)
fig_hist_mic_norm_pre
fig_hist_mic_norm <- annotate_figure(fig_hist_mic_norm_pre,
top = text_grob("Normalisation effects on microRNA", color = "black", face = "bold", size = 18))
fig_hist_mic_norm
pepExp_4box <- function(exp, san){
exp_long <- exp %>% pivot_longer(! fid, names_to = 'id', values_to = "value")
exp_long$stage <- san$population_general[match(exp_long$id, san$id)]
exp_long$stage <- factor(exp_long$stage , levels = c('MB', 'PM', 'MC', 'MM', 'B', 'S', "PMN"))
return(exp_long)
}
exp_mic_log2_long <- pepExp_4box(exp_mic_log2, san_mic_temp)
exp_mic_vsn_long <- pepExp_4box(exp_mic_vsn, san_mic_temp)
#exp_mic_mn_long <- pepExp_4box(exp_mic_mn, san_mic_temp)
#exp_mic_qn_long <- pepExp_4box(exp_mic_qn, san_mic_temp)
plot_box_4normCheck <- function(exp_long, title){
exp_long %>%
arrange(stage) %>%
ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
geom_boxplot() +
#theme(axis.text.x=element_text(angle=45, hjust=1)) +
#coord_flip() +
ggtitle(title) +
xlab('Sample')+
ylab("Expression")+
theme(axis.text.x=element_text(angle=45, hjust=1),
axis.text=element_text(size=7)) +
scale_fill_manual(values = colors_stage)
}
plot_box_4normCheck_logY <- function(exp_long, title){
exp_long %>%
arrange(stage) %>%
ggplot(aes(x = interaction(id, stage), y = value, fill = stage)) +
geom_boxplot() +
#theme(axis.text.x=element_text(angle=45, hjust=1)) +
#coord_flip() +
ggtitle(title) +
xlab('Sample')+
ylab("Expression")+
theme(axis.text.x=element_text(angle=45, hjust=1),
axis.text=element_text(size=7)) +
scale_fill_manual(values = colors_stage) +
scale_y_continuous(trans='log2')
}
p.box.mic_log2 <- plot_box_4normCheck(exp_mic_log2_long, "mic log2")
p.box.mic_vsn <- plot_box_4normCheck(exp_mic_vsn_long, "mic vsn")
#p.box.mic_mn <- plot_box_4normCheck_logY(exp_mic_mn_long, "mic mn")
#p.box.mic_qn <- plot_box_4normCheck_logY(exp_mic_qn_long, "mic qn")
fig_boxes_mic_pre <- ggarrange(
p.box.mic_log2,
p.box.mic_vsn,
legend = 'none')
fig_boxes_mic_pre
fig_boxes_mic <- annotate_figure(fig_boxes_mic_pre,
top = text_grob("Expression per sample in mic", color = "black", face = "bold", size = 20))
fig_boxes_mic
How do the technical repeat samples behave? Samples were distributed in aliquots at the date of collection.
# isolate techReps
san_mic_techRep <- san_mic_temp %>% filter(has_tech_rep == T)
# all tech Reps are the same subpopulation!
# fct to get IDs by Yidi
get_Rep_ids_mic <- function(DONOR, STAGE){
san_mic_techRep %>%
dplyr::filter(donor == DONOR) %>%
dplyr::filter(population_general == STAGE) %>%
pull(id)
}
donors_mic <- unique(san_mic_techRep$donor)
stages_mic <- unique(san_mic_techRep$population_general)
ids <- list()
for(i in donors_mic){
for(j in stages_mic){
id <- paste(i,j,sep = '_')
ids[[id]] <- get_Rep_ids_mic(i,j)
}
}
rep_ids_mic <- purrr::compact(ids)
rep_ids_mic # 3 triplets, one single. Do manually
# plots
plot_corr_TechReps <- function(samples, data){
data %>% dplyr::select(all_of(samples)) %>%
rename(Rep1 = 1, Rep2 = 2) %>%
ggplot(aes(x = Rep1, y = Rep2)) +
geom_point() +
stat_cor(method = "spearman", size = 3, na.rm = T)
}
# one sample with 3 reps
# "G23" "G24" "G25"
triplets_mic <- list(
C_lB_1 = c('Rm1_LB_C', 'Rm2_LB_C'),
C_lB_2 = c('Rm1_LB_C', 'Rm3_LB_C'),
D_PMN_1 = c('Rm1_PMN_D', 'Rm2_PMN_D'),
D_PMN_2 = c('Rm1_PMN_D', 'Rm3_PMN_D'))
pl.corr_mic_triplet <- purrr::map(triplets_mic, plot_corr_TechReps, exp_mic_log2)
fig_RepCor_mic_trip_pre <- ggarrange(
pl.corr_mic_triplet$C_lB_1 + ggtitle("Donor C, B, 1vs2"),
pl.corr_mic_triplet$C_lB_2 + ggtitle("Donor C, B, 1vs3"),
pl.corr_mic_triplet$D_PMN_1 + ggtitle("Donor D, PMN, 1vs2"),
pl.corr_mic_triplet$D_PMN_2 + ggtitle("Donor D, PMN, 1vs3"),
legend = 'none')
fig_RepCor_mic_trip_pre
# define sample to remove
# none!
# combine all replicates
# extract data
extract_PC1.2 <- function(res.pca){
dat.pc12 <- res.pca$x %>% as.data.frame() %>%
rownames_to_column('id') %>%
dplyr::select(id, PC1, PC2)
}
prep4_pca <- function(exp, san) {
exp <- exp[complete.cases(exp),]
fid <- exp$fid
exp$fid <- NULL
sid <- colnames(exp)
expT <- t(exp)
expT <- matrix(as.numeric(expT), # Convert to numeric matrix
ncol = ncol(expT))
expT <- scale(expT, center = T, scale = T)
colnames(expT) <- fid
rownames(expT) <- sid
res.pca <- prcomp(expT)
dat.pca <- extract_PC1.2(res.pca)
dat.pca$stage <- san$population_general[match(dat.pca$id, san$id)]
return(dat.pca)
}
san_mic_temp$population_general <- factor(san_mic_temp$population_general, levels = levels_mic)
# PCA unlabeled
dat.pca.mic_log2 <- prep4_pca(exp_mic_log2, san_mic_temp)
dat.pca.mic_vsn <- prep4_pca(exp_mic_vsn, san_mic_temp)
plot_myPCA <- function(dat.pca, title){
dat.pca %>%
ggplot(aes(x= PC1, y = PC2, color = stage))+
geom_point(size = 3)+
scale_color_manual(values = colors_stage)+
ggtitle(title)
}
plot_myPCA_labeled <- function(dat.pca){
dat.pca %>%
ggplot(aes(x= PC1, y = PC2, color = stage, label = id))+
geom_point(size = 3)+
geom_text_repel() +
scale_color_manual(values = colors_stage)
}
p.pca.mic_log2 <- plot_myPCA(dat.pca.mic_log2, 'log2')
p.pca.mic_vsn <- plot_myPCA(dat.pca.mic_vsn, 'vsn')
fig_pca_mic_pre <- ggarrange(
p.pca.mic_log2,
p.pca.mic_vsn,
legend = 'none')
fig_pca_mic_pre
fig_pca_mic <- annotate_figure(fig_pca_mic_pre,
top = text_grob("PCAs of normalisations in mic", color = "black", face = "bold", size = 20))
fig_pca_mic
plot_myPCA_labeled(dat.pca.mic_vsn) + ggtitle("PCA of microRNA, log2")
# No outliers to remove
toRemove_mic_outliers <- c('Rm_eB_C')
# san
san_mic_clean <- san_mic_temp %>% filter(! id %in% toRemove_mic_outliers)
nrow(san_mic_clean) # 28 samples
nrow(san_mic_temp) # 29. OK!
## exp
exp_mic_log2_clean <- exp_mic_log2[! colnames(exp_mic_log2) %in% toRemove_mic_outliers]
exp_mic_vsn_clean <- exp_mic_vsn[! colnames(exp_mic_vsn) %in% toRemove_mic_outliers]
# form medians via function
combine_samples_mic <- function(exp){
exp_c <- exp
## A: none
## C
#B
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(lB_c = median(c(Rm1_LB_C, Rm2_LB_C, Rm3_LB_C), na.rm = T)) %>%
select(- c(Rm1_LB_C, Rm2_LB_C, Rm3_LB_C))
## D
# PMN
exp_c <- exp_c %>%
rowwise() %>%
dplyr::mutate(PMN_d = median(c(Rm1_PMN_D, Rm2_PMN_D, Rm3_PMN_D), na.rm = T)) %>%
select(- c(Rm1_PMN_D, Rm2_PMN_D, Rm3_PMN_D))
# export
return(exp_c)
}
exp_mic_log2_clean_comb <- combine_samples_mic(exp_mic_log2_clean)
exp_mic_vsn_clean_comb <- combine_samples_mic(exp_mic_vsn_clean)
# sort(colnames(exp_mic_log2_clean_comb))
rename_cols_mic <- function(exp){
exp_c <- exp
exp_c %<>% rename(eMC_a = mR_eMC_A)
exp_c %<>% rename(lMC_a = mR_lMC_A)
exp_c %<>% rename(B_a = mR_B_A)
exp_c %<>% rename(lMC_b = mR_LMC_B)
exp_c %<>% rename(MB_a = mR_MB_A)
exp_c %<>% rename(MB_e = mR_MB_E)
exp_c %<>% rename(MM_a = mR_MM_A)
exp_c %<>% rename(MM_b = mR_MM_B)
exp_c %<>% rename(PMN_b = mR_PMN_B)
exp_c %<>% rename(PMN_c = mR_PMN_C)
exp_c %<>% rename(S_a = mR_S_A)
exp_c %<>% rename(PMN_a = mR1_PMN_A)
exp_c %<>% rename(eMC_c = Rm_eMC_C)
exp_c %<>% rename(eMC_d = Rm_eMC_D)
exp_c %<>% rename(lB_d = Rm_lB_D)
exp_c %<>% rename(MM_c = Rm_MM_C)
exp_c %<>% rename(MM_d = Rm_MM_D)
exp_c %<>% rename(MM_e = Rm_MM_E)
exp_c %<>% rename(PMN_e = Rm_PMN_E)
exp_c %<>% rename(S_c = Rm_S_C)
exp_c %<>% rename(S_e = Rm_S_E)
exp_c %<>% rename(eB_e = Rm_eB_E)
return(exp_c)
}
# exec
exp_mic_log2_preF <- rename_cols_mic(exp_mic_log2_clean_comb)
exp_mic_vsn_preF <- rename_cols_mic(exp_mic_vsn_clean_comb)
sort(colnames(exp_mic_log2_preF))
sort(colnames(exp_mic_vsn_preF))
# order cols
order_colnames_mic <- sort(colnames(exp_mic_log2_preF))
order_colnames_mic <- moveMe(order_colnames_mic, "fid first")
exp_mic_log2_preF <- exp_mic_log2_preF[order_colnames_mic]
exp_mic_vsn_preF <- exp_mic_vsn_preF[order_colnames_mic]
# copy to gSheets
# write_clip(san_mic_clean)
#colnames(exp_tpm_vsn_clean_comb)[-1]
#write_clip(colnames(exp_tpm_vsn_clean_comb)[-1])
san_mic_preF <- read_csv("/home/isar/Dropbox/scn_moa/data/gnp/microRNA/san_mic_final - Sheet1.csv")
san_mic_preF %<>% filter(! is.na(label_final))
setdiff(colnames(exp_mic_log2_preF), san_mic_preF$label_final) # OK
setdiff(san_mic_preF$label_final, colnames(exp_mic_log2_preF)) # OK
## add sex and age
# load mRNA data
load(file = paste0(dir_moa, "/data/final/", "gnp.mrn.RData"))
san_mrn <- gnp.mrn$san_mrn
san_mic_preF$donor <- tolower(san_mic_preF$donor)
san_mic_preF$sex <- san_mrn$sex[match(san_mic_preF$donor, san_mrn$donor)]
san_mic_preF$age <- san_mrn$age[match(san_mic_preF$donor, san_mrn$donor)]
# Finalize
san_mic_fin <- san_mic_preF %>% select(
id = label_final,
stage = population_general,
stage_nr = population_hirachical_number,
donor = donor,
sex = sex,
age = age
)
san_mic_fin$idI <- make_unique(san_mic_fin$stage, sep = "_", wrap_in_brackets = FALSE)
san_mic_fin <- san_mic_fin[order(match(san_mic_fin$id, colnames(exp_mic_log2_preF)[-1])),]
identical(san_mic_fin$id, colnames(exp_mic_log2_preF)[-1]) # oK!
# sequencing details
mic_sequencing_details <- san_mic_preF # is also san raw? no thats the temp bc of deleted samples
# check NAs
NAinlg2_mic <- is.na(exp_mic_log2_preF)
NAinVSN_mic <- is.na(exp_mic_vsn_preF)
identical(NAinlg2_mic, NAinVSN_mic) # OK
# ALL DATA HAVE THE SAME NAs!
# COUNTS
exp_4filt2_long_mic <- exp_mic_log2_preF %>% pivot_longer(!fid, names_to = 'id', values_to = "value")
exp_4filt2_long_mic$stage <- san_mic_fin$stage[match(exp_4filt2_long_mic$id, san_mic_fin$id)]
# not NA/group
notNAperGeneANDgroup_final_mic <- exp_4filt2_long_mic %>%
group_by(fid, stage) %>%
summarise(sum_notNA = sum(!is.na(value)))
# create filter
summary_notNA_final_mic <- notNAperGeneANDgroup_final_mic %>% group_by(fid) %>%
filter(sum(sum_notNA>1)>1, sum_notNA > 1) %>%
summarise(tot_valid = n(), valid_groups = str_c(stage, collapse = ';'))
table(summary_notNA_final_mic$tot_valid) # 2 need to be excluded
# min2 in all 7 groups = good for DEX
fid_min2INallGroups_final_mic <- summary_notNA_final_mic %>%
filter(tot_valid == 6) %>% pull(fid)
length(fid_min2INallGroups_final_mic) # 283
## remove from exp
exp_mic_lg2_finPRE <- exp_mic_log2_preF %>% filter(fid %in% fid_min2INallGroups_final_mic)
exp_mic_vsn_finPRE <- exp_mic_vsn_preF %>% filter(fid %in% fid_min2INallGroups_final_mic)
exp_mic_lg2_finT <- transpose_FIDcols(exp_mic_lg2_finPRE)
exp_mic_vsn_finT <- transpose_FIDcols(exp_mic_vsn_finPRE)
Using NIPALS
library(mixOmics)
nrow(exp_mic_vsn_finT) # 24
exp_mic_vsn_finT_NIPALS <- impute.nipals(X = exp_mic_vsn_finT, ncomp = 24)
# last check
identical(san_mic_fin$id, rownames(exp_mic_lg2_finT)) # oK!
gnp.mic <- list(
## final data
exp_mic_lg2 = exp_mic_lg2_finT,
exp_mic_vsn = exp_mic_vsn_finT,
exp_mic_vsn_NIPALS = exp_mic_vsn_finT_NIPALS,
# meta data
san_mic = san_mic_fin,
levels_mic = levels_mic,
mic_sequencing_details = mic_sequencing_details,
## raw data
exp_mic_raw = exp_mic_temp,
san_mic_raw = san_mic_temp)
save(
gnp.mic,
file = paste0(dir_moa, "/data/final/", "gnp.mic.RData"))