Look at Thomas et al 2024 and analyze their longitudinal dataset with our pipeline.
rm(list=ls())
library(Matrix, lib.loc = "/projects/lvanderlinden@xsede.org/Rstudio_libs/4.2.2/")
library(scLASER)
## Warning: replacing previous import 'data.table::last' by 'dplyr::last' when
## loading 'scLASER'
## Warning: replacing previous import 'data.table::first' by 'dplyr::first' when
## loading 'scLASER'
## Warning: replacing previous import 'MASS::select' by 'dplyr::select' when
## loading 'scLASER'
## Warning: replacing previous import 'data.table::between' by 'dplyr::between'
## when loading 'scLASER'
## Warning: replacing previous import 'caret::plsda' by 'mixOmics::plsda' when
## loading 'scLASER'
## Warning: replacing previous import 'caret::splsda' by 'mixOmics::splsda' when
## loading 'scLASER'
## Warning: replacing previous import 'caret::nearZeroVar' by
## 'mixOmics::nearZeroVar' when loading 'scLASER'
## Warning: replacing previous import 'dplyr::collapse' by 'nlme::collapse' when
## loading 'scLASER'
## Warning: replacing previous import 'lme4::lmList' by 'nlme::lmList' when
## loading 'scLASER'
## Warning: replacing previous import 'foreach::when' by 'purrr::when' when
## loading 'scLASER'
## Warning: replacing previous import 'mixOmics::map' by 'purrr::map' when loading
## 'scLASER'
## Warning: replacing previous import 'caret::lift' by 'purrr::lift' when loading
## 'scLASER'
## Warning: replacing previous import 'data.table::transpose' by
## 'purrr::transpose' when loading 'scLASER'
## Warning: replacing previous import 'foreach::accumulate' by 'purrr::accumulate'
## when loading 'scLASER'
## Warning: replacing previous import 'Matrix::cov2cor' by 'stats::cov2cor' when
## loading 'scLASER'
## Warning: replacing previous import 'dplyr::filter' by 'stats::filter' when
## loading 'scLASER'
## Warning: replacing previous import 'dplyr::lag' by 'stats::lag' when loading
## 'scLASER'
## Warning: replacing previous import 'Matrix::toeplitz' by 'stats::toeplitz' when
## loading 'scLASER'
## Warning: replacing previous import 'Matrix::update' by 'stats::update' when
## loading 'scLASER'
## Warning: replacing previous import 'Matrix::expand' by 'tidyr::expand' when
## loading 'scLASER'
## Warning: replacing previous import 'Matrix::pack' by 'tidyr::pack' when loading
## 'scLASER'
## Warning: replacing previous import 'Matrix::unpack' by 'tidyr::unpack' when
## loading 'scLASER'
## Warning: replacing previous import 'Matrix::tail' by 'utils::tail' when loading
## 'scLASER'
## Warning: replacing previous import 'Matrix::head' by 'utils::head' when loading
## 'scLASER'
library(kableExtra)
library(ggplot2)
library(dplyr)
library(DT)
library(table1)
source(file="/projects/lvanderlinden@xsede.org/pvalForTable1_updatedForNA.R")
This data was downloaded from: https://zenodo.org/records/14007626 and saved under the IBD_Thomas2024 folder.
data.loc <- "/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024"
features <- list.files(data.loc, recursive="FALSE")
IBD.filenames <- paste0(data.loc, "/", features)
paired_samples <- read.delim(file=paste0(data.loc, "/paired_sample_list.csv"))
dim(paired_samples)
##### Load in fibroblast/pericyte cells ##########
library(zellkonverter)
# read in the h5ad files (fibroblast/pericyte cells have the "fib" in name)
sce <- readH5AD(IBD.filenames[grep("fib", IBD.filenames)])
saveRDS(sce, file="/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/fibroblast.asSCEobj.rds")
meta <- colData(sce)
meta.uniqueSamples <- meta[!duplicated(meta$sample_id),]
nVisits <- as.data.frame(table(meta.uniqueSamples$Patient))
patientCD5 <- meta.uniqueSamples[which(meta.uniqueSamples$Patient=="CD5"),]
exp <- assay(sce, "X")
assayNames(sce)
reducedDimNames(sce)
sapply(reducedDims(sce), dim) # rows = cells, cols = components
head(reducedDim(sce, reducedDimNames(sce)[1]))
harmony_pcs <- reducedDims(sce)$X_harmony
umap <- reducedDims(sce)$X_umap
colnames(umap) <- c("UMAP1", "UMAP2")
save(umap, harmony_pcs, meta, file="/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/fibroblast_harmonyPCs_meta.Rdata")
What does the meta data look like? Let’s check out the first 6 rows.
datatable(head(as.data.frame(meta)))
#make the disease/healthy variable as binary
meta$disease_healthy0_other1 <- ifelse(meta$Disease=="Healthy", 0, 1)
# from the data we read in, make an scLASER object with their metadata, umap and the harmony PCs they calculated.
obj <- scLASER::scLASER(metadata=as.data.frame(meta), umap=as.data.frame(umap), harmony = as.matrix(harmony_pcs))
For this report, make a neighborhood abundence matrix (NAM) and reduce down by PCA and take top 30 PCs moving forward.
# make NAM and get the top 30 NAM PCs
obj = scLASER::association_nam_LV(obj = obj,
test_var = "disease_healthy0_other1",
#test_var="Remission_status",
samplem_key = "sample_id",
graph_use = "RNA_nn",
batches = "Batch",
covs = c("Age", "Gender"),
nsteps = NULL,
verbose=TRUE,
assay=NULL,
key="NAMPC_",
maxnsteps=15L,
max_frac_pcs=0.15, # added option to pass number of PCs, for potential speedups
n_pcs = 30,
ks=NULL,
Nnull=1000,
force_permute_all=FALSE,
local_test=TRUE,
seed=1234,
return_nam=TRUE
)
# This step takes some time depending on how large your dataset. Would reccommend saving after this point.
saveRDS(obj, file="/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/fibroblast.scLASERobj.rds")
Let’s look at what the NAM PC matrix looks like. These will be our outcomes in the subsequent linear mixed effect models.
datatable(as.data.frame(head(obj@nam_pcs)), rownames = FALSE)
Before we perform the association testing, let’s look at some things like do all patients have samples from the same sites and so on.
Looking at each sample do we have missing information that we need? The main thing are our treatment (time status as pre/post) and the disease (remission vs non-remission)
# Check treatment (pre/post) information:
table(obj@metadata$Treatment, useNA = "ifany")
##
## Post Pre <NA>
## 48178 48701 6174
# Check remission status:
table(obj@metadata$Remission_status, useNA = "ifany")
##
## Non_Remission None Remission Not_avail
## 38422 6174 57391 1066
# Lets look at this together:
table(obj@metadata$Remission_status, obj@metadata$Treatment, useNA = "ifany")
##
## Post Pre <NA>
## Non_Remission 15388 23034 0
## None 0 0 6174
## Remission 32790 24601 0
## Not_avail 0 1066 0
## remove the missing Treatment and Remission Status that is not available.
keep <- !is.na(obj@metadata$Treatment) &
obj@metadata$Remission_status %in% c("Remission", "Non_Remission")
obj2 <- obj
obj2@metadata <- obj@metadata[keep, , drop = FALSE]
obj2@umap <- obj@umap[keep, , drop = FALSE]
obj2@harmony <- obj@harmony[keep, , drop = FALSE]
obj2@nam_pcs <- obj@nam_pcs[keep, , drop = FALSE]
# check remission status
table(obj2@metadata$Remission_status, useNA = "ifany")
##
## Non_Remission None Remission Not_avail
## 38422 0 57391 0
# check the treatment (i.e.timepoint)
table(obj2@metadata$Treatment, useNA = "ifany")
##
## Post Pre
## 48178 47635
#looks good. Now let's get get a dataframe together to look at the table 1s:
obj2@metadata$Disease <- droplevels(obj2@metadata$Disease)
obj2@metadata$sample_id <- droplevels(obj2@metadata$sample_id)
obj2@metadata$Patient <- droplevels(obj2@metadata$Patient)
forAnalysis <- data.frame(obj2@metadata, obj2@umap, obj2@nam_pcs)
sampleLevel <- forAnalysis[!duplicated(forAnalysis$sample_id),]
table1(~ Disease + Site + Treatment | Remission_status, data=sampleLevel, overall=FALSE, extra.col=list(`p-value` = pvalue))
| Non_Remission (N=99) |
Remission (N=101) |
p-value | |
|---|---|---|---|
| Disease | |||
| CD | 28 (28.3%) | 68 (67.3%) | <0.001 |
| UC | 71 (71.7%) | 33 (32.7%) | |
| Site | |||
| Ascending_Colon | 17 (17.2%) | 18 (17.8%) | 0.889 |
| Descending_Colon | 26 (26.3%) | 29 (28.7%) | |
| Rectum | 27 (27.3%) | 26 (25.7%) | |
| Sigmoid | 14 (14.1%) | 10 (9.9%) | |
| Terminal_Ileum | 15 (15.2%) | 18 (17.8%) | |
| Treatment | |||
| Post | 48 (48.5%) | 52 (51.5%) | 0.777 |
| Pre | 51 (51.5%) | 49 (48.5%) |
Looking at this table1, we will diffinately need to adjust for disease type as more CD go into remission than UC. Also, looking at the sites, they are not equal. We should adjust for this too.
Let’s take a look at the participants, and how many sites they have/time period.
nSites_perTime <- as.data.frame(table(sampleLevel$Patient, sampleLevel$Treatment))
colnames(nSites_perTime) <- c("Patient", "Treatment", "N Samples")
getSites <- function(patient, time){
tmp <- sampleLevel[which(sampleLevel$Patient==patient & sampleLevel$Treatment==time),]
if(nrow(tmp)==0){sites = "none"}else{sites = paste(tmp$Site, collapse=", ")}
return(sites)
}
nSites_perTime$Sites <- apply(nSites_perTime, 1, function(a) getSites(a[1], a[2]))
nSites_perTime <- nSites_perTime[order(nSites_perTime$Patient),]
datatable(nSites_perTime, rownames=FALSE)
Looking at this table, some patients have matching sites and some don’t. This is something our approach can handle. But we just want to note this.
Now let’s look at demographics on the patient level.
patientLevel <- sampleLevel[!duplicated(sampleLevel$Patient),]
nSites_pre <- nSites_perTime[which(nSites_perTime$Treatment == "Pre"), c(1,3)]
colnames(nSites_pre)[2] <- "nSamples_Pre"
nSites_post <- nSites_perTime[which(nSites_perTime$Treatment == "Post"), c(1,3)]
colnames(nSites_post)[2] <- "nSamples_Post"
patientLevel <- merge(patientLevel, nSites_pre, by="Patient")
patientLevel <- merge(patientLevel, nSites_post, by="Patient")
table1(~ Disease + Age + Gender + Ethnicity + nSamples_Pre + nSamples_Post | Remission_status, data=patientLevel, overall=FALSE, extra.col=list(`p-value` = pvalue))
## Warning in chisq.test(table(y, g)): Chi-squared approximation may be incorrect
| Non_Remission (N=20) |
Remission (N=16) |
p-value | |
|---|---|---|---|
| Disease | |||
| CD | 6 (30.0%) | 10 (62.5%) | 0.107 |
| UC | 14 (70.0%) | 6 (37.5%) | |
| Age | |||
| Mean (SD) | 33.8 (10.1) | 34.1 (11.0) | 0.928 |
| Median [Min, Max] | 31.5 [17.0, 55.0] | 32.5 [17.0, 61.0] | |
| Gender | |||
| F | 10 (50.0%) | 6 (37.5%) | 0.68 |
| M | 10 (50.0%) | 10 (62.5%) | |
| Ethnicity | |||
| Afro-Carribean | 0 (0%) | 1 (6.3%) | 0.266 |
| Caucasian | 20 (100%) | 14 (87.5%) | |
| Middle_Eastern | 0 (0%) | 1 (6.3%) | |
| nSamples_Pre | |||
| Mean (SD) | 2.55 (0.887) | 3.06 (0.929) | 0.103 |
| Median [Min, Max] | 2.50 [1.00, 4.00] | 3.00 [1.00, 4.00] | |
| nSamples_Post | |||
| Mean (SD) | 2.40 (1.23) | 3.25 (1.13) | 0.0381 |
| Median [Min, Max] | 3.00 [0, 4.00] | 3.50 [1.00, 5.00] |
Let’s check out a basic chi square of the cell types.
table1(~ final_analysis | Remission_status, data=forAnalysis, overall=FALSE, extra.col=list(`p-value` = pvalue))
| Non_Remission (N=38422) |
Remission (N=57391) |
p-value | |
|---|---|---|---|
| final_analysis | |||
| ABCA8pos WNT2Bpos FOShi fibroblast | 3665 (9.5%) | 7349 (12.8%) | <0.001 |
| ABCA8pos WNT2Bpos FOSlo fibroblast | 8043 (20.9%) | 15155 (26.4%) | |
| C3hi CCL19pos fibroblast | 668 (1.7%) | 901 (1.6%) | |
| C3hi RSPO3pos fibroblast | 2847 (7.4%) | 5550 (9.7%) | |
| CD74hi HLADRB1hi arterial pericyte | 5855 (15.2%) | 6089 (10.6%) | |
| CD74hi HLADRB1hi venous pericyte | 318 (0.8%) | 231 (0.4%) | |
| Myofibroblast | 1402 (3.6%) | 2026 (3.5%) | |
| NOTCH3hi MYH11pos pericyte | 1885 (4.9%) | 1974 (3.4%) | |
| NOTCH3hi TNChi LOXL2pos pericyte | 1232 (3.2%) | 811 (1.4%) | |
| NOTCH3hi TNCint CCL19pos pericyte | 1029 (2.7%) | 859 (1.5%) | |
| NOTCH3hi TNClo pericyte | 4325 (11.3%) | 3625 (6.3%) | |
| SOX6pos POSTNpos NRG1hi NPYpos fibroblast | 1532 (4.0%) | 2863 (5.0%) | |
| SOX6pos POSTNpos fibroblast | 4567 (11.9%) | 9444 (16.5%) | |
| THY1pos FAPpos PDPNpos fibroblast | 1054 (2.7%) | 514 (0.9%) |
Now let’s actually perform the association analysis to identify NAM scores (NAM PCs) associated with the time*remission.
# make the remission and time variables numeric (0/1)
obj2@metadata$remission0_nonRemission1 <- ifelse(obj2@metadata$Remission_status=="Remission", 0, 1)
obj2@metadata$time_pre0_post1 <- ifelse(obj2@metadata$Treatment=="Pre", 0, 1)
#also make sure the
head(obj2@nam_pcs)
rownames(obj2@nam_pcs) <- obj2@metadata$sample_id
obj2 <- longAnalyses_plusModelSelection(obj2,
disease_var = "remission0_nonRemission1",
time_var = "time_pre0_post1",
covariates = c("Age", "Gender", "Disease", "Site"),
subject_var = "Patient",
sample_var = "sample_id"
)
saveRDS(obj2, file="/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/fibroblast.scLASERobj.wAnalysis.rds")
results <- obj2@pipeline_output
results.int <- results[grep(":", results$coefficient),]
Subset to only the interaction term for easy viewing and also can adjust for multiple testing as well.
results.int$FDR <- p.adjust(results.int$p.value, method="BH")
results.int$Bonferroni <- p.adjust(results.int$p.value, method = "bonferroni")
toPrint <- results.int[,c("PC", "p.value", "FDR")]
colnames(toPrint)[1:2] <- c("NAM Score", "p-value")
kable(toPrint, row.names=0, align='l') %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
| NAM Score | p-value | FDR |
|---|---|---|
| NAM_PC1 | 0.0002670 | 0.0080109 |
| NAM_PC2 | 0.7267870 | 0.9151403 |
| NAM_PC3 | 0.1653085 | 0.4749787 |
| NAM_PC4 | 0.1918990 | 0.4797475 |
| NAM_PC5 | 0.3219551 | 0.6899037 |
| NAM_PC6 | 0.6048631 | 0.9151403 |
| NAM_PC7 | 0.8864153 | 0.9281942 |
| NAM_PC8 | 0.0955415 | 0.3774913 |
| NAM_PC9 | 0.0261702 | 0.1962767 |
| NAM_PC10 | 0.7286994 | 0.9151403 |
| NAM_PC11 | 0.0934054 | 0.3774913 |
| NAM_PC12 | 0.6862049 | 0.9151403 |
| NAM_PC13 | 0.6570907 | 0.9151403 |
| NAM_PC14 | 0.6743436 | 0.9151403 |
| NAM_PC15 | 0.0145588 | 0.1455880 |
| NAM_PC16 | 0.4812335 | 0.9023129 |
| NAM_PC17 | 0.7211096 | 0.9151403 |
| NAM_PC18 | 0.8377547 | 0.9281942 |
| NAM_PC19 | 0.2082144 | 0.4804948 |
| NAM_PC20 | 0.0774626 | 0.3774913 |
| NAM_PC21 | 0.7398858 | 0.9151403 |
| NAM_PC22 | 0.4437195 | 0.8874391 |
| NAM_PC23 | 0.0104816 | 0.1455880 |
| NAM_PC24 | 0.8972544 | 0.9281942 |
| NAM_PC25 | 0.1072389 | 0.3774913 |
| NAM_PC26 | 0.8679080 | 0.9281942 |
| NAM_PC27 | 0.1741588 | 0.4749787 |
| NAM_PC28 | 0.9436909 | 0.9436909 |
| NAM_PC29 | 0.1132474 | 0.3774913 |
| NAM_PC30 | 0.7626169 | 0.9151403 |
We have 1 resulting association score from the model: NAM PC1
pc1.res <- results[which(results$PC=="NAM_PC1"), ]
#round and format pretty to make viewable
pc1.res$p.value <- formatC(pc1.res$p.value, format="e", digits=2)
pc1.res$t.value <- round(pc1.res$t.value, digits=2)
pc1.res$Value <- formatC(pc1.res$Value, format="e", digits=2)
pc1.res$Std.Error <- formatC(pc1.res$Std.Error, format="e", digits=2)
datatable(pc1.res, options = list(
pageLength = 11))
library(dplyr); library(tidyr); library(ggplot2)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
##
## expand
## The following object is masked from 'package:magrittr':
##
## extract
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
forAnalysis <- data.frame(obj2@metadata, obj2@nam_pcs, obj2@umap)
colnames(forAnalysis)[grep("PC", colnames(forAnalysis))] <- paste0("NAM_", colnames(forAnalysis)[grep("PC", colnames(forAnalysis))])
forAnalysis$Treatment <- factor(forAnalysis$Treatment, levels = c("Pre","Post"))
paired_dat <- forAnalysis %>%
dplyr::group_by(Patient, Treatment, Remission_status) %>%
dplyr::summarise(NAM_PC1 = median(NAM_PC1, na.rm = TRUE), .groups = "drop") %>%
tidyr::pivot_wider(names_from = Treatment, values_from = NAM_PC1) %>%
tidyr::drop_na(Pre, Post) %>%
tidyr::pivot_longer(c(Pre, Post), names_to = "Treatment", values_to = "NAM_PC1") %>%
mutate(Treatment = factor(Treatment, levels = c("Pre","Post")))
paired_dat$Treatment <- factor(paired_dat$Treatment,
levels = c("Pre","Post"),
labels = c("Pre-Tx","Post-Tx"))
paired_dat$Remission_status <- gsub("Non_Remission", "Non-remission", paired_dat$Remission_status)
p4 <- ggplot(paired_dat, aes(x = Treatment, y = NAM_PC1)) +
# colored boxplots behind
geom_boxplot(aes(group = Treatment, fill = Remission_status),
width = 0.55, outlier.shape = NA,
color = "grey50", alpha = 0.45) +
# grey connecting lines
geom_line(aes(group = Patient), color = "grey70", linewidth = 0.6, alpha = 0.7) +
# black points
geom_point(color = "black", size = 2.6) +
facet_wrap(~ Remission_status, nrow = 1) +
scale_fill_manual(values = c("Non-remission" = "#4DAF4A", "Remission" = "#984EA3")) +
labs(x = "Time", y = "Association score") +
theme_classic(base_size = 16) +
theme(
strip.text = element_text(size = 16, face = "bold"),
axis.title = element_text(size = 18),
axis.text = element_text(size = 14),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
)
p4
#### UMAP with PC
fig_umap2 <- ggplot() +
geom_point(
data = forAnalysis[sample(nrow(forAnalysis)), ],
mapping = aes(x = UMAP1, y = UMAP2, fill = NAM_PC1),
size = 0.5, stroke = 0.01, shape = 21
) +
scale_fill_gradient2(
low = "navy", # Low values (dark blue)
mid = "white", # Middle values (white)
high = "darkred", # High values (dark red)
midpoint = 0 # Set the midpoint to 0
) +
labs(
x = "UMAP1",
y = "UMAP2",
#title = "IBD Fibroblasts: Association Score",
#subtitle = "N cells = 100, N samples = 30"
) +
theme_bw(base_size = 20) +
theme(
legend.position = "right",
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color = "black", size = 20),
plot.subtitle = element_text(color = "black", size = 10)
)
fig_umap2
The cell types are defined from the paper. What does this look like on the UMAP?
#### PLOTS #####
### UMAP by cell type from paper
library(dplyr)
cluster_center <- data.frame(forAnalysis) %>%
group_by(final_analysis) %>%
summarise(
median_UMAP1 = median(UMAP1, na.rm = TRUE),
median_UMAP2 = median(UMAP2, na.rm = TRUE)
)
cluster_center <- as.data.frame(cluster_center)
library(RColorBrewer)
# your inputs
want <- c("NOTCH3hi TNClo pericyte",
"NOTCH3hi TNCint CCL19pos pericyte",
"NOTCH3hi TNChi LOXL2pos pericyte",
"C3hi CCL19pos fibroblast")
library(stringr)
library(RColorBrewer)
cols_11 <- brewer.pal(n=11, "RdYlBu")
cols_extra3 <- brewer.pal(n=9, "RdGy")[c(7:9)]
cols_order_want <- c(cols_11[c(11, 10, 9, 8, 7)], cols_extra3, cols_11[c(6, 5, 4, 3, 2, 1)])
library(forcats)
forAnalysis <- forAnalysis %>%
filter(!is.na(final_analysis), !is.na(NAM_PC1)) %>%
mutate(cell_type = fct_reorder(final_analysis, NAM_PC1, .fun = median, na.rm = TRUE))
# ensure the factor levels in your plotting data match the palette order
#forAnalysis2$final_analysis <- factor(forAnalysis2$final_analysis, levels = names(cols_in_level_order))
library(stringr)
lab_wrap <- str_wrap(levels(forAnalysis$cell_type), width = 28) # tweak width
cell_type <- levels(forAnalysis$cell_type)
g2 <- ggplot() +
geom_point(
data = forAnalysis[sample(nrow(forAnalysis)), ],
aes(x = UMAP1, y = UMAP2, fill = cell_type),
size = 0.4, stroke = 0.0001, shape = 21
) +
scale_fill_manual(
values = cols_order_want,
breaks = cell_type,
labels = lab_wrap, # wrapped labels
drop = FALSE,
name = "Cell Type"
) +
guides(fill = guide_legend(
override.aes = list(size = 3),
keyheight = unit(2.5, "mm"),
keywidth = unit(2.5, "mm")
)) +
labs(x = "UMAP1", y = "UMAP2", title = "IBD Stroma Cells") +
theme_bw(base_size = 9) +
theme(
legend.position = "bottom",
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color = "black", size = 9),
plot.subtitle = element_text(color = "black", size = 7),
legend.box = "horizontal",
legend.title = element_text(size = 4),
legend.box.spacing = unit(2, "pt"), # space between plot and legend
legend.margin = margin(0, 0, 0, 0), # margin around the whole legend
legend.key.height = unit(4, "pt"), # key height (colorbar thickness area)
legend.key.width = unit(60, "pt"), # key length
plot.margin = margin(5, 5, 5, 5) # trim outer plot margins if needed
)
g2
#### Boxplot with Cell Type.
library(forcats)
library(ggplot2)
# order levels of final_analysis from lowest → highest median NAM_PC1
forAnalysis <- forAnalysis |>
mutate(final_analysis = fct_reorder(final_analysis, NAM_PC1,
.fun = ~median(., na.rm = TRUE),
.desc = FALSE))
#levels(forAnalysis2$final_analysis)
## get colors for cell types updated to RedBlue from RColorBrewer;
library(RColorBrewer)
cols_11 <- brewer.pal(n=11, "RdYlBu")
cols_extra3 <- brewer.pal(n=9, "RdGy")[c(7:9)]
cols_order_want <- c(cols_11[c(11, 10, 9, 8, 7)], cols_extra3, cols_11[c(6, 5, 4, 3, 2, 1)])
library(stringr)
p2 <- ggplot(forAnalysis,
aes(x = final_analysis, y = NAM_PC1, fill = final_analysis)) +
geom_boxplot(
position = position_dodge(width = 0.8),
outlier.alpha = 0.35,
color = "black", # keep black outlines/median
fatten = 1
) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Cell Type", y = "Association score") +
scale_fill_manual(values = cols_order_want, drop = FALSE) +
guides(fill = "none") + # remove legend; rely on x-axis names
scale_x_discrete(
labels = function(x) str_wrap(x, width = 20) # wrap long names
# or: guide = guide_axis(angle = 55, n.dodge = 2)
) +
theme_classic() +
theme(
axis.text = element_text(size = 15), # tick labels (both axes)
axis.text.x = element_text(angle = 55, hjust = 1, vjust = 1, size = 12),
axis.title.x = element_text(size = 15), # x-axis title
axis.title.y = element_text(size = 15), # y-axis title
axis.ticks.x = element_line()
)
p2
First let’s calculate the ROCs for each of the cell types.
#### Use Prediction for PCs ####
library(Matrix)
library(lme4)
library(pROC)
library(ggplot2)
library(dplyr)
##### Function to Get ROC Info #####
getROCobj <- function(cellType_want, data, seed){
#set up dataset
tmp <- data
tmp$cellType <- ifelse(tmp$cell_type==cellType_want, 1, 0)
#perform model
mod <- glm(cellType ~ NAM_PC1, data=tmp, family=binomial)
pred_probs <- stats::predict(mod, type="response")
#get ROC
roc_obj <- roc(tmp$cellType, pred_probs)
auc_value <- pROC::auc(roc_obj)
ci_auc <- pROC::ci.auc(roc_obj, conf.level = 0.95) # DeLong by default
set.seed(seed)
ci_band <- pROC::ci.se(
roc_obj,
specificities = seq(0, 1, by = 0.01),
boot.n = 100,
boot.stratified = TRUE,
progress = "none"
)
# Save output we need
want <- list(roc_obj, auc_value, ci_auc, ci_band)
names(want) <- c("roc_obj", "auc", "ci.auc", "ci.se")
return(want)
}
#### Get the ROC Info for Each Cell Type ####
forAnalysis$cell_type <- forAnalysis$final_analysis
cellTypes <- unique(forAnalysis$final_analysis)
ibd_roc_c3hi_ccl19plus <- getROCobj(cellType_want = "C3hi CCL19pos fibroblast", forAnalysis, 2020)
ibd_roc_thy1pos <- getROCobj(cellType_want = "THY1pos FAPpos PDPNpos fibroblast", forAnalysis, 2020)
ibd_roc_notch3hi_tnchi <- getROCobj(cellType_want = "NOTCH3hi TNChi LOXL2pos pericyte", forAnalysis, 2020)
ibd_roc_notch3hi_tnclo <- getROCobj(cellType_want = "NOTCH3hi TNClo pericyte", forAnalysis, 2020)
ibd_roc_CD74hi <- getROCobj(cellType_want = "CD74hi HLADRB1hi venous pericyte", forAnalysis, 2020)
ibd_roc_notch3other <- getROCobj(cellType_want = "NOTCH3hi TNCint CCL19pos pericyte", forAnalysis, 2020)
ibd_roc_ABCA8_FOShi <- getROCobj(cellType_want = "ABCA8pos WNT2Bpos FOShi fibroblast", forAnalysis, 2020)
ibd_roc_ABCA8_FOSlo <- getROCobj(cellType_want = "ABCA8pos WNT2Bpos FOSlo fibroblast", forAnalysis, 2020)
ibd_roc_c3hi_rspo3pos <- getROCobj(cellType_want = "C3hi RSPO3pos fibroblast", forAnalysis, 2020)
ibd_roc_CD74hi_arterial <- getROCobj(cellType_want = "CD74hi HLADRB1hi arterial pericyte", forAnalysis, 2020)
ibd_roc_Myofibroblast <- getROCobj(cellType_want = "Myofibroblast", forAnalysis, 2020)
ibd_roc_notch3hi_myh11 <- getROCobj(cellType_want = "NOTCH3hi MYH11pos pericyte", forAnalysis, 2020)
ibd_roc_sox6_npy <- getROCobj(cellType_want = "SOX6pos POSTNpos NRG1hi NPYpos fibroblast", forAnalysis, 2020)
ibd_roc_sox6_postn <- getROCobj(cellType_want = "SOX6pos POSTNpos fibroblast", forAnalysis, 2020)
#### Plot the ROCs Together###
library(RColorBrewer)
# Put your results into a named list (order as you like)
rocs <- list(
"C3hi CCL19+ fibroblast" = ibd_roc_c3hi_ccl19plus,
"Thy1+ FAP+ PDPN+ fibroblast" = ibd_roc_thy1pos,
"NOTCH3hi TNChi LOXL2pos pericyte" = ibd_roc_notch3hi_tnchi,
"NOTCH3hi TNClo pericyte" = ibd_roc_notch3hi_tnclo,
"CD74hi HLADRB1hi venous pericyte" = ibd_roc_CD74hi,
"NOTCH3hi TNCint CCL19pos pericyte" = ibd_roc_notch3other,
"ABCA8pos WNT2Bpos FOShi fibroblast" = ibd_roc_ABCA8_FOShi,
"ABCA8pos WNT2Bpos FOSlo fibroblast" = ibd_roc_ABCA8_FOSlo,
"C3hi RSPO3pos fibroblast" = ibd_roc_c3hi_rspo3pos,
"CD74hi HLADRB1hi arterial pericyte" = ibd_roc_CD74hi_arterial,
"Myofibroblast" = ibd_roc_Myofibroblast,
"NOTCH3hi MYH11pos pericyte" = ibd_roc_notch3hi_myh11,
"SOX6pos POSTNpos NRG1hi NPYpos fibroblast" = ibd_roc_sox6_npy,
"SOX6pos POSTNpos fibroblast" = ibd_roc_sox6_postn
)
saveRDS(rocs, file="/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/longMod_v1/NAMPC1_cellType_ROCS.rds")
Plot just the to performing ones (NOTCH3) and one that is somewhat in the middle.
rocs_all <- readRDS(file="/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/longMod_v1/NAMPC1_cellType_ROCS.rds")
names(rocs_all) <- gsub("+", "pos", names(rocs_all), fixed=TRUE)
want <- c("NOTCH3hi TNClo pericyte",
"NOTCH3hi TNCint CCL19pos pericyte",
"NOTCH3hi TNChi LOXL2pos pericyte",
"C3hi CCL19pos fibroblast")
rocs <- rocs_all[which(names(rocs_all) %in% want)]
# Colors (use the colors that you defined above). Use lighter alpha for CI bands.
cols <- cols_order_want[c(8, 1, 3, 2)]
fills <- vapply(cols, \(z) adjustcolor(z, alpha.f = 0.25), character(1))
pdf(
file = "/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/longMod_v1/NAMPC1_ROC_fibroblasts_fig2_select4CT_newColors.pdf",
width = 9, height = 7.6
)
## -------- Layout: plot (top) + custom legend (bottom) --------
layout(matrix(c(1, 2), nrow = 2, byrow = TRUE), heights = c(6.0, 1.6))
## -------- Top panel (ROC) --------
par(mar = c(5.2, 5.2, 3.8, 2.2), cex = 1.15, cex.axis = 1.1, cex.lab = 1.3)
# remove axis padding to kill the white space
op <- par(xaxs = "i", yaxs = "i"); on.exit(par(op), add = TRUE)
first <- rocs[[1]]
plot(
first$roc_obj,
main = "Association Score Predicting Cell Type",
col = NA,
legacy.axes = TRUE, # keep classic "1 - Specificity" axis
xlim = c(0.5, 0), # <-- 0→1 left-to-right for legacy axes
ylim = c(0, 1), # clamp y as well
lwd = 5, # thicker baseline
cex.main = 2.55
)
# CI polygons (UNCHANGED from your working version)
i <- 0
for (nm in names(rocs)) {
i <- i + 1
ci_mat <- as.matrix(rocs[[nm]]$ci.se)
sp <- as.numeric(rownames(ci_mat))
polygon(
c(sp, rev(sp)),
c(ci_mat[, 1], rev(ci_mat[, 3])),
col = fills[i], border = NA
)
}
# ROC lines (thicker)
i <- 0
for (nm in names(rocs)) {
i <- i + 1
plot(rocs[[nm]]$roc_obj, add = TRUE, lwd = 5, col = cols[i])
}
## -------- Bottom panel: custom 2-column legend --------
par(mar = c(0.2, 0.4, 0.2, 0.4))
plot.new()
par(xpd = NA); par(usr = c(0, 1, 0, 1))
make_entry <- function(nm, obj) {
auc_val <- as.numeric(pROC::auc(obj$roc_obj))
ci_val <- try(pROC::ci.auc(obj$roc_obj), silent = TRUE)
line1 <- sprintf("%s:", nm)
line2 <- if (inherits(ci_val, "try-error")) {
sprintf("AUC = %.3f", auc_val)
} else {
sprintf("AUC = %.3f (95%% CI %.3f, %.3f)", auc_val, ci_val[1], ci_val[3])
}
list(line1 = line1, line2 = line2)
}
entries <- lapply(names(rocs), function(nm) make_entry(nm, rocs[[nm]]))
n <- length(entries); half <- ceiling(n/2)
left_idx <- seq_len(half)
right_idx <- if (n > half) (half + 1):n else integer(0)
cex_leg <- 1.05
lh <- strheight("X", cex = cex_leg)
gap <- 3.1 * lh
delta <- 1.15 * lh
y_top <- 0.93
# Column anchors
col1 <- list(x_seg0 = 0.03, x_seg1 = 0.11, x_text = 0.13)
col2 <- list(x_seg0 = 0.53, x_seg1 = 0.61, x_text = 0.63)
# --- Keep segments() inside this function so colspec/idx are in scope ---
draw_col <- function(idxs, colspec, cols_palette) {
for (k in seq_along(idxs)) {
idx <- idxs[k]; ent <- entries[[idx]]
y <- y_top - (k - 1) * gap
# thicker swatch to match ROC lines
segments(
colspec$x_seg0, y - 0.5 * lh,
colspec$x_seg1, y - 0.5 * lh,
lwd = 5, # thickness here
col = cols_palette[idx]
)
text(colspec$x_text, y, ent$line1, adj = c(0, 0.5), cex = cex_leg)
text(colspec$x_text, y - delta, ent$line2, adj = c(0, 0.5), cex = cex_leg)
}
}
draw_col(left_idx, col1, cols)
if (length(right_idx)) draw_col(right_idx, col2, cols)
layout(1); dev.off()