Overview

Look at Thomas et al 2024 and analyze their longitudinal dataset with our pipeline.

rm(list=ls())
library(Matrix, lib.loc = "/projects/lvanderlinden@xsede.org/Rstudio_libs/4.2.2/")
library(scLASER)
## Warning: replacing previous import 'data.table::last' by 'dplyr::last' when
## loading 'scLASER'
## Warning: replacing previous import 'data.table::first' by 'dplyr::first' when
## loading 'scLASER'
## Warning: replacing previous import 'MASS::select' by 'dplyr::select' when
## loading 'scLASER'
## Warning: replacing previous import 'data.table::between' by 'dplyr::between'
## when loading 'scLASER'
## Warning: replacing previous import 'caret::plsda' by 'mixOmics::plsda' when
## loading 'scLASER'
## Warning: replacing previous import 'caret::splsda' by 'mixOmics::splsda' when
## loading 'scLASER'
## Warning: replacing previous import 'caret::nearZeroVar' by
## 'mixOmics::nearZeroVar' when loading 'scLASER'
## Warning: replacing previous import 'dplyr::collapse' by 'nlme::collapse' when
## loading 'scLASER'
## Warning: replacing previous import 'lme4::lmList' by 'nlme::lmList' when
## loading 'scLASER'
## Warning: replacing previous import 'foreach::when' by 'purrr::when' when
## loading 'scLASER'
## Warning: replacing previous import 'mixOmics::map' by 'purrr::map' when loading
## 'scLASER'
## Warning: replacing previous import 'caret::lift' by 'purrr::lift' when loading
## 'scLASER'
## Warning: replacing previous import 'data.table::transpose' by
## 'purrr::transpose' when loading 'scLASER'
## Warning: replacing previous import 'foreach::accumulate' by 'purrr::accumulate'
## when loading 'scLASER'
## Warning: replacing previous import 'Matrix::cov2cor' by 'stats::cov2cor' when
## loading 'scLASER'
## Warning: replacing previous import 'dplyr::filter' by 'stats::filter' when
## loading 'scLASER'
## Warning: replacing previous import 'dplyr::lag' by 'stats::lag' when loading
## 'scLASER'
## Warning: replacing previous import 'Matrix::toeplitz' by 'stats::toeplitz' when
## loading 'scLASER'
## Warning: replacing previous import 'Matrix::update' by 'stats::update' when
## loading 'scLASER'
## Warning: replacing previous import 'Matrix::expand' by 'tidyr::expand' when
## loading 'scLASER'
## Warning: replacing previous import 'Matrix::pack' by 'tidyr::pack' when loading
## 'scLASER'
## Warning: replacing previous import 'Matrix::unpack' by 'tidyr::unpack' when
## loading 'scLASER'
## Warning: replacing previous import 'Matrix::tail' by 'utils::tail' when loading
## 'scLASER'
## Warning: replacing previous import 'Matrix::head' by 'utils::head' when loading
## 'scLASER'
library(kableExtra)
library(ggplot2)
library(dplyr)
library(DT)
library(table1)
source(file="/projects/lvanderlinden@xsede.org/pvalForTable1_updatedForNA.R")

Read in Raw Data

This data was downloaded from: https://zenodo.org/records/14007626 and saved under the IBD_Thomas2024 folder.

data.loc <- "/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024"

features <- list.files(data.loc, recursive="FALSE")
IBD.filenames <- paste0(data.loc, "/", features)

paired_samples <- read.delim(file=paste0(data.loc, "/paired_sample_list.csv"))
dim(paired_samples)

##### Load in fibroblast/pericyte cells ##########
library(zellkonverter)

# read in the h5ad files (fibroblast/pericyte cells have the "fib" in name)
sce <- readH5AD(IBD.filenames[grep("fib", IBD.filenames)])
saveRDS(sce, file="/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/fibroblast.asSCEobj.rds")


meta <- colData(sce)
meta.uniqueSamples <- meta[!duplicated(meta$sample_id),]
nVisits <- as.data.frame(table(meta.uniqueSamples$Patient))
patientCD5 <- meta.uniqueSamples[which(meta.uniqueSamples$Patient=="CD5"),]

exp <- assay(sce, "X")
assayNames(sce)        
reducedDimNames(sce)
sapply(reducedDims(sce), dim)  # rows = cells, cols = components
head(reducedDim(sce, reducedDimNames(sce)[1]))

harmony_pcs <- reducedDims(sce)$X_harmony
umap <- reducedDims(sce)$X_umap
colnames(umap) <- c("UMAP1", "UMAP2")
save(umap, harmony_pcs, meta, file="/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/fibroblast_harmonyPCs_meta.Rdata")

What does the meta data look like? Let’s check out the first 6 rows.

datatable(head(as.data.frame(meta)))

Create an scLASER object

#make the disease/healthy variable as binary
meta$disease_healthy0_other1 <- ifelse(meta$Disease=="Healthy", 0, 1)

# from the data we read in, make an scLASER object with their metadata, umap and the harmony PCs they calculated.  
obj <- scLASER::scLASER(metadata=as.data.frame(meta), umap=as.data.frame(umap), harmony = as.matrix(harmony_pcs))

Create NAM PCs

For this report, make a neighborhood abundence matrix (NAM) and reduce down by PCA and take top 30 PCs moving forward.

# make NAM and get the top 30 NAM PCs
obj = scLASER::association_nam_LV(obj = obj,
                          test_var = "disease_healthy0_other1",
                          #test_var="Remission_status",
                          samplem_key = "sample_id",
                          graph_use = "RNA_nn",
                          batches = "Batch",
                          covs = c("Age", "Gender"),
                          nsteps = NULL,
                          verbose=TRUE,
                          assay=NULL,
                          key="NAMPC_",
                          maxnsteps=15L,
                          max_frac_pcs=0.15, # added option to pass number of PCs, for potential speedups
                          n_pcs = 30, 
                          ks=NULL,
                          Nnull=1000,
                          force_permute_all=FALSE,
                          local_test=TRUE,
                          seed=1234,
                          return_nam=TRUE
)

# This step takes some time depending on how large your dataset.  Would reccommend saving after this point.  
saveRDS(obj, file="/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/fibroblast.scLASERobj.rds")

Let’s look at what the NAM PC matrix looks like. These will be our outcomes in the subsequent linear mixed effect models.

datatable(as.data.frame(head(obj@nam_pcs)), rownames = FALSE)

Sample Distributions & Demographics

Before we perform the association testing, let’s look at some things like do all patients have samples from the same sites and so on.

Sample Level

Looking at each sample do we have missing information that we need? The main thing are our treatment (time status as pre/post) and the disease (remission vs non-remission)

# Check treatment (pre/post) information:
table(obj@metadata$Treatment, useNA = "ifany")
## 
##  Post   Pre  <NA> 
## 48178 48701  6174
# Check remission status:
table(obj@metadata$Remission_status, useNA = "ifany")
## 
## Non_Remission         None      Remission     Not_avail 
##         38422          6174         57391          1066
# Lets look at this together:
table(obj@metadata$Remission_status, obj@metadata$Treatment, useNA = "ifany")
##                
##                  Post   Pre  <NA>
##   Non_Remission 15388 23034     0
##   None              0     0  6174
##   Remission     32790 24601     0
##   Not_avail         0  1066     0
## remove the missing Treatment and Remission Status that is not available.  
keep <- !is.na(obj@metadata$Treatment) &
        obj@metadata$Remission_status %in% c("Remission", "Non_Remission")

obj2 <- obj
obj2@metadata     <- obj@metadata[keep, , drop = FALSE]
obj2@umap         <- obj@umap[keep, , drop = FALSE]
obj2@harmony  <- obj@harmony[keep, , drop = FALSE]
obj2@nam_pcs      <- obj@nam_pcs[keep, , drop = FALSE]

# check remission status
table(obj2@metadata$Remission_status, useNA = "ifany")
## 
## Non_Remission         None      Remission     Not_avail 
##         38422             0         57391             0
# check the treatment (i.e.timepoint)
table(obj2@metadata$Treatment, useNA = "ifany")
## 
##  Post   Pre 
## 48178 47635
#looks good.  Now let's get get a dataframe together to look at the table 1s:
obj2@metadata$Disease <- droplevels(obj2@metadata$Disease)
obj2@metadata$sample_id <- droplevels(obj2@metadata$sample_id)
obj2@metadata$Patient <- droplevels(obj2@metadata$Patient)

forAnalysis <- data.frame(obj2@metadata, obj2@umap, obj2@nam_pcs)
sampleLevel <- forAnalysis[!duplicated(forAnalysis$sample_id),]


table1(~ Disease + Site + Treatment  | Remission_status, data=sampleLevel, overall=FALSE, extra.col=list(`p-value` = pvalue))
Non_Remission
(N=99)
Remission
(N=101)
p-value
Disease
CD 28 (28.3%) 68 (67.3%) <0.001
UC 71 (71.7%) 33 (32.7%)
Site
Ascending_Colon 17 (17.2%) 18 (17.8%) 0.889
Descending_Colon 26 (26.3%) 29 (28.7%)
Rectum 27 (27.3%) 26 (25.7%)
Sigmoid 14 (14.1%) 10 (9.9%)
Terminal_Ileum 15 (15.2%) 18 (17.8%)
Treatment
Post 48 (48.5%) 52 (51.5%) 0.777
Pre 51 (51.5%) 49 (48.5%)

Looking at this table1, we will diffinately need to adjust for disease type as more CD go into remission than UC. Also, looking at the sites, they are not equal. We should adjust for this too.

Let’s take a look at the participants, and how many sites they have/time period.

nSites_perTime <- as.data.frame(table(sampleLevel$Patient, sampleLevel$Treatment))
colnames(nSites_perTime) <- c("Patient", "Treatment", "N Samples")

getSites <- function(patient, time){
  tmp <- sampleLevel[which(sampleLevel$Patient==patient & sampleLevel$Treatment==time),]
  if(nrow(tmp)==0){sites = "none"}else{sites = paste(tmp$Site, collapse=", ")}
  return(sites)
}

nSites_perTime$Sites <- apply(nSites_perTime, 1, function(a) getSites(a[1], a[2]))

nSites_perTime <- nSites_perTime[order(nSites_perTime$Patient),]
datatable(nSites_perTime, rownames=FALSE)

Looking at this table, some patients have matching sites and some don’t. This is something our approach can handle. But we just want to note this.

Patient Level

Now let’s look at demographics on the patient level.

patientLevel <- sampleLevel[!duplicated(sampleLevel$Patient),]

nSites_pre <- nSites_perTime[which(nSites_perTime$Treatment == "Pre"), c(1,3)]
colnames(nSites_pre)[2] <- "nSamples_Pre"
nSites_post <- nSites_perTime[which(nSites_perTime$Treatment == "Post"), c(1,3)]
colnames(nSites_post)[2] <- "nSamples_Post"

patientLevel <- merge(patientLevel, nSites_pre, by="Patient")
patientLevel <- merge(patientLevel, nSites_post, by="Patient")

table1(~ Disease + Age + Gender + Ethnicity + nSamples_Pre + nSamples_Post | Remission_status, data=patientLevel, overall=FALSE, extra.col=list(`p-value` = pvalue))
## Warning in chisq.test(table(y, g)): Chi-squared approximation may be incorrect
Non_Remission
(N=20)
Remission
(N=16)
p-value
Disease
CD 6 (30.0%) 10 (62.5%) 0.107
UC 14 (70.0%) 6 (37.5%)
Age
Mean (SD) 33.8 (10.1) 34.1 (11.0) 0.928
Median [Min, Max] 31.5 [17.0, 55.0] 32.5 [17.0, 61.0]
Gender
F 10 (50.0%) 6 (37.5%) 0.68
M 10 (50.0%) 10 (62.5%)
Ethnicity
Afro-Carribean 0 (0%) 1 (6.3%) 0.266
Caucasian 20 (100%) 14 (87.5%)
Middle_Eastern 0 (0%) 1 (6.3%)
nSamples_Pre
Mean (SD) 2.55 (0.887) 3.06 (0.929) 0.103
Median [Min, Max] 2.50 [1.00, 4.00] 3.00 [1.00, 4.00]
nSamples_Post
Mean (SD) 2.40 (1.23) 3.25 (1.13) 0.0381
Median [Min, Max] 3.00 [0, 4.00] 3.50 [1.00, 5.00]

Cell Level

Let’s check out a basic chi square of the cell types.

table1(~ final_analysis | Remission_status, data=forAnalysis, overall=FALSE, extra.col=list(`p-value` = pvalue))
Non_Remission
(N=38422)
Remission
(N=57391)
p-value
final_analysis
ABCA8pos WNT2Bpos FOShi fibroblast 3665 (9.5%) 7349 (12.8%) <0.001
ABCA8pos WNT2Bpos FOSlo fibroblast 8043 (20.9%) 15155 (26.4%)
C3hi CCL19pos fibroblast 668 (1.7%) 901 (1.6%)
C3hi RSPO3pos fibroblast 2847 (7.4%) 5550 (9.7%)
CD74hi HLADRB1hi arterial pericyte 5855 (15.2%) 6089 (10.6%)
CD74hi HLADRB1hi venous pericyte 318 (0.8%) 231 (0.4%)
Myofibroblast 1402 (3.6%) 2026 (3.5%)
NOTCH3hi MYH11pos pericyte 1885 (4.9%) 1974 (3.4%)
NOTCH3hi TNChi LOXL2pos pericyte 1232 (3.2%) 811 (1.4%)
NOTCH3hi TNCint CCL19pos pericyte 1029 (2.7%) 859 (1.5%)
NOTCH3hi TNClo pericyte 4325 (11.3%) 3625 (6.3%)
SOX6pos POSTNpos NRG1hi NPYpos fibroblast 1532 (4.0%) 2863 (5.0%)
SOX6pos POSTNpos fibroblast 4567 (11.9%) 9444 (16.5%)
THY1pos FAPpos PDPNpos fibroblast 1054 (2.7%) 514 (0.9%)

Association Analysis

Now let’s actually perform the association analysis to identify NAM scores (NAM PCs) associated with the time*remission.

# make the remission and time variables numeric (0/1)
obj2@metadata$remission0_nonRemission1 <- ifelse(obj2@metadata$Remission_status=="Remission", 0, 1)
obj2@metadata$time_pre0_post1 <- ifelse(obj2@metadata$Treatment=="Pre", 0, 1)
#also make sure the 
head(obj2@nam_pcs)
rownames(obj2@nam_pcs) <- obj2@metadata$sample_id


obj2 <- longAnalyses_plusModelSelection(obj2,
  disease_var = "remission0_nonRemission1",
  time_var    = "time_pre0_post1",
  covariates  = c("Age", "Gender", "Disease", "Site"),
  subject_var = "Patient",
  sample_var  = "sample_id"
)

saveRDS(obj2, file="/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/fibroblast.scLASERobj.wAnalysis.rds")

results <- obj2@pipeline_output
results.int <- results[grep(":", results$coefficient),]

NAM Score p-values and FDRs

Subset to only the interaction term for easy viewing and also can adjust for multiple testing as well.

results.int$FDR <- p.adjust(results.int$p.value, method="BH")
results.int$Bonferroni <- p.adjust(results.int$p.value, method = "bonferroni")

toPrint <- results.int[,c("PC", "p.value", "FDR")]
colnames(toPrint)[1:2] <- c("NAM Score", "p-value")

kable(toPrint, row.names=0, align='l')  %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
NAM Score p-value FDR
NAM_PC1 0.0002670 0.0080109
NAM_PC2 0.7267870 0.9151403
NAM_PC3 0.1653085 0.4749787
NAM_PC4 0.1918990 0.4797475
NAM_PC5 0.3219551 0.6899037
NAM_PC6 0.6048631 0.9151403
NAM_PC7 0.8864153 0.9281942
NAM_PC8 0.0955415 0.3774913
NAM_PC9 0.0261702 0.1962767
NAM_PC10 0.7286994 0.9151403
NAM_PC11 0.0934054 0.3774913
NAM_PC12 0.6862049 0.9151403
NAM_PC13 0.6570907 0.9151403
NAM_PC14 0.6743436 0.9151403
NAM_PC15 0.0145588 0.1455880
NAM_PC16 0.4812335 0.9023129
NAM_PC17 0.7211096 0.9151403
NAM_PC18 0.8377547 0.9281942
NAM_PC19 0.2082144 0.4804948
NAM_PC20 0.0774626 0.3774913
NAM_PC21 0.7398858 0.9151403
NAM_PC22 0.4437195 0.8874391
NAM_PC23 0.0104816 0.1455880
NAM_PC24 0.8972544 0.9281942
NAM_PC25 0.1072389 0.3774913
NAM_PC26 0.8679080 0.9281942
NAM_PC27 0.1741588 0.4749787
NAM_PC28 0.9436909 0.9436909
NAM_PC29 0.1132474 0.3774913
NAM_PC30 0.7626169 0.9151403

Association Score - NAM PC1

We have 1 resulting association score from the model: NAM PC1

Fixed Effects Table

pc1.res <- results[which(results$PC=="NAM_PC1"), ]
#round and format pretty to make viewable
pc1.res$p.value <- formatC(pc1.res$p.value, format="e", digits=2)
pc1.res$t.value <- round(pc1.res$t.value, digits=2)
pc1.res$Value <- formatC(pc1.res$Value, format="e", digits=2)
pc1.res$Std.Error <- formatC(pc1.res$Std.Error, format="e", digits=2)

datatable(pc1.res, options = list(
  pageLength = 11))

Boxplot Summarized By Sample

library(dplyr); library(tidyr); library(ggplot2)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## The following object is masked from 'package:magrittr':
## 
##     extract
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
forAnalysis <- data.frame(obj2@metadata, obj2@nam_pcs, obj2@umap)
colnames(forAnalysis)[grep("PC", colnames(forAnalysis))] <- paste0("NAM_", colnames(forAnalysis)[grep("PC", colnames(forAnalysis))])
forAnalysis$Treatment <- factor(forAnalysis$Treatment, levels = c("Pre","Post"))

paired_dat <- forAnalysis %>%
  dplyr::group_by(Patient, Treatment, Remission_status) %>%
  dplyr::summarise(NAM_PC1 = median(NAM_PC1, na.rm = TRUE), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = Treatment, values_from = NAM_PC1) %>%
  tidyr::drop_na(Pre, Post) %>%
  tidyr::pivot_longer(c(Pre, Post), names_to = "Treatment", values_to = "NAM_PC1") %>%
  mutate(Treatment = factor(Treatment, levels = c("Pre","Post")))

paired_dat$Treatment <- factor(paired_dat$Treatment,
                               levels = c("Pre","Post"),
                               labels = c("Pre-Tx","Post-Tx"))

paired_dat$Remission_status <- gsub("Non_Remission", "Non-remission", paired_dat$Remission_status)


p4 <- ggplot(paired_dat, aes(x = Treatment, y = NAM_PC1)) +
  # colored boxplots behind
  geom_boxplot(aes(group = Treatment, fill = Remission_status),
               width = 0.55, outlier.shape = NA,
               color = "grey50", alpha = 0.45) +
  # grey connecting lines
  geom_line(aes(group = Patient), color = "grey70", linewidth = 0.6, alpha = 0.7) +
  # black points
  geom_point(color = "black", size = 2.6) +
  facet_wrap(~ Remission_status, nrow = 1) +
  scale_fill_manual(values = c("Non-remission" = "#4DAF4A", "Remission" = "#984EA3")) +
  labs(x = "Time", y = "Association score") +
  theme_classic(base_size = 16) +
  theme(
    strip.text  = element_text(size = 16, face = "bold"),
    axis.title  = element_text(size = 18),
    axis.text   = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
p4

UMAP Colored By Association Score

#### UMAP with PC
fig_umap2 <- ggplot() +
 geom_point(
    data = forAnalysis[sample(nrow(forAnalysis)), ],
    mapping = aes(x = UMAP1, y = UMAP2, fill = NAM_PC1),
    size = 0.5, stroke = 0.01, shape = 21
  ) +
  scale_fill_gradient2(
    low = "navy",        # Low values (dark blue)
    mid = "white",       # Middle values (white)
    high = "darkred",    # High values (dark red)
    midpoint = 0         # Set the midpoint to 0
  ) +
  labs(
    x = "UMAP1",
    y = "UMAP2",
    #title = "IBD Fibroblasts: Association Score",
    #subtitle = "N cells = 100, N samples = 30"
  ) +
  theme_bw(base_size = 20) +
  theme(
    legend.position = "right",
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.title = element_text(color = "black", size = 20),
    plot.subtitle = element_text(color = "black", size = 10)
  )

fig_umap2

Cell Types

UMAP colored by cell type

The cell types are defined from the paper. What does this look like on the UMAP?

#### PLOTS #####
### UMAP by cell type from paper
library(dplyr)
cluster_center <- data.frame(forAnalysis) %>%
  group_by(final_analysis) %>%
  summarise(
    median_UMAP1 = median(UMAP1, na.rm = TRUE),
    median_UMAP2 = median(UMAP2, na.rm = TRUE)
  )
cluster_center <- as.data.frame(cluster_center)

library(RColorBrewer)

# your inputs
want <- c("NOTCH3hi TNClo pericyte",
          "NOTCH3hi TNCint CCL19pos pericyte",
          "NOTCH3hi TNChi LOXL2pos pericyte",
          "C3hi CCL19pos fibroblast")

library(stringr)
library(RColorBrewer)
cols_11 <- brewer.pal(n=11, "RdYlBu")
cols_extra3 <- brewer.pal(n=9, "RdGy")[c(7:9)]
cols_order_want <- c(cols_11[c(11, 10, 9, 8, 7)], cols_extra3, cols_11[c(6, 5, 4, 3, 2, 1)])


library(forcats)
forAnalysis <- forAnalysis %>%
  filter(!is.na(final_analysis), !is.na(NAM_PC1)) %>%
  mutate(cell_type = fct_reorder(final_analysis, NAM_PC1, .fun = median, na.rm = TRUE))

# ensure the factor levels in your plotting data match the palette order
#forAnalysis2$final_analysis <- factor(forAnalysis2$final_analysis, levels = names(cols_in_level_order))
library(stringr)
lab_wrap <- str_wrap(levels(forAnalysis$cell_type), width = 28)  # tweak width

cell_type <- levels(forAnalysis$cell_type)
g2 <- ggplot() +
  geom_point(
    data = forAnalysis[sample(nrow(forAnalysis)), ],
    aes(x = UMAP1, y = UMAP2, fill = cell_type),
    size = 0.4, stroke = 0.0001, shape = 21
  ) +
  scale_fill_manual(
    values = cols_order_want,
    breaks = cell_type,
    labels = lab_wrap,                    # wrapped labels
    drop = FALSE,
    name = "Cell Type"
  ) +
  guides(fill = guide_legend(
    override.aes = list(size = 3),
    keyheight = unit(2.5, "mm"),
    keywidth  = unit(2.5, "mm")
  )) +
  labs(x = "UMAP1", y = "UMAP2", title = "IBD Stroma Cells") +
  theme_bw(base_size = 9) +
  theme(
    legend.position = "bottom",
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.title = element_text(color = "black", size = 9),
    plot.subtitle = element_text(color = "black", size = 7),
    legend.box = "horizontal",
    legend.title = element_text(size = 4), 
    legend.box.spacing = unit(2, "pt"),   # space between plot and legend
    legend.margin = margin(0, 0, 0, 0),   # margin around the whole legend
    legend.key.height = unit(4, "pt"),    # key height (colorbar thickness area)
    legend.key.width  = unit(60, "pt"),   # key length
    plot.margin = margin(5, 5, 5, 5)      # trim outer plot margins if needed
  )
g2

Association score cell type boxplot

#### Boxplot with Cell Type.
library(forcats)
library(ggplot2)

# order levels of final_analysis from lowest → highest median NAM_PC1
forAnalysis <- forAnalysis |>
  mutate(final_analysis = fct_reorder(final_analysis, NAM_PC1,
                                      .fun = ~median(., na.rm = TRUE),
                                      .desc = FALSE))

#levels(forAnalysis2$final_analysis)
## get colors for cell types updated to RedBlue from RColorBrewer;
library(RColorBrewer)
cols_11 <- brewer.pal(n=11, "RdYlBu")
cols_extra3 <- brewer.pal(n=9, "RdGy")[c(7:9)]
cols_order_want <- c(cols_11[c(11, 10, 9, 8, 7)], cols_extra3, cols_11[c(6, 5, 4, 3, 2, 1)])

library(stringr)

p2 <- ggplot(forAnalysis,
             aes(x = final_analysis, y = NAM_PC1, fill = final_analysis)) +
  geom_boxplot(
    position = position_dodge(width = 0.8),
    outlier.alpha = 0.35,
    color = "black",      # keep black outlines/median
    fatten = 1
  ) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Cell Type", y = "Association score") +
  scale_fill_manual(values = cols_order_want, drop = FALSE) +
  guides(fill = "none") +                    # remove legend; rely on x-axis names
  scale_x_discrete(
    labels = function(x) str_wrap(x, width = 20)  # wrap long names
    # or: guide = guide_axis(angle = 55, n.dodge = 2)
  ) +
  theme_classic() +
  theme(
    axis.text    = element_text(size = 15),                        # tick labels (both axes)
    axis.text.x  = element_text(angle = 55, hjust = 1, vjust = 1, size = 12),
    axis.title.x = element_text(size = 15),                        # x-axis title
    axis.title.y = element_text(size = 15),                        # y-axis title
    axis.ticks.x = element_line()
  )
p2

Association score prediction of cell types

First let’s calculate the ROCs for each of the cell types.

#### Use Prediction for PCs #### 
library(Matrix)
library(lme4)
library(pROC)
library(ggplot2)
library(dplyr)


##### Function to Get ROC Info #####
getROCobj <- function(cellType_want, data, seed){
  #set up dataset
  tmp <- data
  tmp$cellType <- ifelse(tmp$cell_type==cellType_want, 1, 0)
  
  #perform model 
  mod <- glm(cellType ~ NAM_PC1, data=tmp, family=binomial)
  pred_probs <- stats::predict(mod, type="response")
  
  #get ROC
  roc_obj <- roc(tmp$cellType, pred_probs)
  auc_value <- pROC::auc(roc_obj)
  
  ci_auc <- pROC::ci.auc(roc_obj, conf.level = 0.95)  # DeLong by default
  set.seed(seed)
  ci_band <- pROC::ci.se(
    roc_obj,
    specificities = seq(0, 1, by = 0.01),
    boot.n = 100, 
    boot.stratified = TRUE,
    progress = "none"
  ) 
  
  # Save output we need
  want <- list(roc_obj, auc_value, ci_auc, ci_band)
  names(want) <- c("roc_obj", "auc", "ci.auc", "ci.se")
  return(want)
}


#### Get the ROC Info for Each Cell Type ####
forAnalysis$cell_type <- forAnalysis$final_analysis
cellTypes <- unique(forAnalysis$final_analysis)

ibd_roc_c3hi_ccl19plus <- getROCobj(cellType_want = "C3hi CCL19pos fibroblast", forAnalysis, 2020)
ibd_roc_thy1pos <- getROCobj(cellType_want = "THY1pos FAPpos PDPNpos fibroblast", forAnalysis, 2020)
ibd_roc_notch3hi_tnchi <- getROCobj(cellType_want = "NOTCH3hi TNChi LOXL2pos pericyte", forAnalysis, 2020)
ibd_roc_notch3hi_tnclo <- getROCobj(cellType_want = "NOTCH3hi TNClo pericyte", forAnalysis, 2020)
ibd_roc_CD74hi <- getROCobj(cellType_want = "CD74hi HLADRB1hi venous pericyte", forAnalysis, 2020)
ibd_roc_notch3other <- getROCobj(cellType_want = "NOTCH3hi TNCint CCL19pos pericyte", forAnalysis, 2020)

ibd_roc_ABCA8_FOShi <- getROCobj(cellType_want = "ABCA8pos WNT2Bpos FOShi fibroblast", forAnalysis, 2020)
ibd_roc_ABCA8_FOSlo <- getROCobj(cellType_want = "ABCA8pos WNT2Bpos FOSlo fibroblast", forAnalysis, 2020)
ibd_roc_c3hi_rspo3pos <- getROCobj(cellType_want = "C3hi RSPO3pos fibroblast", forAnalysis, 2020)
ibd_roc_CD74hi_arterial <- getROCobj(cellType_want = "CD74hi HLADRB1hi arterial pericyte", forAnalysis, 2020)
ibd_roc_Myofibroblast <- getROCobj(cellType_want = "Myofibroblast", forAnalysis, 2020)
ibd_roc_notch3hi_myh11 <- getROCobj(cellType_want = "NOTCH3hi MYH11pos pericyte", forAnalysis, 2020)
ibd_roc_sox6_npy <- getROCobj(cellType_want = "SOX6pos POSTNpos NRG1hi NPYpos fibroblast", forAnalysis, 2020)
ibd_roc_sox6_postn <- getROCobj(cellType_want = "SOX6pos POSTNpos fibroblast", forAnalysis, 2020)

#### Plot the ROCs Together###
library(RColorBrewer)
# Put your results into a named list (order as you like)
rocs <- list(
  "C3hi CCL19+ fibroblast" = ibd_roc_c3hi_ccl19plus,
  "Thy1+ FAP+ PDPN+ fibroblast" = ibd_roc_thy1pos,
  "NOTCH3hi TNChi LOXL2pos pericyte" = ibd_roc_notch3hi_tnchi,
  "NOTCH3hi TNClo pericyte" = ibd_roc_notch3hi_tnclo,
  "CD74hi HLADRB1hi venous pericyte" = ibd_roc_CD74hi,
  "NOTCH3hi TNCint CCL19pos pericyte" = ibd_roc_notch3other,
  
  "ABCA8pos WNT2Bpos FOShi fibroblast" = ibd_roc_ABCA8_FOShi,
  "ABCA8pos WNT2Bpos FOSlo fibroblast" = ibd_roc_ABCA8_FOSlo,
  "C3hi RSPO3pos fibroblast" = ibd_roc_c3hi_rspo3pos, 
  "CD74hi HLADRB1hi arterial pericyte" = ibd_roc_CD74hi_arterial, 
  "Myofibroblast" = ibd_roc_Myofibroblast,
  "NOTCH3hi MYH11pos pericyte" = ibd_roc_notch3hi_myh11, 
  "SOX6pos POSTNpos NRG1hi NPYpos fibroblast" = ibd_roc_sox6_npy,
  "SOX6pos POSTNpos fibroblast" = ibd_roc_sox6_postn
)

saveRDS(rocs, file="/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/longMod_v1/NAMPC1_cellType_ROCS.rds")

Plot just the to performing ones (NOTCH3) and one that is somewhat in the middle.

rocs_all <- readRDS(file="/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/longMod_v1/NAMPC1_cellType_ROCS.rds")
names(rocs_all) <- gsub("+", "pos", names(rocs_all), fixed=TRUE)
want <- c("NOTCH3hi TNClo pericyte",
          "NOTCH3hi TNCint CCL19pos pericyte",
          "NOTCH3hi TNChi LOXL2pos pericyte",
          "C3hi CCL19pos fibroblast")
rocs <- rocs_all[which(names(rocs_all) %in% want)]

# Colors (use the colors that you defined above). Use lighter alpha for CI bands.  
cols <- cols_order_want[c(8, 1, 3, 2)]
fills <- vapply(cols, \(z) adjustcolor(z, alpha.f = 0.25), character(1))

pdf(
  file = "/pl/active/fanzhanglab/lvanderlinden/publicDatasets/IBD_Thomas2024/data/longMod_v1/NAMPC1_ROC_fibroblasts_fig2_select4CT_newColors.pdf",
  width = 9, height = 7.6
)

## -------- Layout: plot (top) + custom legend (bottom) --------
layout(matrix(c(1, 2), nrow = 2, byrow = TRUE), heights = c(6.0, 1.6))
## -------- Top panel (ROC) --------
par(mar = c(5.2, 5.2, 3.8, 2.2), cex = 1.15, cex.axis = 1.1, cex.lab = 1.3)

# remove axis padding to kill the white space
op <- par(xaxs = "i", yaxs = "i"); on.exit(par(op), add = TRUE)

first <- rocs[[1]]
plot(
  first$roc_obj,
  main = "Association Score Predicting Cell Type",
  col = NA,
  legacy.axes = TRUE,      # keep classic "1 - Specificity" axis
  xlim = c(0.5, 0),          # <-- 0→1 left-to-right for legacy axes
  ylim = c(0, 1),          # clamp y as well
  lwd  = 5,                # thicker baseline
  cex.main = 2.55
)

# CI polygons (UNCHANGED from your working version)
i <- 0
for (nm in names(rocs)) {
  i <- i + 1
  ci_mat <- as.matrix(rocs[[nm]]$ci.se)
  sp      <- as.numeric(rownames(ci_mat))
  polygon(
    c(sp, rev(sp)),
    c(ci_mat[, 1], rev(ci_mat[, 3])),
    col = fills[i], border = NA
  )
}

# ROC lines (thicker)
i <- 0
for (nm in names(rocs)) {
  i <- i + 1
  plot(rocs[[nm]]$roc_obj, add = TRUE, lwd = 5, col = cols[i])
}

## -------- Bottom panel: custom 2-column legend --------
par(mar = c(0.2, 0.4, 0.2, 0.4))
plot.new()
par(xpd = NA); par(usr = c(0, 1, 0, 1))

make_entry <- function(nm, obj) {
  auc_val <- as.numeric(pROC::auc(obj$roc_obj))
  ci_val  <- try(pROC::ci.auc(obj$roc_obj), silent = TRUE)
  line1 <- sprintf("%s:", nm)
  line2 <- if (inherits(ci_val, "try-error")) {
    sprintf("AUC = %.3f", auc_val)
  } else {
    sprintf("AUC = %.3f (95%% CI %.3f, %.3f)", auc_val, ci_val[1], ci_val[3])
  }
  list(line1 = line1, line2 = line2)
}
entries <- lapply(names(rocs), function(nm) make_entry(nm, rocs[[nm]]))

n <- length(entries); half <- ceiling(n/2)
left_idx  <- seq_len(half)
right_idx <- if (n > half) (half + 1):n else integer(0)

cex_leg <- 1.05
lh    <- strheight("X", cex = cex_leg)
gap   <- 3.1 * lh
delta <- 1.15 * lh
y_top <- 0.93

# Column anchors
col1 <- list(x_seg0 = 0.03, x_seg1 = 0.11, x_text = 0.13)
col2 <- list(x_seg0 = 0.53, x_seg1 = 0.61, x_text = 0.63)

# --- Keep segments() inside this function so colspec/idx are in scope ---
draw_col <- function(idxs, colspec, cols_palette) {
  for (k in seq_along(idxs)) {
    idx <- idxs[k]; ent <- entries[[idx]]
    y <- y_top - (k - 1) * gap

    # thicker swatch to match ROC lines
    segments(
      colspec$x_seg0, y - 0.5 * lh,
      colspec$x_seg1, y - 0.5 * lh,
      lwd = 5,                          # thickness here
      col = cols_palette[idx]
    )

    text(colspec$x_text, y,         ent$line1, adj = c(0, 0.5), cex = cex_leg)
    text(colspec$x_text, y - delta, ent$line2, adj = c(0, 0.5), cex = cex_leg)
  }
}

draw_col(left_idx,  col1, cols)
if (length(right_idx)) draw_col(right_idx, col2, cols)

layout(1); dev.off()