1 Introduction

To investigate whether complement components are spatially organized within RA synovium, we developed a spatial neighborhood-based modeling framework for gene-level associations to define complement cellular niches for spatial transcriptomics.

This tutorial outlines key steps of applying the spatial neighborhood framework to measure spatial relationships between complement signatures and canonical synovial structural markers in RA synovial.

To run this framework, the following inputs are required:

A normalized gene expression matrix
A spatial coordinate matrix containing X and Y
Gene sets (e.g., complement signatures, canonical synovial lining markers, etc.)

We provided a set of custom R functions for this pipeline. Before running the analysis, please load the required functions from ./Spatial_modeling_R_functions.R.

source("../R/Spatial_modeling_R_functions.R")

2 Load RA synovial Xenium spatial transcriptomics datasets

library(qs)
library(Seurat)
library(Matrix)
library(ComplexHeatmap)
library(RANN)
library(dplyr)
library(ggplot2)
library(circlize)

# Load object that includes multiple RA samples
Xenium_samples <- qread("../../Data/xenium_merged.qs")

# In this tutorial, we take RA331 (RA1) as an example
RA1_obj <- subset(Xenium_samples, subset = sample == "RA331")

3 Spatial neighborhood modeling framework

3.1 Tissue segmentation for sample RA1

# RA1 meta data
meta_RA1 <- RA1_obj@meta.data
meta_RA1$cell <- rownames(meta_RA1)
coord_df_RA1 <- data.frame(cell = meta_RA1$cell, x = meta_RA1$x, y = meta_RA1$y,  check.names = FALSE)
coord_df_RA1[1:5,] %>% 
  DT::datatable(options = list(scrollx = T))

# Define the RA1 region of interest (ROI)
xlim <- c(4200, 8000)
ylim <- c(7.151, 6300)

# Select ROI cells
coord_df_RA1$selected <- with(coord_df_RA1, x >= xlim[1] & x <= xlim[2] & y >= ylim[1] & y <= ylim[2])
cells_seg <- coord_df_RA1$cell[coord_df_RA1$selected]  # subset of cells
cat("N selected = ", length(cells_seg), "\n")

## N selected =  153815

# Visualize ROI
p_rect <- ggplot(coord_df_RA1, aes(y, x)) +
  geom_point(aes(color = selected), size = 0.15, alpha = 0.8, stroke = 0) +
  annotate("rect", xmin = ylim[1], xmax = ylim[2],
           ymin = xlim[1], ymax = xlim[2],
           fill = NA, color = "black", linewidth = 0.4) +
  scale_color_manual(values = c("FALSE" = "grey80", "TRUE" = "darkslategray3")) +
  coord_flip() +
  coord_fixed() +
  labs(
    title = "RA1 ROI",
    x = "y", y = "x"
  ) +
  theme_bw(base_size = 11) +
  theme(legend.position = "none")
p_rect

RA1_seg <- subset(RA1_obj, cells = cells_seg)

# Subest 1/5 cells for a quick render of this tutorial 
set.seed(42)
n_cells <- round(ncol(RA1_seg) / 5)
sampled_cells <- sample(colnames(RA1_seg), size = n_cells)
RA1_seg_subset <- subset(RA1_seg, cells = sampled_cells)

# Meta data 
df_meta_seg_sub <- meta_RA1[
  rownames(meta_RA1) %in% colnames(RA1_seg_subset),
]
# Spatial coordinate x,y
coord_df_sub_RA1 <- data.frame(cell = rownames(df_meta_seg_sub), x = df_meta_seg_sub$x, y = df_meta_seg_sub$y, check.names = FALSE)

3.2 Construct normalized gene expression matrix on genes of interest

# Query genes: we use curated complement signatures, canonical synovial lining markers, canonical synovial sublining markers, and curated inflammatory and IL-6-associated signatures.
lining_genes    <- c("MMP3", "MMP1","HBEGF","CD68","SPP1","GPNMB")  
sublining_genes <- c("THY1", "CXCL12", "SFRP1","LYVE1","COL5A2")
complement_sublining <- c("C3", "COLEC12","CFB")
complement_lining <- c("C5AR1", "FCGR3A","FCGR1A")
inflammatory_signatures <- c("TNF", "CXCL2", "ATF3","CXCL3" ,"CLEC5A","LIF", "CXCL9","IL27","SOCS3")

gene_groups <- list(
  lining_genes = lining_genes,
  sublining_genes = sublining_genes,
  complement_lining = complement_lining,
  complement_sublining = complement_sublining,
  inflammatory_signatures = inflammatory_signatures
)

# Extract normalized expression matrix（log-normalized counts）
RA1_expr <- GetAssayData(RA1_seg_subset, layer = "data") 
RA1_seg_sub_expr <- t(as.matrix(RA1_expr)) # cell by gene
dim(RA1_seg_sub_expr)

## [1] 30763   480

RA1_seg_sub_expr[1:10,1:10] %>% 
  DT::datatable(options = list(scrollx = T))

# Construct expression matrix
query_genes <- preprocess_gene_expr(
  expr_mat = RA1_seg_sub_expr,
  gene_groups = gene_groups
)

# Intersect cells between expr and coords
cells_common <- intersect(rownames(RA1_seg_sub_expr), rownames(df_meta_seg_sub))

expr_mat_RA1 <- RA1_seg_sub_expr[cells_common, , drop = FALSE]
coords_RA1  <- df_meta_seg_sub[cells_common, c("x","y"), drop = FALSE]

# Subset expression to selected genes
X_all <- expr_mat_RA1[, query_genes, drop = FALSE]
X_all <- as.matrix(X_all)

dim(X_all) # cells x genes

## [1] 30763    25

3.3 Construct Gaussian kernel-weighted spatial neighborhoods

To capture distance-decayed local influence, we used Gaussian kernel to weight the k-nearest spatial neighboring cells as a function of spatial distance (e.g., k = 20 nearest neighbors). The resulting weight matrix was used to summarize local spatial neighborhood signals.

# Gaussian kernel-weighted object
weights_obj <- GaussianKernel_weights(
  coords_df = coords_RA1,
  k = 20
)

# Extract 1) cell density quantifying regional variation
cell_density <- weights_obj$cell_density_z
# Extract 2) spatial neighborhood weights 
W_mat <- weights_obj$W_sparse

# Define distance-weighted neighborhood exposure of cell 
E_mat <- distance_weighted_neighborhoods_exposure(
  X_all = X_all,
  W_sparse = W_mat
)

dim(E_mat) # cells x genes

## [1] 30763    25

E_mat[1:5,1:5] %>% 
  DT::datatable(options = list(scrollx = T))

3.4 Model spatial neighborhood gene assocations among curated genes

For each exposure gene, the direction-pair regression model tests whether its spatial neighborhood expression is associated with each outcome gene, adjusting for: 1) self expression of the exposure gene, 2) local cell density.

Sys.setenv(
  OMP_NUM_THREADS = "1",
  OPENBLAS_NUM_THREADS = "1",
  MKL_NUM_THREADS = "1",
  VECLIB_MAXIMUM_THREADS = "1",
  NUMEXPR_NUM_THREADS = "1"
)

set.seed(42)

spatial_res_RA1 <- spatial_neighborhood_model_all(
  X_matrix = X_all,
  E_weighted_matrix = E_mat,
  W_weights = W_mat,
  cell_density = cell_density,
  B = 1000, # generate empirical p-values
  n_cores = 4,
  seed = 123,
  verbose = TRUE
)

# To account for multiple testing across all directional gene pairs, empirical p-values were adjusted using the Benjamini-Hochberg procedure, producing a false discovery rate  (FDR) value for each gene pair. 
spatial_res_RA1 <- spatial_association_fdr(spatial_res_RA1)

4 Downstream results

4.1 Cluster complement components into spatially associated complement-lining and -sublining signatures

# Build gene-by-gene matrix: rows = outcome genes, cols = exposure genes
heat_mat_RA1 <- matrix(
  0,
  nrow = length(query_genes),
  ncol = length(query_genes),
  dimnames = list(query_genes, query_genes)
)

# Define signed colocalization significance score: C = sign(beta_hat)*[-log(FDR)]
for (i in seq_len(nrow(spatial_res_RA1))) {
  g <- spatial_res_RA1$outcome_gene[i]
  h <- spatial_res_RA1$exposure_gene[i]
  heat_mat_RA1[g, h] <- spatial_res_RA1$signed_log10_fdr[i]
}

# Gene annotations
gene_annot <- data.frame(
  gene = c(
    intersect(complement_lining, query_genes),
    intersect(lining_genes, query_genes),
    intersect(complement_sublining, query_genes),
    intersect(sublining_genes, query_genes),
    intersect(inflammatory_signatures, query_genes)
  ),
  group = c(
    rep("Complement-lining", length(intersect(complement_lining, query_genes))),
    rep("Lining", length(intersect(lining_genes, query_genes))),
    rep("Complement-sublining", length(intersect(complement_sublining, query_genes))),
    rep("Sublining", length(intersect(sublining_genes, query_genes))),
    rep("Inflammatory and IL-6-associated", length(intersect(inflammatory_signatures, query_genes)))
  ),
  stringsAsFactors = FALSE
)

gene_order <- gene_annot$gene
gene_group <- factor(
  gene_annot$group,
  levels = c(
    "Complement-lining",
    "Lining",
    "Complement-sublining",
    "Sublining",
    "Inflammatory and IL-6-associated"
  )
)

heat_mat_RA1 <- heat_mat_RA1[gene_order, gene_order, drop = FALSE]

# Heatmap of signed colocalization significance score
group_cols <- c(
  "Complement-lining" = "#f4a582", 
  "Lining" = "#d95f02", 
  "Complement-sublining" = "#7570b3", 
  "Sublining" = "#53308F",  
  "Inflammatory and IL-6-associated" = "#f2e7b5"
)

top_ha <- HeatmapAnnotation(
  Group = gene_group,
  col = list(Group = group_cols),
  show_annotation_name = FALSE,
  show_legend = FALSE
)

left_ha <- rowAnnotation(
  Group = gene_group,
  col = list(Group = group_cols),
  show_annotation_name = FALSE
)

lim <- max(abs(heat_mat_RA1), na.rm = TRUE)

col_fun <- colorRamp2(
  c(-lim, 0, lim),
  c("#92c5de", "#f5f5dc", "orange")
)

Heatmap(
  heat_mat_RA1,
  name = "signed\n-log10(FDR)",
  col = col_fun,
  
  cluster_rows = TRUE,
  row_split = 2,                 
  cluster_row_slices = TRUE,
  
  cluster_columns = TRUE,
  column_split = 2,
  cluster_column_slices = FALSE,
  
  top_annotation = top_ha,
  left_annotation = left_ha,
  row_names_side = "right",
  show_row_dend = FALSE,
  show_column_dend = FALSE,
  row_title = NULL,
  column_names_rot = -45,
  column_title = "RA1 spatial colocalization analysis",
  rect_gp = gpar(col = "lightgrey", lwd = 0.5),
  row_names_gp = gpar(fontsize = 14,fontface = "italic",fontfamily = "sans"),
  column_names_gp = gpar(fontsize = 14, fontfamily = "sans")

)

# Build discrete significance matrix
sig_mat_RA1 <- matrix(
  "NS",
  nrow = nrow(heat_mat_RA1),
  ncol = ncol(heat_mat_RA1),
  dimnames = dimnames(heat_mat_RA1)
)

sig_mat_RA1[heat_mat_RA1 > -log10(0.05)] <- "Pos"
sig_mat_RA1[heat_mat_RA1 < -log10(0.05)] <- "Neg"
sig_mat_RA1[heat_mat_RA1 == -log10(0.05) | is.na(heat_mat_RA1)] <- "NS"

diag(sig_mat_RA1) <- "diag"

sig_cols <- c(
  Neg  = "#2166AC",
  NS   = "grey",
  Pos  = "#B2182B",
  diag = "white"
)

mat_for_cluster <- heat_mat_RA1
mat_for_cluster[is.na(mat_for_cluster)] <- 0

row_hclust <- hclust(dist(mat_for_cluster))
col_hclust <- hclust(dist(t(mat_for_cluster)))

# cluster block
row_split <- factor(
  cutree(row_hclust, k = 2),
  levels = sort(unique(cutree(row_hclust, k = 2)))
)

col_split <- factor(
  cutree(col_hclust, k = 2),
  levels = sort(unique(cutree(col_hclust, k = 2)))
)

Heatmap(
  sig_mat_RA1,
  name = "sig",
  col = sig_cols,
  
  cluster_rows = as.dendrogram(row_hclust),
  row_split = 2,
  cluster_row_slices = FALSE,
  
  cluster_columns = as.dendrogram(col_hclust),
  column_split = 2,
  cluster_column_slices = FALSE,
  
  show_row_dend = FALSE,
  show_column_dend = FALSE,
  
  row_title = NULL,
  
  top_annotation = top_ha,
  left_annotation = left_ha,
  
  row_names_side = "right",
  column_names_rot = 45,
  
  rect_gp = gpar(col = "lightgrey", lwd = 0.5),
  
  row_names_gp = gpar(
    fontsize = 12,
    fontface = "italic",
    fontfamily = "sans"
  ),
  column_title = "RA1 spatial colocalization analysis (significance)",
  column_names_gp = gpar(
    fontsize = 12,
    fontfamily = "sans"
  )
)

4.2 Spatial regression-weighted niche scores

To define compartment-specific complement-associated niches at the cell level, we constructed regression-weighted niche scores directly from the direction-pair regression results. We identify two sharply segregated complement niches quantified the degree to which cells were embedded within a local complement-lining or -sublining neighborhood.

# Specifically, we focused on significantly positively colocalized gene pairs: fdr < 0.05 and beta > 0
cl_pairs <- subset(
  spatial_res_RA1,
  outcome_gene %in% lining_genes &
    exposure_gene %in% complement_lining &
    beta_exposure > 0 &
    fdr < 0.05
)

cs_pairs <- subset(
  spatial_res_RA1,
  outcome_gene %in% sublining_genes &
    exposure_gene %in% complement_sublining &
    beta_exposure > 0 &
    fdr < 0.05
)

plot_df <- data.frame(
  cell = rownames(coords_RA1),
  x = coords_RA1[, "x"],
  y = coords_RA1[, "y"],
  stringsAsFactors = FALSE
)

for (g in colnames(E_mat)) {
  plot_df[[paste0("exp_", g)]] <- E_mat[, g]
}
plot_df$y_plot <- -plot_df$y

# Compute weighted module score from significant positive pairs
# score_i = sum_{pairs} beta_exposure * E_i,h
plot_df$CL_module_weighted <- compute_weighted_module_score(cl_pairs, E_mat)
plot_df$CS_module_weighted <- compute_weighted_module_score(cs_pairs, E_mat)
plot_df$CL_minus_CS_weighted <- plot_df$CL_module_weighted - plot_df$CS_module_weighted

p_cl_weighted <- plot_spatial_feature(
  plot_df, "CL_module_weighted", "RA1 - Complement-lining niche score"
)
p_cs_weighted <- plot_spatial_feature(
  plot_df, "CS_module_weighted", "RA1 - Complement-sublining niche score"
)

p_cl_weighted + p_cs_weighted

## Concordance with unbiased spatial niche clusters adapted from Mantel et al.

cell_match <- match(plot_df$cell, meta_RA1$cell)
meta_RA1 <- meta_RA1[cell_match[!is.na(cell_match)], ]

df_unbiased_label <- meta_RA1 %>% select("cell","x","y","Niche")

# remove duplicated cells
sum(duplicated(df_unbiased_label$cell))

## [1] 0

sum(duplicated(plot_df$cell))

## [1] 0

df_concord <- df_unbiased_label %>%
  inner_join(plot_df, by = "cell")

# Focus on the unbiased synovial Lining (Niche3) and Sublining (Niche4) niche
df_concord_sub <- df_concord[df_concord$Niche %in% c("Niche3", "Niche4"), ]
df_concord_sub <- droplevels(
  df_concord[df_concord$Niche %in% c("Niche3", "Niche4"), ]
)

# Statistical test and compute the correlation between our regression-weighted niche scores and unbiased niche labels
wilcox.test(CL_module_weighted ~ Niche, data = df_concord_sub)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  CL_module_weighted by Niche
## W = 103763995, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

df_concord_sub$Niche_binary <- ifelse(df_concord_sub$Niche == "Niche3", 1, 0)

res <- cor.test(
  df_concord_sub$Niche_binary,
  df_concord_sub$CL_module_weighted,
  method = "pearson"
)

5 Session info

sessionInfo()

## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.3.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] circlize_0.4.17       ggplot2_4.0.2         dplyr_1.2.0          
##  [4] RANN_2.6.2            ComplexHeatmap_2.20.0 Matrix_1.7-5         
##  [7] Seurat_5.4.0          SeuratObject_5.3.0    sp_2.2-1             
## [10] qs_0.27.3             BiocStyle_2.32.1     
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3     shape_1.4.6.1          rstudioapi_0.18.0     
##   [4] jsonlite_2.0.0         magrittr_2.0.4         magick_2.9.1          
##   [7] spatstat.utils_3.2-2   farver_2.1.2           rmarkdown_2.30        
##  [10] GlobalOptions_0.1.3    vctrs_0.7.2            ROCR_1.0-12           
##  [13] Cairo_1.7-0            spatstat.explore_3.8-0 tinytex_0.58          
##  [16] htmltools_0.5.9        sass_0.4.10            sctransform_0.4.3     
##  [19] parallelly_1.46.1      KernSmooth_2.23-26     bslib_0.10.0          
##  [22] htmlwidgets_1.6.4      ica_1.0-3              plyr_1.8.9            
##  [25] plotly_4.12.0          zoo_1.8-15             cachem_1.1.0          
##  [28] igraph_2.2.2           mime_0.13              lifecycle_1.0.5       
##  [31] iterators_1.0.14       pkgconfig_2.0.3        R6_2.6.1              
##  [34] fastmap_1.2.0          clue_0.3-67            fitdistrplus_1.2-6    
##  [37] future_1.70.0          shiny_1.13.0           digest_0.6.39         
##  [40] colorspace_2.1-2       S4Vectors_0.42.1       patchwork_1.3.2       
##  [43] tensor_1.5.1           RSpectra_0.16-2        irlba_2.3.7           
##  [46] crosstalk_1.2.2        labeling_0.4.3         progressr_0.18.0      
##  [49] spatstat.sparse_3.1-0  httr_1.4.8             polyclip_1.10-7       
##  [52] abind_1.4-8            compiler_4.4.1         withr_3.0.2           
##  [55] doParallel_1.0.17      S7_0.2.1               fastDummies_1.7.5     
##  [58] MASS_7.3-65            rjson_0.2.23           tools_4.4.1           
##  [61] lmtest_0.9-40          otel_0.2.0             httpuv_1.6.17         
##  [64] future.apply_1.20.2    goftest_1.2-3          glue_1.8.0            
##  [67] nlme_3.1-168           promises_1.5.0         Rtsne_0.17            
##  [70] cluster_2.1.8.2        reshape2_1.4.5         generics_0.1.4        
##  [73] gtable_0.3.6           spatstat.data_3.1-9    tidyr_1.3.2           
##  [76] data.table_1.18.2.1    RApiSerialize_0.1.4    stringfish_0.18.0     
##  [79] BiocGenerics_0.50.0    spatstat.geom_3.7-2    RcppAnnoy_0.0.23      
##  [82] ggrepel_0.9.8          foreach_1.5.2          pillar_1.11.1         
##  [85] stringr_1.6.0          spam_2.11-3            RcppHNSW_0.6.0        
##  [88] later_1.4.8            splines_4.4.1          lattice_0.22-9        
##  [91] survival_3.8-6         deldir_2.0-4           tidyselect_1.2.1      
##  [94] miniUI_0.1.2           pbapply_1.7-4          knitr_1.51            
##  [97] gridExtra_2.3          bookdown_0.46          IRanges_2.38.1        
## [100] scattermore_1.2        stats4_4.4.1           xfun_0.57             
## [103] matrixStats_1.5.0      DT_0.34.0              stringi_1.8.7         
## [106] lazyeval_0.2.2         yaml_2.3.12            evaluate_1.0.5        
## [109] codetools_0.2-20       tibble_3.3.1           BiocManager_1.30.27   
## [112] cli_3.6.5              uwot_0.2.4             RcppParallel_5.1.11-2 
## [115] xtable_1.8-8           reticulate_1.45.0      jquerylib_0.1.4       
## [118] Rcpp_1.1.1             globals_0.19.1         spatstat.random_3.4-5 
## [121] png_0.1-9              spatstat.univar_3.1-7  parallel_4.4.1        
## [124] dotCall64_1.2          listenv_0.10.1         viridisLite_0.4.3     
## [127] scales_1.4.0           ggridges_0.5.7         purrr_1.2.1           
## [130] crayon_1.5.3           GetoptLong_1.1.0       rlang_1.1.7           
## [133] cowplot_1.2.0

Tutorial of spatial neighborhood-based modeling for gene-level associations to construct cellular niches for spatial transcriptomics

14 June, 2026