Differential gene expression analysis for bulk RNA-seq to identify complement treatment effect

library(tidyverse)
library(edgeR)
library(vsn)
library(matrixStats)
library(viridisLite)

gene.exp.count = readRDS('/Volumes/T7/genomics/complement_amp/count_bulk_rna_4-23-2024_v2.rds')
meta_all = readRDS("/Volumes/T7/genomics/complement_amp/meta_bulk_rna_4-23-2024_v2.rds")

1 Quality Control

# normalize to log(TPM+1)
gene.exp.norm.tpm <- t(t(gene.exp.count) / colSums(gene.exp.count) * 1e6)
colSums(gene.exp.norm.tpm)

##        100nM_C3a_1_S69_L007        100nM_C3a_1_S69_L008 
##                       1e+06                       1e+06 
##        100nM_C3a_2_S70_L007        100nM_C3a_2_S70_L008 
##                       1e+06                       1e+06 
##        100nM_C3a_3_S71_L007        100nM_C3a_3_S71_L008 
##                       1e+06                       1e+06 
##        100nM_C3a_4_S72_L007        100nM_C3a_4_S72_L008 
##                       1e+06                       1e+06 
##        100nM_C5a_1_S73_L007        100nM_C5a_1_S73_L008 
##                       1e+06                       1e+06 
##        100nM_C5a_2_S74_L007        100nM_C5a_2_S74_L008 
##                       1e+06                       1e+06 
##        100nM_C5a_3_S75_L007        100nM_C5a_3_S75_L008 
##                       1e+06                       1e+06 
##        100nM_C5a_4_S76_L007        100nM_C5a_4_S76_L008 
##                       1e+06                       1e+06 
##            No_tx_1_S81_L007            No_tx_1_S81_L008 
##                       1e+06                       1e+06 
##            No_tx_2_S82_L007            No_tx_2_S82_L008 
##                       1e+06                       1e+06 
##            No_tx_3_S83_L007            No_tx_3_S83_L008 
##                       1e+06                       1e+06 
##            No_tx_4_S84_L007            No_tx_4_S84_L008 
##                       1e+06                       1e+06 
##               SF_1_S49_L007               SF_1_S49_L008 
##                       1e+06                       1e+06 
##               SF_2_S50_L007               SF_2_S50_L008 
##                       1e+06                       1e+06 
##               SF_3_S51_L007               SF_3_S51_L008 
##                       1e+06                       1e+06 
##               SF_4_S52_L007               SF_4_S52_L008 
##                       1e+06                       1e+06 
## SF_T___100nM_C3a_1_S61_L007 SF_T___100nM_C3a_1_S61_L008 
##                       1e+06                       1e+06 
## SF_T___100nM_C3a_2_S62_L007 SF_T___100nM_C3a_2_S62_L008 
##                       1e+06                       1e+06 
## SF_T___100nM_C3a_3_S63_L007 SF_T___100nM_C3a_3_S63_L008 
##                       1e+06                       1e+06 
## SF_T___100nM_C3a_4_S64_L007 SF_T___100nM_C3a_4_S64_L008 
##                       1e+06                       1e+06 
## SF_T___100nM_C5a_1_S65_L007 SF_T___100nM_C5a_1_S65_L008 
##                       1e+06                       1e+06 
## SF_T___100nM_C5a_2_S66_L007 SF_T___100nM_C5a_2_S66_L008 
##                       1e+06                       1e+06 
## SF_T___100nM_C5a_3_S67_L007 SF_T___100nM_C5a_3_S67_L008 
##                       1e+06                       1e+06 
## SF_T___100nM_C5a_4_S68_L007 SF_T___100nM_C5a_4_S68_L008 
##                       1e+06                       1e+06 
##             SF_T_1_S53_L007             SF_T_1_S53_L008 
##                       1e+06                       1e+06 
##             SF_T_2_S54_L007             SF_T_2_S54_L008 
##                       1e+06                       1e+06 
##             SF_T_3_S55_L007             SF_T_3_S55_L008 
##                       1e+06                       1e+06 
##             SF_T_4_S56_L007             SF_T_4_S56_L008 
##                       1e+06                       1e+06

tpm_to_log2tpm <- function(x) log(x + 1, base = 2)
log2tpm <- apply(as.data.frame(gene.exp.norm.tpm), 2, tpm_to_log2tpm)
log2tpm[1:4,1:4]

##         100nM_C3a_1_S69_L007 100nM_C3a_1_S69_L008 100nM_C3a_2_S70_L007
## A2MP1                      0            0.0000000             0.000000
## A3GALT2                    0            0.0000000             0.000000
## A4GNT                      0            0.0000000             1.395835
## AACSP1                     0            0.1768183             0.000000
##         100nM_C3a_2_S70_L008
## A2MP1              0.9861742
## A3GALT2            0.0000000
## A4GNT              1.1770817
## AACSP1             0.1892010

# Which genes are detected in 95% of samples? (# of non-zeros in log2tpm/ total # of samples)
nonzero_markers <- apply(log2tpm, 1, function(x) length(x[x>0])/length(x) > 0.95)
nonzero_genes <- log2tpm[nonzero_markers,]
common_genes <- rownames(nonzero_genes)
length(common_genes)

## [1] 2664

# For a given sample, how many common genes are detected? (# of non-zeros in the common genes per sample/ total # of genes)
# If < 95%, that would be a BAD sample.
temp<-which(log2tpm[,1]>0)
common_index<-which(nonzero_markers, arr.ind=TRUE)

percentage <- array(0,dim=c(ncol(log2tpm)))
num_genes_det <- array(0,dim=c(ncol(log2tpm)))

for (i in 1:ncol(log2tpm)){
  temp<-which(log2tpm[,i]>0)
  num_genes_det[i] <- length(temp)
  inter<- intersect(common_index, temp)
  per<-length(inter)/length(common_genes)
  percentage[i] <- per
}

meta <- data.frame(
  
  sample = colnames(log2tpm)
  
)

meta$percentage <- as.numeric(percentage)
meta[1:4,]

##                 sample percentage
## 1 100nM_C3a_1_S69_L007  0.9924925
## 2 100nM_C3a_1_S69_L008  1.0000000
## 3 100nM_C3a_2_S70_L007  0.9962462
## 4 100nM_C3a_2_S70_L008  1.0000000

bad_index <- meta[which(meta$percentage < 0.95), ]$sample
bad_index

## character(0)

options(repr.plot.width = 8, repr.plot.height = 4)                    
ggplot() +
  geom_point(
    data = meta[order(meta$percentage, decreasing = T),],
    mapping = aes(sample, percentage),
    shape = 21, size = 3, stroke = 0.1
  ) +
  geom_hline(yintercept = 0.95, color = "blue", linetype = "dashed") +
  #     scale_fill_manual(values = c("#8DD3C7", "#FFFFB3", "#BEBADA", "#FB8072"), name = "") +
  labs(
    x = "samples", y = "Percent of common gene detected"
  ) +
  scale_y_continuous(labels=scales::percent) +
  theme_classic() +
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))

ggplot(meta, aes(x = 100 * percentage)) +
  geom_histogram(bins = 100) +
  geom_vline(xintercept = 95, color = "blue", linetype = "dashed") +
  scale_y_continuous(breaks = scales::pretty_breaks(4)) +
  # coord_cartesian(xlim=c(80,100)) +
  labs(
    x = "Percent", y = "# Samples", # title = "Percent of Common Genes",
    subtitle = sprintf(
      "%s common genes detected in\n %s (95%%) of samples",
      formattable::comma(length(common_genes)), ceiling(nrow(meta) * 0.95), nrow(meta)
    )
  ) +
  theme_classic() +
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))

# ggsave("qc2.png", width = 5, height = 3, dpi = 300)

Get location and gc content for the genes

library(BSgenome.Hsapiens.UCSC.hg38)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)

txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene


gene_coords <- AnnotationDbi::select(txdb, 
                                     keys = keys(txdb, keytype = "TXNAME"), 
                                     columns = c("TXSTART", "TXEND", "GENEID", "TXCHROM"),
                                     keytype = "TXNAME")

gene_coords$TXNAME_no_version <- sub("\\..*", "", gene_coords$TXNAME)

mapping <- AnnotationDbi::select(org.Hs.eg.db, 
                                 keys = keys(org.Hs.eg.db, keytype = "ENSEMBL"), 
                                 columns = c("SYMBOL", "ENSEMBLTRANS","ENTREZID"), 
                                 keytype = "ENSEMBL")
mapping$ENSEMBLTRANS_no_version <- sub("\\..*", "", mapping$ENSEMBLTRANS)


merged_data <- merge(gene_coords, mapping, by.x = "TXNAME_no_version", by.y = "ENSEMBLTRANS_no_version", all.x = TRUE)


merged_data = merged_data %>%
  dplyr::select(-c(ENSEMBL,ENSEMBLTRANS))


filtered_data <- merged_data %>%
  filter(!is.na(SYMBOL) & SYMBOL != "") %>%
  distinct(SYMBOL, .keep_all = TRUE)



calcCGContent <- function(sequence) {
  cg_count <- sum(toupper(charToRaw(sequence)) %in% c(charToRaw('C'), charToRaw('G')))
  total_count <- nchar(sequence)
  return(cg_count / total_count)
}



library(Rcpp)

cppFunction("
  double calcCGContent(std::string sequence) {
    int cg_count = 0;
    int total_count = sequence.size(); // Total number of characters in the sequence
    
    for (int i = 0; i < total_count; ++i) {
      char nucleotide = toupper(sequence[i]);
      if (nucleotide == 'C' || nucleotide == 'G') {
        cg_count++;
      }
    }
    
    return total_count > 0 ? (double)cg_count / total_count : 0;
  }
")

calcCGContent <- function(sequence) {
  cg_count <- sum(toupper(charToRaw(sequence)) %in% c(charToRaw('C'), charToRaw('G')))
  total_count <- nchar(sequence)
  return(cg_count / total_count)
}



plan(multisession, workers = 4)


results <- future_sapply(seq_along(filtered_data$TXSTART), function(i) {
    chrom <- filtered_data$TXCHROM[i]
    start <- filtered_data$TXSTART[i]
    end <- filtered_data$TXEND[i]
    
    if (!is.na(chrom) && !is.na(start) && !is.na(end)) {
        seq <- as.character(getSeq(BSgenome.Hsapiens.UCSC.hg38, names = chrom, start = start, end = end))
        return(calcCGContent(seq))
    } else {
        return(NA)  
    }
}, simplify = "vector")


filtered_data$CGContent <- results

#saveRDS(filtered_data, "gene_metadata_location.rds")

# contains information about location TXCHROM , TXSTART , TXEND and CG content of each gene
filtered_data = readRDS("/Volumes/T7/genomics/complement_amp/gene_metadata_location.rds")

dim(filtered_data)

## [1] 13005     9

dim(gene.exp.count)

## [1] 12861    56

filtered_data$length = filtered_data$TXEND - filtered_data$TXSTART
head(filtered_data)

##   TXNAME_no_version            TXNAME GENEID TXCHROM   TXSTART     TXEND
## 1   ENST00000000412 ENST00000000412.8   4074   chr12   8940361   8949645
## 2   ENST00000001008 ENST00000001008.6   2288   chr12   2794970   2805423
## 3   ENST00000001146 ENST00000001146.7  56603    chr2  72129238  72147862
## 4   ENST00000004103 ENST00000004103.8  55365    chr7 150800769 150805118
## 5   ENST00000005178 ENST00000005178.6   5166    chr7  95583499  95596516
## 6   ENST00000005180 ENST00000005180.9  10344    chr7  75769538  75772211
##     SYMBOL ENTREZID CGContent length
## 1     M6PR     4074 0.4208939   9284
## 2    FKBP4     2288 0.5153051  10453
## 3  CYP26B1    56603 0.5984966  18624
## 4 TMEM176A    55365 0.5604598   4349
## 5     PDK4     5166 0.3608081  13017
## 6    CCL26    10344 0.4925206   2673

2 Check bias of gc content and length biases

#Gene filtering 

gene.exp.count_gc = gene.exp.count

gene.exp.count_gc = gene.exp.count_gc[rowSums(gene.exp.count_gc) > 10,]


#gene.exp.count_gc= gene.exp.count[rownames(gene.exp.count) %in% filtered_data$SYMBOL,]
filtered_data_lastver = filtered_data[filtered_data$SYMBOL %in% rownames(gene.exp.count_gc),]

dim(filtered_data_lastver)

## [1] 5838   10

dim(gene.exp.count_gc)

## [1] 5863   56

gene.exp.count_gc = gene.exp.count_gc[filtered_data_lastver$SYMBOL,] # Now they are in the same order

library(edgeR)
library(EDASeq)

rownames(filtered_data_lastver) = filtered_data_lastver$SYMBOL

filtered_data_lastver <- filtered_data_lastver[rownames(gene.exp.count_gc), ]
pheno_data <- data.frame(
    conditions = factor(meta_all$group),
    lane = meta_all$lane,
    replicate = meta_all$replicate,
    row.names = colnames(gene.exp.count_gc)
)

pheno_annotated <- AnnotatedDataFrame(data = pheno_data)

# Setup feature data, ensuring alignment with the gene counts
feature_data <- data.frame(
    gc = filtered_data_lastver$CGContent, 
    length = filtered_data_lastver$length,
    row.names = rownames(gene.exp.count_gc)
)

counts <- newSeqExpressionSet(counts = gene.exp.count_gc,
                              phenoData = pheno_annotated,
                              featureData = AnnotatedDataFrame(data = feature_data))



meanVarPlot(counts, log=TRUE)

meanVarPlot(counts)

upperdataWithin <- withinLaneNormalization(counts,"gc", which="upper")

3 DE Analysis after preprocessing

norms = betweenLaneNormalization(upperdataWithin, which="median")
genes_exclude <- grep("^(MT|RPL|RPS|MALAT1|MIR-)", row.names(norms@assayData$normalizedCounts), value = TRUE)






meta_all$group <- ifelse(meta_all$group %in% c("100nM_C3a", "100nM_C5a"), 
                         paste0("a",meta_all$group ), 
                         meta_all$group)

cond <- factor(meta_all$group)
lane = meta_all$lane
replicate = meta_all$replicate
# remove batch effect

design <- model.matrix(~ 0 + cond  + lane + replicate )  # Appropriate way to adjust for batch (4.5.7 edgeR manual)
colnames(design) = gsub("cond","",colnames(design))

contrast <- makeContrasts(
  

    a100nM_C5a_vs_No_tx = a100nM_C5a - No_tx,
    SF_T___100nM_C5a_vs_No_tx = SF_T___100nM_C5a - No_tx,

    levels = design
)




temp <- norms@assayData$normalizedCounts[!rownames(norms@assayData$normalizedCounts) %in% genes_exclude,]


replicate = meta_all$replicate
# group <- factor(meta_all$group)
lanes =  meta_all$lane



y <- DGEList(counts=temp, group=cond)
#y <- DGEList(counts=gene.exp.count_gc, group=meta_all$group)


 y <- calcNormFactors(y, method = "TMM")
 y <- estimateDisp(y, design)
# fit <- glmQLFit(y, design) #Quassi
 
# plotBCV(y)
 
 
fit <- glmFit(y, design)

colnames(design)

##  [1] "a100nM_C3a"       "a100nM_C5a"       "No_tx"            "SF"              
##  [5] "SF_T"             "SF_T___100nM_C3a" "SF_T___100nM_C5a" "laneL008"        
##  [9] "replicate2"       "replicate3"       "replicate4"

contrast_list = list()

for (i in colnames(contrast)){
  contrast_list[[i]] =  glmLRT(fit, contrast=contrast[, i])
}

#contrast_list = readRDS("./contrast_list.rds")

length(contrast_list)

## [1] 2

juan = contrast_list

all_output_juan= data.frame(logfc = numeric(), pval = numeric(), pval_adj = numeric())


for (i in names(juan)){
  
out = data.frame(
    Gene = rownames(juan[[i]]$table),
    condition = i,
    logfc = juan[[i]]$table$logFC, 
    pval =juan[[i]]$table$PValue, 
    pval_adj = p.adjust(juan[[i]]$table$PValue, method = "BH"))

all_output_juan = rbind(all_output_juan,out)
  
}

saveRDS(all_output_juan, "DEG_res.rds")

library(stringr)
library(tidyverse)
FC <- 2 # Fold change threshold
anno_degs_num <- 10 # Number of genes to annotate
tmp <- all_output_juan

conditions_to_print <- c(
  "a100nM_C5a_vs_No_tx",
  "SF_T___100nM_C5a_vs_No_tx"
)

unique_conditions <- unique(tmp$condition)
unique_conditions <- unique_conditions[unique_conditions %in% conditions_to_print]

for (conditions in unique_conditions) {
  
  condition1 <- str_split(conditions, pattern = "_vs_", simplify = TRUE)[, 1]
  condition2 <- str_split(conditions, pattern = "_vs_", simplify = TRUE)[, 2]
  
  current_data <- tmp[tmp$condition == conditions, ]
  
  vol <- ggplot(current_data, aes(x = logfc, y = -log10(pval_adj)))
  
  sig_up <- current_data[current_data$logfc > log2(FC) & current_data$pval_adj < 0.05, ]
  sig_down <- current_data[current_data$logfc < -log2(FC) & current_data$pval_adj < 0.05, ]
  
  sig_up2 <- current_data[current_data$logfc > log2(FC) & current_data$pval_adj > 0.05, ]
  sig_down2 <- current_data[current_data$logfc < -log2(FC) & current_data$pval_adj > 0.05, ]
  
  sig_up3 <- current_data[current_data$logfc > 0 & current_data$pval_adj < 0.05, ]
  sig_down3 <- current_data[current_data$logfc < 0 & current_data$pval_adj < 0.05, ]
  
  insig <- current_data[
    current_data$logfc < log2(FC) &
      current_data$logfc > -log2(FC) &
      current_data$pval_adj > 0.05,
  ]
  
  sig_up <- sig_up[order(sig_up$pval_adj), ]
  sig_down <- sig_down[order(sig_down$pval_adj), ]
  
  size <- 2
  
  g <- vol +   
    ggtitle(label = conditions) +
    geom_point(data = sig_up, col = "red", alpha = 1, size = 1) +
    geom_point(data = sig_down, col = "blue", alpha = 1, size = 1) +
    geom_point(data = sig_up2, col = "pink", alpha = 0.5, size = 1) +
    geom_point(data = sig_down2, col = "skyblue", alpha = 0.5, size = 1) +
    geom_point(data = sig_up3, col = "pink", alpha = 0.5, size = 1) +
    geom_point(data = sig_down3, col = "skyblue", alpha = 0.5, size = 1) +
    geom_point(data = insig, col = "grey", alpha = 0.5, size = 1) +
    ggrepel::geom_text_repel(
      data = sig_up,
      aes(label = Gene),
      max.overlaps = 25,
      fontface = "italic",
      size = size,
      segment.color = "grey"
    ) +
    ggrepel::geom_text_repel(
      data = sig_down,
      aes(label = Gene),
      max.overlaps = 25,
      fontface = "italic",
      size = size,
      segment.color = "grey"
    ) +
    theme_bw(base_size = 14) + 
    theme(
      legend.position = "right",
      axis.text.x = element_text(size = 15),
      axis.text.y = element_text(size = 15)
    ) + 
    xlab(bquote(log[2](.(condition1) / .(condition2)))) + 
    ylab(expression(-log[10]("Adjusted P-value"))) +
    geom_hline(yintercept = -log10(0.05), colour = "#990000", linetype = "dashed") + 
    geom_vline(xintercept = log2(FC), colour = "#990000", linetype = "dashed") + 
    geom_vline(xintercept = -log2(FC), colour = "#990000", linetype = "dashed")
  
  file_name <- paste0(gsub("[[:punct:]]", "_", conditions), "_deg_volcano.pdf")
  
  # ggsave(filename = file_name, plot = g, width = 10, height = 8, dpi = 300)
  print(g)
}

getwd()

## [1] "/Volumes/T7/genomics"

library(stringr)
library(tidyverse)
library(ggrepel)

FC <- 2
tmp <- all_output_juan

# Only plot this one condition
condition_use <- "SF_T___100nM_C5a_vs_No_tx"

current_data <- tmp[tmp$condition == condition_use, ]

# Optional check
dim(current_data)

## [1] 5341    5

unique(current_data$condition)

## [1] "SF_T___100nM_C5a_vs_No_tx"

sig_up <- current_data[current_data$logfc > log2(FC) & current_data$pval_adj < 0.05, ]
sig_down <- current_data[current_data$logfc < -log2(FC) & current_data$pval_adj < 0.05, ]

sig_up2 <- current_data[current_data$logfc > log2(FC) & current_data$pval_adj > 0.05, ]
sig_down2 <- current_data[current_data$logfc < -log2(FC) & current_data$pval_adj > 0.05, ]

sig_up3 <- current_data[current_data$logfc > 0 & current_data$pval_adj < 0.05, ]
sig_down3 <- current_data[current_data$logfc < 0 & current_data$pval_adj < 0.05, ]

insig <- current_data[
  current_data$logfc < log2(FC) &
    current_data$logfc > -log2(FC) &
    current_data$pval_adj > 0.05,
]

sig_up <- sig_up[order(sig_up$pval_adj), ]
sig_down <- sig_down[order(sig_down$pval_adj), ]

size <- 4.2

# Only visualize selected genes
up <- c(
  "C5AR2", "CCL18", "LYVE1", "SOCS3", "DUSP4", "G0S2",
  "CXCL1", "CXCL2", "CXCL3", "CCL8", "CCL23", "CXCL5",
  "CXCL10", "CCL18", "CXCR4", "IL27", "FABP4", "TNFRSF613",
  "ENO2", "CNTF", "INHBE", "PF4", "GEM", "ITGB3",
  "PPBP", "MMP12"
)

sig_up_label <- sig_up[sig_up$Gene %in% up, ]

down <- c(
  "CFD", "PECAM1", "ABI3", "GAPT", "MARCKSL1", "GPR82",
  "XYLT1", "NCF1C", "ATP6V0D2", "TMEM119", "EBI3", "FGL2"
)

sig_down_label <- sig_down[sig_down$Gene %in% down, ]

# Check labeled genes
dim(sig_up_label)

## [1] 22  5

dim(sig_down_label)

## [1] 12  5

options(repr.plot.height = 7, repr.plot.width = 9)

p = ggplot(current_data, aes(x = logfc, y = -log10(pval_adj))) +
  geom_point(data = sig_up, col = "red", alpha = 1, size = 1) +
  geom_point(data = sig_down, col = "blue", alpha = 1, size = 1) +
  geom_point(data = sig_up_label, col = "red", alpha = 1, size = 2.5) +
  geom_point(data = sig_down_label, col = "blue", alpha = 1, size = 2.5) +
  geom_point(data = sig_up2, col = "pink", alpha = 0.5, size = 1) +
  geom_point(data = sig_down2, col = "skyblue", alpha = 0.5, size = 1) +
  geom_point(data = sig_up3, col = "pink", alpha = 0.5, size = 1) +
  geom_point(data = sig_down3, col = "skyblue", alpha = 0.5, size = 1) +
  geom_point(data = insig, col = "grey", alpha = 0.5, size = 1) +
  geom_hline(
    yintercept = -log10(0.05),
    colour = "#990000",
    linetype = "dashed"
  ) +
  geom_vline(
    xintercept = log2(FC),
    colour = "#990000",
    linetype = "dashed"
  ) +
  geom_vline(
    xintercept = -log2(FC),
    colour = "#990000",
    linetype = "dashed"
  ) +
  ggrepel::geom_label_repel(
    data = sig_up_label,
    aes(label = Gene),
    max.overlaps = 30,
    fontface = "italic",
    size = size,
    segment.color = "grey",
    box.padding = 0.5
  ) +
  ggrepel::geom_label_repel(
    data = sig_down_label,
    aes(label = Gene),
    max.overlaps = 30,
    fontface = "italic",
    size = size,
    segment.color = "grey",
    box.padding = 0.5
  ) +
  theme_bw(base_size = 18) +
  theme(
    legend.position = "right",
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15)
  ) +
  xlab(bquote(log[2](.("SF_T rC5a") / .("control")))) +
  ylab(expression(-log[10]("Adjusted p")))

print(p)

#ggsave("volcano_c5.pdf", width = 9, height = 7, dpi = 300)

ggsave(
  filename = "SF_T_100nM_C5a_vs_No_tx_volcano.png",
  plot = p,
  width = 9,
  height = 7,
  dpi = 300,
  units = "in"
)

library(stringr)
library(tidyverse)
library(ggrepel)

FC <- 2
tmp <- all_output_juan

# Only plot this condition
condition_use <- "a100nM_C5a_vs_No_tx"

current_data <- tmp[tmp$condition == condition_use, ]

# Optional checks
dim(current_data)

## [1] 5341    5

unique(current_data$condition)

## [1] "a100nM_C5a_vs_No_tx"

sig_up <- current_data[current_data$logfc > log2(FC) & current_data$pval_adj < 0.05, ]
sig_down <- current_data[current_data$logfc < -log2(FC) & current_data$pval_adj < 0.05, ]

sig_up2 <- current_data[current_data$logfc > log2(FC) & current_data$pval_adj > 0.05, ]
sig_down2 <- current_data[current_data$logfc < -log2(FC) & current_data$pval_adj > 0.05, ]

sig_up3 <- current_data[current_data$logfc > 0 & current_data$pval_adj < 0.05, ]
sig_down3 <- current_data[current_data$logfc < 0 & current_data$pval_adj < 0.05, ]

insig <- current_data[
  current_data$logfc < log2(FC) &
    current_data$logfc > -log2(FC) &
    current_data$pval_adj > 0.05,
]

sig_up <- sig_up[order(sig_up$pval_adj), ]
sig_down <- sig_down[order(sig_down$pval_adj), ]

size <- 4.2

# Only visualize selected genes
up <- c(
  "CXCR4", "CXCL3", "CXCL5", "CCL3", "CCL22", "DUSP5",
  "SLC7A5", "SERPINE1", "CNIH3", "FABP4", "EREG",
  "ICAM5", "DKK3", "PDCD1", "ITGB3", "GREM1",
  "SLC28A3", "MMP12", "C2orf72", "TBX21", "NOTCH3"
)

sig_up_label <- sig_up[sig_up$Gene %in% up, ]

down <- c(
  "MAF", "PECAM1", "IGFBP4", "CFD", "GAPT", "HSPA6",
  "ABI3", "CFB", "CEBPD", "SELENOP", "CEACAM4"
)

sig_down_label <- sig_down[sig_down$Gene %in% down, ]

# Check labeled genes
dim(sig_up_label)

## [1] 20  5

dim(sig_down_label)

## [1] 10  5

options(repr.plot.height = 7, repr.plot.width = 9)

p = ggplot(current_data, aes(x = logfc, y = -log10(pval_adj))) +
  geom_point(data = sig_up, col = "red", alpha = 1, size = 1) +
  geom_point(data = sig_down, col = "blue", alpha = 1, size = 1) +
  geom_point(data = sig_up_label, col = "red", alpha = 1, size = 2.5) +
  geom_point(data = sig_down_label, col = "blue", alpha = 1, size = 2.5) +
  geom_point(data = sig_up2, col = "pink", alpha = 0.5, size = 1) +
  geom_point(data = sig_down2, col = "skyblue", alpha = 0.5, size = 1) +
  geom_point(data = sig_up3, col = "pink", alpha = 0.5, size = 1) +
  geom_point(data = sig_down3, col = "skyblue", alpha = 0.5, size = 1) +
  geom_point(data = insig, col = "grey", alpha = 0.5, size = 1) +
  geom_hline(
    yintercept = -log10(0.05),
    colour = "#990000",
    linetype = "dashed"
  ) +
  geom_vline(
    xintercept = log2(FC),
    colour = "#990000",
    linetype = "dashed"
  ) +
  geom_vline(
    xintercept = -log2(FC),
    colour = "#990000",
    linetype = "dashed"
  ) +
  ggrepel::geom_label_repel(
    data = sig_up_label,
    aes(label = Gene),
    max.overlaps = 30,
    fontface = "italic",
    size = size,
    segment.color = "grey",
    box.padding = 0.5
  ) +
  ggrepel::geom_label_repel(
    data = sig_down_label,
    aes(label = Gene),
    max.overlaps = 30,
    fontface = "italic",
    size = size,
    segment.color = "grey",
    box.padding = 0.5
  ) +
  theme_bw(base_size = 18) +
  theme(
    legend.position = "right",
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15)
  ) +
  xlab(bquote(log[2](.("rC5a") / .("control")))) +
  ylab(expression(-log[10]("Adjusted p")))

print(p)

ggsave(
  filename = "C5a_vs_No_tx_volcano.png",
  plot = p,
  width = 9,
  height = 7,
  dpi = 300,
  units = "in"
)
#ggsave("volcano_c5_notx.pdf", width = 9, height = 7, dpi = 300)

Differential gene expression analysis for bulk RNA-seq to identify complement treatment effect

12 June, 2026

1 Quality Control

2 Check bias of gc content and length biases

3 DE Analysis after preprocessing