在生信分析和数据挖掘时,如果研究比较少,无法通过对比获得差异基因,只能整合同一疾病的多个数据集进行综合分析。但是其中一个经常被忽视的问题就是批次效应 (batch effect),这是因为检测受到实验条件、时间、试剂批次和人员等多种差异因素的影响。当批次效应与感兴趣的结果相关并导致不正确结论时,这将成为一个主要问题。
批次效应前
一般认为,不同数据集不能直接合并;同一平台的芯片或测序数据去除批次效应后合并;不同平台的数据集则不建议合并;芯片数据和测序数据不建议合并。因此,我们的推荐是①;②韦恩图。退而求其次的策略是做不同批次数据集的合并分析(二代测序数据中GTEx和TCGA合并分析也是次选)。
批次效应后
系统性红斑狼疮(Systemic lupus erythematosus, SLE)是最常见的自身免疫病之一, 被称为“不死的癌症”。目前这种疾病无法根治,但经正规科学治疗,病情可以得到控制。果友在处理GEO数据库中的SLE数据时,遇到问题,我们一起探讨下。
下载GSE169080、GSE144390以及GSE139350数据集,对 SLE 患者和健康对照组PBMC的高通量测序结果进行再分析。利用 GEO2R 和 omicstudio 分析工具筛选 DEG 后,应用Enrichr 在线工具对各个数据集中上调的差异基因进行富集分析,然后对3个数据集各自的上调通路进行overlap分析,发现有139个通路重合 (图2A)。
随后,用omicstudio绘图工具对GSE169080数据集中这些重合的上调通路做差异分析,发现除Toll样受体信号通路(toll-like receptor,TLR)等经典相关通路外,多个代谢通路如OXPHOS、丙酮酸代谢、三羧酸循环和糖代谢通路等在系统性红斑狼疮患者中均上调(图2B),提示 SLE 患者整体代谢水平异常。
果友的想法是,合并不同测序平台的SLE数据集,然后分为正常组和疾病组进行差异分析,尽管这种方法不是我们所推荐的。我们将借助这次分析,结合DeepSeek就多平台数据集的分析展开系列讨论。本次分析使用的是数据集分别是GSE61635(GPL570),GSE65391(GPL10558)和GSE121239(GPL13158)。我们从GSE61635数据集开始。
GSE61635是79个患者99个样本和30个健康志愿者30个样本的外周血的芯片数据。我们借助DeepSeek完成其数据读取、数据清洗、数据分组、主成分分析和差异分析。
########DeepSeek生成的代码--------------------------------------------
# 加载必要的R包
library(Biobase)
library(GEOquery)
library(limma)
library(ggplot2)
library(tidyverse)
library(ggrepel)
library(hgu133plus2.db) # 用于探针ID转换
# 1. 数据读取
gse <- getGEO(filename = "GSE61635_series_matrix.txt.gz", getGPL = F)
# 提取表达矩阵
expr_data <- exprs(gse)
# 2. 数据清洗和log2转换
# 检查数据是否需要log2转换
qx <- as.numeric(quantile(expr_data, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm = T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0)
if (LogC) {
expr_data[which(expr_data <= 0)] <- NaN
expr_data <- log2(expr_data)
}
# 3. 数据标准化
expr_data <- normalizeBetweenArrays(expr_data)
# 4. 样本分组
# 从样本描述中提取分组信息
sample_info <- pData(gse)
sample_info$group <- ifelse(grepl("SLE patient", sample_info$title), "SLE",
ifelse(grepl("Healthy volunteer", sample_info$title), "CTRL", NA))
# 确保只保留SLE和CTRL样本
keep_samples <- !is.na(sample_info$group)
expr_data <- expr_data[, keep_samples]
sample_info <- sample_info[keep_samples, ]
# 5. 探针ID转换(取平均值)
# 将探针ID转换为基因符号
probe_ids <- rownames(expr_data)
gene_symbols <- mapIds(hgu133plus2.db,
keys = probe_ids,
column = "SYMBOL",
keytype = "PROBEID")
# 去除没有对应基因符号的探针
keep_probes <- !is.na(gene_symbols)
expr_data <- expr_data[keep_probes, ]
gene_symbols <- gene_symbols[keep_probes]
# 对多个探针对应同一基因的情况取平均值
expr_data <- avereps(expr_data, ID = gene_symbols)
# 6. PCA分析
pca <- prcomp(t(expr_data), scale. = TRUE)
pca_data <- data.frame(pca$x[, 1:2],
Group = sample_info$group,
Sample = sample_info$title)
# 绘制PCA图
ggplot(pca_data, aes(x = PC1, y = PC2, color = Group, shape = Group)) +
geom_point(size = 4, alpha = 0.8) + # 增加点大小和透明度
stat_ellipse(level = 0.95, show.legend = FALSE) + # 添加95%置信椭圆
theme_minimal(base_size = 12) + # 增大基础字体大小
labs(title = "PCA Plot of SLE vs CTRL Samples",
x = paste0("PC1 (", round(summary(pca)$importance[2,1]*100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2]*100, 1), "%)"),
color = "Group", shape = "Group") + # 添加图例标题
scale_color_manual(values = c("CTRL" = "#1F78B4", "SLE" = "#E31A1C")) + # 使用更专业的颜色
scale_shape_manual(values = c("CTRL" = 16, "SLE" = 17)) + # 不同组使用不同形状
theme(
plot.title = element_text(hjust = 0.5, face = "bold"), # 标题居中加粗
legend.position = "right", # 图例放在右侧
panel.grid.minor = element_blank(), # 去除次要网格线
aspect.ratio = 1 # 保持1:1纵横比
)
# 保存PCA图
ggsave("PCA_plot.png", width = 10, height = 8, dpi = 300)
# 7. 差异表达分析
# 创建设计矩阵
design <- model.matrix(~0 + sample_info$group)
colnames(design) <- c("CTRL", "SLE")
# 拟合线性模型
fit <- lmFit(expr_data, design)
# 设置对比矩阵
contrast_matrix <- makeContrasts(SLE - CTRL, levels = design)
# 计算对比
fit2 <- contrasts.fit(fit, contrast_matrix)
fit2 <- eBayes(fit2)
# 获取差异分析结果
diff_results <- topTable(fit2, number = Inf, adjust.method = "BH")
# 添加基因名列
diff_results$Gene <- rownames(diff_results)
# 重新排列列顺序
diff_results <- diff_results[, c("Gene", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B")]
# 8. 保存差异分析结果
library(openxlsx)
write.xlsx(diff_results, "SLE_vs_CTRL_Differential_Expression.xlsx", row.names = FALSE)
sum(diff_results$adj.P.Val < 0.05 & diff_results$logFC > 1)
sum(diff_results$adj.P.Val < 0.05 & diff_results$logFC < -1)
# 9. 绘制火山图
# 准备火山图数据
volcano_data <- diff_results %>%
mutate(
Significance = case_when(
adj.P.Val < 0.05 & logFC > 1 ~ "Up-regulated",
adj.P.Val < 0.05 & logFC < -1 ~ "Down-regulated",
TRUE ~ "Not significant"
),
Gene_label = ifelse(
(adj.P.Val < 0.05 & abs(logFC) > 1) |
(Gene %in% head(diff_results[order(diff_results$adj.P.Val), "Gene"], 10)),
Gene, ""
)
)
# 定义颜色
volcano_colors <- c(
"Up-regulated" = "#E31A1C",
"Down-regulated" = "#1F78B4",
"Not significant" = "grey60"
)
# 绘制火山图
ggplot(volcano_data, aes(x = logFC, y = -log10(adj.P.Val), color = Significance)) +
geom_point(size = 2, alpha = 0.7) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "black", alpha = 0.5) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black", alpha = 0.5) +
scale_color_manual(values = volcano_colors) +
labs(
title = "Volcano Plot: SLE vs CTRL",
x = "log2 Fold Change (SLE/CTRL)",
y = "-log10(Adjusted P-value)",
color = "Expression"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "right",
panel.grid.major = element_line(color = "grey90"),
panel.grid.minor = element_blank(),
aspect.ratio = 1
) +
geom_text_repel(
aes(label = Gene_label),
size = 3,
max.overlaps = 20,
box.padding = 0.5,
point.padding = 0.2,
segment.color = "grey50"
) +
xlim(
min(volcano_data$logFC, na.rm = TRUE) - 0.5,
max(volcano_data$logFC, na.rm = TRUE) + 0.5
)
# 保存火山图
ggsave("Volcano_plot.png", width = 10, height = 8, dpi = 300)
# 10. 输出差异基因
# 筛选显著差异表达基因 (adj.P.Val < 0.05 且 |logFC| > 1)
sig_genes <- diff_results %>%
filter(adj.P.Val < 0.05 & abs(logFC) > 1) %>%
arrange(desc(logFC)) # 按logFC从大到小排序
# 分别获取上调和下调基因
up_genes <- sig_genes %>% filter(logFC > 1)
down_genes <- sig_genes %>% filter(logFC < -1)
# 输出统计信息
cat("总差异表达基因数:", nrow(sig_genes), "\n")
cat("上调基因数:", nrow(up_genes), "\n")
cat("下调基因数:", nrow(down_genes), "\n")
# 保存所有差异基因列表
write.xlsx(
list(
"All_DE_Genes" = sig_genes,
"Upregulated" = up_genes,
"Downregulated" = down_genes
),
file = "Differential_Expression_Genes.xlsx",
row.names = FALSE
)
# 保存为CSV文件(可选)
write.csv(sig_genes, "Differential_Expression_Genes.csv", row.names = FALSE)
# 输出前10个上调和下调基因
cat("\nTop 10上调基因:\n")
print(head(up_genes, 10)[, c("Gene", "logFC", "adj.P.Val")])
cat("\nTop 10下调基因:\n")
print(head(down_genes, 10)[, c("Gene", "logFC", "adj.P.Val")])第二个数据集GSE65391, 样本量更大, 也是芯片数据,不过测序平台不同。总共924个SLE样本,72个健康对照样本。比较稀缺的是,这些病人样本的测序数据是有时间追踪的(158个病人,连续检测4年),可以进行拟时序分析。
其结果已经在Cell杂志发表,题目为Personalized Immunomonitoring Uncovers Molecular Networks that Stratify Lupus Patients。从中性粒细胞signature角度出发进行分析。
我们借助DeepSeek完成其数据读取、数据清洗、数据分组、主成分分析和差异分析。
#### 完成GSE65391数据的分析流程,包括数据读取、清洗、标准化、差异分析和可视化-------------------
# 加载必要的R包
library(Biobase)
library(GEOquery)
library(limma)
library(ggplot2)
library(ggrepel)
library(dplyr)
library(tidyr)
library(openxlsx)
library(illuminaHumanv4.db) # 用于Illumina HumanHT-12 V4.0芯片的探针注释
# 1. 数据读取
gse <- getGEO(filename = "GSE65391_series_matrix.txt.gz", getGPL = F)
# 提取表达矩阵
expr_data <- exprs(gse)
# 2. 数据清洗和log2转换
# 检查数据是否需要log2转换
qx <- as.numeric(quantile(expr_data, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm = T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0)
if (LogC) {
expr_data[which(expr_data <= 0)] <- NaN
expr_data <- log2(expr_data)
}
# 3. 数据标准化
expr_data <- normalizeBetweenArrays(expr_data)
# 4. 样本分组
# 从样本描述中提取分组信息
sample_info <- pData(gse)
sample_info$group <- ifelse(grepl("whole blood-SLE", sample_info$title), "SLE",
ifelse(grepl("Healthy", sample_info$title), "CTRL", NA))
# 确保只保留SLE和CTRL样本
keep_samples <- !is.na(sample_info$group)
expr_data <- expr_data[, keep_samples]
sample_info <- sample_info[keep_samples, ]
# 5. 探针ID转换(取平均值)
# 将探针ID转换为基因符号
probe_ids <- rownames(expr_data)
gene_symbols <- mapIds(illuminaHumanv4.db,
keys = probe_ids,
column = "SYMBOL",
keytype = "PROBEID")
# 去除没有对应基因符号的探针
keep_probes <- !is.na(gene_symbols)
expr_data <- expr_data[keep_probes, ]
gene_symbols <- gene_symbols[keep_probes]
# 对多个探针对应同一基因的情况取平均值
expr_data <- avereps(expr_data, ID = gene_symbols)
# 6. PCA分析
# 在PCA分析前添加数据过滤步骤
# 6-1. 移除在所有样本中表达量相同的基因(零方差基因)
expr_var <- apply(expr_data, 1, var) # 计算每个基因的方差
nonzero_var_genes <- expr_var > 0 # 找出方差大于0的基因
expr_data_filtered <- expr_data[nonzero_var_genes, ] # 过滤数据
cat("过滤掉的零方差基因数量:", sum(!nonzero_var_genes), "\n")
cat("保留的基因数量:", sum(nonzero_var_genes), "\n")
# 6-2. 执行PCA分析
pca <- prcomp(t(expr_data_filtered), scale. = TRUE)
# 3. 绘制PCA图
pca_data <- data.frame(pca$x[, 1:2],
Group = sample_info$group,
Sample = sample_info$title)
ggplot(pca_data, aes(x = PC1, y = PC2, color = Group)) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "PCA Plot of SLE vs CTRL Samples (GSE65391)",
subtitle = paste("After removing", sum(!nonzero_var_genes), "zero-variance genes"),
x = paste0("PC1 (", round(summary(pca)$importance[2,1]*100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2]*100, 1), "%)")) +
scale_color_manual(values = c("CTRL" = "#1F78B4", "SLE" = "#E31A1C")) +
stat_ellipse(level = 0.95) # 添加95%置信椭圆
# 保存PCA图
ggsave("PCA_plot_GSE65391_filtered.png", width = 10, height = 8, dpi = 300)
# 7. 差异表达分析
# 创建设计矩阵
design <- model.matrix(~0 + sample_info$group)
colnames(design) <- c("CTRL", "SLE")
# 拟合线性模型
fit <- lmFit(expr_data, design)
# 设置对比矩阵
contrast_matrix <- makeContrasts(SLE - CTRL, levels = design)
# 计算对比
fit2 <- contrasts.fit(fit, contrast_matrix)
fit2 <- eBayes(fit2)
# 获取差异分析结果
diff_results <- topTable(fit2, number = Inf, adjust.method = "BH")
# 添加基因名列
diff_results$Gene <- rownames(diff_results)
# 重新排列列顺序
diff_results <- diff_results[, c("Gene", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B")]
# 筛选显著差异基因 (adj.P.Val < 0.05 & |logFC| > 1)
sig_diff <- diff_results %>%
filter(adj.P.Val < 0.05 & abs(logFC) > 1) %>%
arrange(desc(abs(logFC)))
# 8. 保存分析结果
# 创建Excel工作簿
wb <- createWorkbook()
# 添加全部基因工作表
addWorksheet(wb, "All_Genes")
writeData(wb, "All_Genes", diff_results, rowNames = FALSE)
# 添加差异基因工作表
addWorksheet(wb, "DEGs")
writeData(wb, "DEGs", sig_diff, rowNames = FALSE)
# 保存Excel文件
saveWorkbook(wb, "SLE_vs_CTRL_Differential_Expression_GSE65391.xlsx", overwrite = TRUE)
# 9. 绘制火山图
# 标记显著差异基因
diff_results$Significant <- ifelse(diff_results$adj.P.Val < 0.05 & abs(diff_results$logFC) > 1,
ifelse(diff_results$logFC > 1, "Up", "Down"), "Not")
# 绘制火山图
volcano_plot <- ggplot(diff_results, aes(x = logFC, y = -log10(adj.P.Val), color = Significant)) +
geom_point(size = 2, alpha = 0.6) +
scale_color_manual(values = c("Up" = "red", "Down" = "blue", "Not" = "grey")) +
theme_minimal() +
labs(title = "Volcano Plot of SLE vs CTRL",
x = "log2 Fold Change",
y = "-log10(Adjusted P-value)") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
theme(legend.position = "bottom")
volcano_plot
## 简单美化火山图
# 准备火山图数据
volcano_data2 <- diff_results %>%
mutate(
Significance = case_when(
adj.P.Val < 0.05 & logFC > 1 ~ "Up-regulated",
adj.P.Val < 0.05 & logFC < -1 ~ "Down-regulated",
TRUE ~ "Not significant"
),
Gene_label = ifelse(
(adj.P.Val < 0.05 & abs(logFC) > 1) |
(Gene %in% head(diff_results[order(diff_results$adj.P.Val),"Gene"],10)),
Gene,""
)
)
# 定义颜色
volcano_colors <- c(
"Up-regulated" ="#E31A1C",
"Down-regulated" = "#1F78B4",
"Not significant" = "grey60"
)
# 绘制火山图
ggplot(volcano_data2, aes(x= logFC,y = -log10(adj.P.Val), color = Significance)) +
geom_point(size = 3, alpha = 0.7) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "black", alpha = 0.5) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black", alpha = 0.5) +
scale_color_manual(values = volcano_colors) +
labs(
title ="Volcano Plot: SLE vs CTRL",
x = "log2 Fold Change (SLE/CTRL)",
y = "-log10(Adjusted P-value)",
color = "Expression"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "right",
panel.grid.major = element_line(color = "grey90"),
panel.grid.minor = element_blank(),
aspect.ratio = 1
) +
geom_text_repel(
aes(label = Gene_label),
size = 3,
max.overlaps = 20,
box.padding = 0.5,
point.padding = 0.2,
segment.color = "grey50"
) +
xlim(
min(volcano_data2$logFC, na.rm = TRUE) - 1.5,
max(volcano_data2$logFC, na.rm = TRUE) + 0.5
)
# 保存火山图
ggsave("Volcano_plot.png", width = 10, height = 8, dpi = 300)
热门跟贴