决定了！把免疫做到新高度就用这个分析！|（附代码）

WGCNA应用之免疫浸润

大家好，我是风风。今天我们要继续我们的WGCNA系列。在之前的推文中，我们已经一起做了对单个数据集进行了WGCNA分析并完整地走完了单个WGCNA的流程和针对多数据集联合分析的共识性分析，同时，在构建网络的过程中，我们也使用了“自动构建拓扑异构网络”法和“逐步法构建拓扑异构网络”两种方法，那么我们WGCNA的常用知识就算基本完结了，接下来无非就是利用WGCNA进行实战的内容。我们将通过三次不同角度的实战，利用我们自己的数据进行分析，看看如何将WGCNA应用到我们的文章之中。

其实关于WGCNA的应用，最关键的内容就是网络构建和联系网络和表型数据，这两步基本决定了分析结果的好坏。今天我们开始第一个内容——联合免疫浸润分析的结果和表达数据进行WGCNA分析：

WGCNA系列传送门

仅凭一个分析就能发3.5+SCI！万字长文教你彻底学会WGCNA！不分享说不过去吧？

比另一半更爱你！被你们问的最多的生信分析！一个分析就是一篇3+SCI！高级！给力！

被问爆的WGCNA教程，高分秘籍，有手就会！（附代码）

免疫浸润分析

免疫浸润分析我们使用耳熟能详的CIBERSORT算法，使用的数据和相关代码放在公众号后台，回复关键词即可获取：

rm(list = ls())# BiocManager::install("WGCNA") 安装R包# BiocManager::install("tidyverse") # BiocManager::install("reshape2") # BiocManager::install("plyr") # BiocManager::install("tidyr") # BiocManager::install("dplyr") # BiocManager::install("ggplot2") options(stringsAsFactors = FALSE)library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --## v ggplot2 3.3.3 v purrr 0.3.4## v tibble 3.1.2 v dplyr 1.0.6## v tidyr 1.1.3 v stringr 1.4.0## v readr 1.4.0 v forcats 0.5.1## -- Conflicts ------------------------------------------ tidyverse_conflicts() --## x dplyr::filter() masks stats::filter()## x dplyr::lag() masks stats::lag()library(reshape2)## ## 载入程辑包：'reshape2'## The following object is masked from 'package:tidyr':## ## smithslibrary(WGCNA)## 载入需要的程辑包：dynamicTreeCut## 载入需要的程辑包：fastcluster## ## 载入程辑包：'fastcluster'## The following object is masked from 'package:stats':## ## hclust#### ## 载入程辑包：'WGCNA'## The following object is masked from 'package:stats':## ## corlibrary(plyr)## ------------------------------------------------------------------------------## You have loaded plyr after dplyr - this is likely to cause problems.## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:## library(plyr); library(dplyr)## ------------------------------------------------------------------------------## ## 载入程辑包：'plyr'## The following objects are masked from 'package:dplyr':## ## arrange, count, desc, failwith, id, mutate, rename, summarise,## summarize## The following object is masked from 'package:purrr':## ## compactlibrary(dplyr)library(tidyr)library(ggplot2)# 免疫浸润分析# 进行CIBERSORT分析source("CIBERSORT.R") tpms_data <- "tpms_data.txt"LM22_file <- "LM22.txt"cibersort_res_tpms <- CIBERSORT(LM22_file, tpms_data, perm = 10, QN = TRUE)cibersort_res_tpms1 <- as.data.frame(cibersort_res_tpms)# 去除0值较多的结果cibersort_res_tpms2 <- cibersort_res_tpms1 %>% select(-c("P-value","Correlation","RMSE"))# 筛选出0值太多的一些细胞res_sample <- apply(cibersort_res_tpms2, 2, function(x) sum(x == 0) < dim(cibersort_res_tpms2)[1]/2)cibersort_res_tpms3 <- cibersort_res_tpms2[,res_sample] %>% as.matrix() %>% t()dim(cibersort_res_tpms3)## [1] 11 430library(RColorBrewer)mypalette <- colorRampPalette(brewer.pal(8,"Set3"))cibersort_res_tpms4 <- cibersort_res_tpms3 %>% as.data.frame() %>% rownames_to_column("cell_type") %>% gather(sample, fraction, - cell_type) cibersort_res_tpms5 <- cibersort_res_tpms3 %>% as.data.frame() %>% rownames_to_column("cell_type")ggplot(cibersort_res_tpms4, aes(cell_type, fraction, fill = cell_type)) + geom_boxplot(outlier.shape = 21, coulour = "black") + theme_bw() + labs(x = "Cell Type", y = "Estimated Proportion") + theme(axis.text.x = element_blank()) + theme(axis.ticks.x = element_blank()) + scale_fill_manual(values = mypalette(22))## Warning: Ignoring unknown parameters: coulour

# 导出免疫浸润分析结果作为表型数据write_tsv(cibersort_res_tpms5, "cibersort_res.txt")

WGCNA分析

获得了免疫浸润的数据之后，我们只需要把结果视为表型数据的一种就可以了，接下来我们要做的就是利用WGCNA分析构建网络并获得相应模块：

# 联合免疫浸润和WGCNA分析WGCNA_data <- read_tsv("tpms_data.txt") %>% column_to_rownames("GeneSymbol")## ## -- Column specification --------------------------------------------------------## cols(## .default = col_double(),## GeneSymbol = col_character()## )## i<U+00A0>Use `spec()` for the full column specifications.WGCNA_data[1:3, 1:3]## TCGA-AA-3975-01A-01R-1022-07 TCGA-AA-A00R-01A-01R-A002-07## A1BG 0.05639508 0.1112941## A1CF 35.75448131 0.4451762## A2M 292.91605034 228.2084637## TCGA-AY-A69D-01A-11R-A37K-07## A1BG 0.1540014## A1CF 44.0957226## A2M 206.3960444WGCNA_data1 <- WGCNA_data %>% t() %>% as.data.frame()dim(WGCNA_data1)## [1] 430 19470WGCNA_data1 <- log(WGCNA_data1 + 1)# 筛选基因变化较大的前50%的基因vars_res <- apply(WGCNA_data1, 2, var)# 计算百分位数截止值per_res <- quantile(vars_res, probs = seq(0, 1, 0.25)) upperGene <- WGCNA_data1[, which(vars_res > per_res[4])] WGCNA_data2 <- data.matrix(upperGene)# 对分析数据进行质量评估gsg <- goodSamplesGenes(WGCNA_data2, verbose = 3);## Flagging genes and samples with too many missing values...## ..step 1gsg[["allOK"]] # 全部合格## [1] TRUEdatExpr0 <- WGCNA_data2# 判断矩阵的样本是否都合格？gsg <- goodSamplesGenes(datExpr0, verbose = 3);## Flagging genes and samples with too many missing values...## ..step 1gsg$allOK # 显示TURE则为都合格，不需要进行额外处理，如果是FALSE，则运行下面这部分代码## [1] TRUEif (!gsg$allOK){# Optionally, print the gene and sample names that were removed:if (sum(!gsg$goodGenes)>0) printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));if (sum(!gsg$goodSamples)>0) printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));# Remove the offending genes and samples from the data: datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]}# 对样本进行聚类，dist为计算距离，hclust为聚类sampleTree <- hclust(dist(datExpr0), method = "average")# 聚类结果可视化# pdf("1_sampleTree.pdf",width = 10, height = 5)par(cex = 0.7);par(mar <- c(0,4,2,0))## NULLplot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)

# dev.off()datExpr <- datExpr0nGenes <- ncol(datExpr)nSamples <- nrow(datExpr)# 读取表型数据allTraits <- as.data.frame(t(cibersort_res_tpms3))dim(allTraits)## [1] 430 11names(allTraits)## [1] "B cells naive""Plasma cells"## [3] "T cells CD8""T cells CD4 memory resting"## [5] "T cells regulatory (Tregs)""NK cells resting"## [7] "Monocytes""Macrophages M0"## [9] "Macrophages M1""Macrophages M2"## [11] "Mast cells activated"# 合并表达矩阵和表型数据traitRows <- match(rownames(datExpr), rownames(allTraits))datTraits <- allTraits[traitRows, ]# 对样本进行重聚类sampleTree2 <- hclust(dist(datExpr), method = "average")# 设置颜色traitColors <- numbers2colors(datTraits, signed = FALSE)# 聚类结果可视化# pdf("2_sampleTree.pdf",width = 10, height = 5)plotDendroAndColors(sampleTree2, traitColors, groupLabels = names(datTraits), main = "Sample dendrogram and trait heatmap")

# dev.off()# 保存清洗后的数据save(datExpr, datTraits, file = "dataInput.RData")# 构建网络并进行模块识别# 设置多线程进行WGCNA分析enableWGCNAThreads()## Allowing parallel execution with up to 11 working processes.# 加载前面清洗完成的数据lnames <- load(file = "dataInput.RData")lnames## [1] "datExpr" "datTraits"# 设置软阈值区间powers = c(c(1:10), seq(from = 12, to = 20, by = 2))# 进行拓扑网络分析识别最佳软阈值sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)## pickSoftThreshold: will use block size 4868.## pickSoftThreshold: calculating connectivity for given powers...## ..working on genes 1 through 4868 of 4868## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.## 1 1 0.00659 0.123 0.841 781.000 7.30e+02 1490.0## 2 2 0.69200 -1.120 0.794 231.000 1.68e+02 771.0## 3 3 0.80700 -1.310 0.814 96.500 4.85e+01 487.0## 4 4 0.81800 -1.290 0.822 49.800 1.61e+01 337.0## 5 5 0.83900 -1.230 0.859 29.300 5.97e+00 246.0## 6 6 0.85000 -1.210 0.883 18.700 2.49e+00 185.0## 7 7 0.85100 -1.190 0.891 12.700 1.11e+00 143.0## 8 8 0.86400 -1.170 0.909 8.920 5.28e-01 112.0## 9 9 0.87700 -1.160 0.927 6.490 2.61e-01 89.4## 10 10 0.88400 -1.150 0.936 4.840 1.32e-01 72.2## 11 12 0.88500 -1.170 0.949 2.850 3.80e-02 49.9## 12 14 0.89600 -1.180 0.958 1.780 1.19e-02 35.6## 13 16 0.89400 -1.200 0.962 1.160 3.81e-03 26.1## 14 18 0.91700 -1.190 0.976 0.787 1.29e-03 19.5## 15 20 0.91600 -1.170 0.975 0.548 4.74e-04 14.9powerEstimates <- sft$powerEstimate # 查看自动识别的最佳软阈值powerEstimates## [1] 6# 如果上述没有自动识别最佳软阈值，则可以将结果进行可视化后尽心选取# pdf("2_Threshold.pdf",width = 10, height = 5)par(mfrow = c(1,2));cex1 = 0.9;plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence"));text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,cex=cex1,col="red");abline(h=0.90,col="red")plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n", main = paste("Mean connectivity"))text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

# dev.off() # 构建网络（很关键）softPower<- powerEstimates # 设置最佳软阈值adjacency<- adjacency(datExpr, power = softPower)# 将最邻近值转化为拓扑网络TOM<- TOMsimilarity(adjacency)## ..connectivity..## ..matrix multiplication (system BLAS)..## ..normalization..## ..done.dissTOM <- 1-TOM# 调用层次聚类函数geneTree<- hclust(as.dist(dissTOM), method = "average")# 如果我们喜欢大的模块，那我们就可以把最小模块的数值设置得比较高一些：minModuleSize = 50# 模块识别使用动态切割:dynamicMods<- cutreeDynamic(dendro = geneTree, distM = dissTOM,deepSplit = 2, pamRespectsDendro = FALSE,minClusterSize = minModuleSize)## ..cutHeight not given, setting it to 0.998 ===> 99% of the (truncated) height range in dendro.## ..done.table(dynamicMods)## dynamicMods## 0 1 2 3 4 5 6 7 8 ## 1124 893 770 765 464 378 244 144 86# 可以看到，每个模块的基因数目都大于50# 转换数字标签成颜色标签dynamicColors<- labels2colors(dynamicMods)table(dynamicColors)## dynamicColors## black blue brown green grey pink red turquoise ## 144 770 765 378 1124 86 244 893 ## yellow ## 464# 在下面画出树状图和颜色plotDendroAndColors(geneTree,dynamicColors,"Dynamic Tree Cut",dendroLabels = FALSE,hang = 0.03,addGuide = TRUE,guideHang = 0.05,main = "Gene dendrogram and module colors")

# 接下来，我们就可以计算Eigengenes了MEList <- moduleEigengenes(datExpr, colors = dynamicColors)str(MEList)## List of 12## $ eigengenes :'data.frame': 430 obs. of 9 variables:## ..$ MEblack : num [1:430] 0.00398 -0.01492 -0.0239 -0.01864 -0.02114 ...## ..$ MEblue : num [1:430] 0.01021 0.000788 -0.028916 0.023806 -0.031946 ...## ..$ MEbrown : num [1:430] -0.0173 0.1039 -0.0436 0.0921 -0.0788 ...## ..$ MEgreen : num [1:430] 0.0111 0.1147 -0.0125 0.0747 -0.0782 ...## ..$ MEgrey : num [1:430] -3.55e-02 1.98e-02 8.83e-05 5.94e-02 -9.34e-02 ...## ..$ MEpink : num [1:430] 0.0641 0.0736 -0.052 0.0685 -0.0504 ...## ..$ MEred : num [1:430] 0.09325 -0.01676 0.03156 0.00199 0.00716 ...## ..$ MEturquoise: num [1:430] 0.00806 -0.01397 -0.00363 -0.01001 -0.01136 ...## ..$ MEyellow : num [1:430] -0.0291 -0.0205 0.0416 -0.0316 0.0348 ...## $ averageExpr :'data.frame': 430 obs. of 9 variables:## ..$ AEblack : num [1:430] 0.111 -0.204 -0.299 -0.251 -0.289 ...## ..$ AEblue : num [1:430] 0.1636 -0.0595 -0.337 0.2479 -0.3418 ...## ..$ AEbrown : num [1:430] 0.2782 0.0865 -0.2335 0.4193 -0.3366 ...## ..$ AEgreen : num [1:430] 0.185 1.242 -0.212 0.864 -0.875 ...## ..$ AEgrey : num [1:430] 0.2614 -0.5079 -0.0675 -0.1852 0.0861 ...## ..$ AEpink : num [1:430] 0.726 0.931 -0.622 0.745 -0.56 ...## ..$ AEred : num [1:430] 0.9547 -0.1598 0.2893 0.016 0.0291 ...## ..$ AEturquoise: num [1:430] 0.1578 -0.288 -0.0457 -0.1067 -0.0892 ...## ..$ AEyellow : num [1:430] -0.212 -0.243 0.358 -0.32 0.372 ...## $ varExplained:'data.frame': 1 obs. of 9 variables:## ..$ X1: num 0.497## ..$ X2: num 0.406## ..$ X3: num 0.317## ..$ X4: num 0.394## ..$ X5: num 0.0866## ..$ X6: num 0.426## ..$ X7: num 0.289## ..$ X8: num 0.304## ..$ X9: num 0.333## $ nPC : num 1## $ validMEs : logi [1:9] TRUE TRUE TRUE TRUE TRUE TRUE ...## $ validColors : chr [1:4868] "brown" "turquoise" "blue" "blue" ...## $ allOK : logi TRUE## $ allPC : logi TRUE## $ isPC : logi [1:9] TRUE TRUE TRUE TRUE TRUE TRUE ...## $ isHub : logi [1:9] FALSE FALSE FALSE FALSE FALSE FALSE ...## $ validAEs : logi [1:9] TRUE TRUE TRUE TRUE TRUE TRUE ...## $ allAEOK : logi TRUEMEs <- MEList$eigengenes# 计算模块特征间的不相似性MEDiss = 1 - cor(MEs)METree <- hclust(as.dist(MEDiss), method = "average")# 合并相似模块merge <- mergeCloseModules(datExpr, dynamicColors, cutHeight = 0.2, verbose = 3)## mergeCloseModules: Merging modules whose distance is less than 0.2## multiSetMEs: Calculating module MEs.## Working on set 1 ...## moduleEigengenes: Calculating 9 module eigengenes in given set.## multiSetMEs: Calculating module MEs.## Working on set 1 ...## moduleEigengenes: Calculating 6 module eigengenes in given set.## Calculating new MEs...## multiSetMEs: Calculating module MEs.## Working on set 1 ...## moduleEigengenes: Calculating 6 module eigengenes in given set.mergedColors = merge$colors# 新合并模块的特征基因:mergedMEs = merge$newMEstable(mergedColors)## mergedColors## black brown grey pink red yellow ## 1807 1143 1124 86 244 464# 进行可视化# pdf("4_Module.pdf", 10, 5)plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)

# dev.off()# 重命名模块颜色名称moduleColors = mergedColorscolorOrder = c("grey", standardColors(50))moduleLabels = match(moduleColors, colorOrder) - 1MEs = mergedMEs# 保存模块颜色和标签，以供后续部分使用save(MEs, moduleLabels, moduleColors, geneTree, file = "networkConstruction-stepByStep.RData")

关联表型数据和WGCNA网络

现在我们手上已经有了WGCNA的模块网络，也有了免疫浸润分析的结果作为表型数据，接下来就是关联网络和表型数据进行分析看相关性，从而找到关键网络了：

# 联合表型数据library(reshape2)library(tidyverse)# 读取表型数据dim(allTraits)## [1] 430 11names(allTraits)## [1] "B cells naive""Plasma cells"## [3] "T cells CD8""T cells CD4 memory resting"## [5] "T cells regulatory (Tregs)""NK cells resting"## [7] "Monocytes""Macrophages M0"## [9] "Macrophages M1""Macrophages M2"## [11] "Mast cells activated"# 合并表达矩阵和表型数据traitRows <- match(rownames(datExpr), rownames(allTraits))datTraits <- allTraits[traitRows, ]data2 <- datTraits# 联合表型分析# 加载数据lnames = load(file = "networkConstruction-stepByStep.RData");lnames## [1] "MEs""moduleLabels""moduleColors""geneTree"nGenes = ncol(datExpr);nSamples = nrow(datExpr);# 重新赋予每个ME模块颜色标签MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenesMEs = orderMEs(MEs0)# 提取MEs相同样本的表型数据cosample <- intersect(rownames(data2), rownames(MEs))data3 <- data2[cosample,]data3 <- rownames_to_column(data3, "ID")write_tsv(data3, "WGCNA_pheno.txt")data3 <- read_tsv("WGCNA_pheno.txt") %>% column_to_rownames("ID")## ## -- Column specification --------------------------------------------------------## cols(## ID = col_character(),## `B cells naive` = col_double(),## `Plasma cells` = col_double(),## `T cells CD8` = col_double(),## `T cells CD4 memory resting` = col_double(),## `T cells regulatory (Tregs)` = col_double(),## `NK cells resting` = col_double(),## Monocytes = col_double(),## `Macrophages M0` = col_double(),## `Macrophages M1` = col_double(),## `Macrophages M2` = col_double(),## `Mast cells activated` = col_double()## )moduleTraitCor = cor(MEs, data3, use = "p")moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)textMatrix = paste(signif(moduleTraitCor, 2), "\n(", signif(moduleTraitPvalue, 1), ")", sep = "")dim(textMatrix) = dim(moduleTraitCor)# pdf("5_Module_trait.pdf", 8, 6)labeledHeatmap(Matrix = moduleTraitCor, xLabels = colnames(data3), yLabels = names(MEs), cex.lab = 0.5, ySymbols = colnames(MEs), colorLabels = FALSE, colors = colorRampPalette(c("blue", "white", "red"))(100), textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1), main = paste("Module trait relationships"))