title: “R Notebook”
output: html_notebook
1 下载加载包
cran_packages <- c('tidyr',
'tibble',
'dplyr',
'stringr',
'ggplot2',
'ggpubr',
'factoextra',
'FactoMineR',
'WGCNA')
Biocductor_packages <- c('GEOquery',
'hgu133plus2.db',
"KEGG.db",
"limma",
"impute",
"GSEABase",
"GSVA",
"clusterProfiler",
"genefu",
"org.Hs.eg.db",
"preprocessCore",
"hugene10sttranscriptcluster.db")
for (pkg in c(Biocductor_packages,cran_packages)){
require(pkg,character.only=T)
}
2 下载数据
rm(list = ls())
library(GEOquery)
eSet <- getGEO("GSE42872",
destdir = '.',
getGPL = F)
# 从eSet中提取表达矩阵exp
exp <- exprs(eSet[[1]])
head(exp)
3 ID转换
3.1 方案一:可以找到对应平台
##探针ID(probe_id)转换成symbol ID(方案一:可以找到对应平台)
eSet[[1]]@annotation # 查看GPL号
library(hugene10sttranscriptcluster.db) # 加载特定的R包,下载哪个包,需要根据GPL来定
ls("package:hugene10sttranscriptcluster.db") # 注意加上后缀.db
ids=toTable(hugene10sttranscriptclusterSYMBOL) #想要得到probe_id和symbol的对应关系要用hugene10sttranscriptclusterSYMBOL数据集,用toTable函数提取数据集里面的信息
head(ids) #查看1-6行
# unique函数是用来:Extract Unique Elements 去除重复的symbol只提取不同的元素;length函数统计去重之后还有多少个基因。
length(unique(ids$symbol))
#table() 函数可以生成频数统计表,这里就是统计每个基因symbol出现的次数然后将其表格化;
#sort()函数将symbol出现的频率从小到大进行排序;tail()取最后6个即出现频率最大的6个。
tail(sort(table(ids$symbol)))
# table一下我们可以看到,多少个基因设计了几个探针;
table(sort(table(ids$symbol)))
# ids$probe_id是具有对应基因的所有探针,所以返回的TRUE就是文章数据中有对应基因的探针数。
table(rownames(exp) %in% ids$probe_id)
dim(exp)
# 对探针进行过滤,把没有对应基因名的探针过滤掉
exp = exp[rownames(exp) %in% ids$probe_id,]
dim(exp)
# match函数把ids里的探针顺序改一下,使ids里探针顺序和我们表达矩阵的顺序完全一样。
ids=ids[match(rownames(exp),ids$probe_id),]
head(ids)
head(exp)
# by()函数在这里发挥的功能就是将表达矩阵exp中的探针分组,同一个symbol所对应的多个探针分到一组,并对每组探针进行统计得到symbol所对应的唯一探针。所以tmp里放着by()函数的统计结果即每个symbol所对应的唯一探针IDprobe_id,用probes = as.character(tmp)将结果变身为纯字符型向量
## 具体:第二个参数ids$symbol定义了分组,将第一参数—exp表达矩阵分成了若干个小矩阵,每个小矩阵里存放着同一个symbol所对应的所有探针。第三个参数是我们自己定义的函数:计算每个小矩阵中每行探针表达量的平均值(也就是每个探针在6个样本中表达量的均值rowMeans(x)),再取平均值最大的那个探针作为该symbol所对应的唯一探针which.max(rowMeans(x))。
## by()函数就可以返回每个分组里的统计结果,即每个symbol所对应的唯一探针IDprobe_id。这时,探针ID和基因symbol就一一对应了,将表达矩阵探针ID即exp表达矩阵的行名(rownames(exp))换为基因symbol
tmp = by(exp,
ids$symbol,
function(x) rownames(x)[which.max(rowMeans(x))])
probes = as.character(tmp)
head(tmp)
head(probes)
dim(exp)
exp = exp[rownames(exp) %in% probes,]
dim(exp)
rownames(exp)=ids[match(rownames(exp),ids$probe_id),2]
head(exp)
pd <- pData(eSet[[1]]) # pData函数得到每个样本的描述信息
head(pd)
save(pd,exp,file = "step1output.Rdata")
save(exp,file = "DEGinput.Rdata")
rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
load(file = "step1output.Rdata")
3.2 方案二:找不到GPL平台对应的R注释包
# 如果找不到GPL平台对应的R注释包(方案二)
## 下载平台信息(GPL),从平台信息中选择我们想要的列:探针名、基因名....
gpl <- getGEO('GPL6480', destdir = ".")
colnames(Table(gpl))
head(Table(gpl)[,c(1,6,7)]) #看gpl对象中哪一列是我们想要的取出来,发现1/6/7列是我们想要的
write.csv(Table(gpl)[,c(1,6,7)],"GPL6480.csv") #把我们想要的部分即探针名对应的基因名....存起来
3.3 获取分组信息—group_list, 哪些组是control;哪些组是tumor
# 方法一:使用stringr函数
library(stringr)
group_list = ifelse(str_detect(pd$title,"Control")==TRUE,"contorl","treat")
group_list
# stringr包用于字符串的处理,str_detect是该包里的函数,用来确定一个字符向量能否匹配一种模式。它返回一个与输入向量具有同样长度的逻辑向量:
# 方法二:自己造一个
group_list=c(rep("control",times=3),rep("treat",times=3))
4 boxplot
4.1 检查表达矩阵,画典型基因表达量的boxplot
exp['GAPDH',]
exp['ACTB',]
boxplot(exp)
boxplot(exp['GAPDH',])
boxplot(exp['ACTB',])
4.2 #各个样本表达量的boxplot, 准备画图所需数据exp_L
library(reshape2)
head(exp)
exp_L = melt(exp)
head(exp_L)
colnames(exp_L)=c('symbol','sample','value')
head(exp_L)
4.3 获得分组信息
library(stringr)
group_list = ifelse(str_detect(pd$title,"Control")==TRUE,"contorl","treat")
group_list
exp_L$group = rep(group_list,each=nrow(exp))
head(exp_L)
table(exp_L[,2])
dim(exp_L)
5 ggplot2绘图,聚类,PCA
library(ggplot2)
p = ggplot(exp_L,
aes(x=sample,y=value,fill=group))+geom_boxplot()
print(p)
# 对表达矩阵进行聚类绘图,并添加样本的临床表型数据信息(更改样本名)
## hclust
# 更改表达矩阵列名
head(exp)
colnames(exp) = paste(group_list,1:6,sep='')
head(exp)
# 定义nodePar
nodePar <- list(lab.cex = 0.6, pch = c(NA, 19),
cex = 0.7, col = "blue")
# 聚类
hc=hclust(dist(t(exp)))
par(mar=c(5,5,5,10))
# 绘图
plot(as.dendrogram(hc), nodePar = nodePar, horiz = TRUE)
## PCA
library(ggfortify)
# 互换行和列,dim一下
head(exp)
df=as.data.frame(t(exp))
# 不要view df,列太多,软件会崩掉;
dim(df)
dim(exp)
exp[1:6,1:6]
df[1:6,1:6]
df$group=group_list
autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')
save(exp,group_list,file = "step2output.Rdata")
6 用limma对芯片数据进行差异分析
#差异分析——limma
rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
load(file = "step2output.Rdata")
dim(exp)
library(limma)
# 做分组矩阵
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exp)
design #分组矩阵
# 做比较矩阵
# contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),levels = design)
# contrast.matrix ##这个矩阵声明,我们要把treat组和contorl组进行差异分析比较
# -1和1的意思是contorl是用来被比的,treat是来比的
contrast.matrix<-makeContrasts(paste0(c("treat","contorl"),collapse = "-"),levels = design)
contrast.matrix
#到此,做差异分析所需要的三个矩阵就做好了:表达矩阵、分组矩阵、差异比较矩阵
#我们已经制作好了必要的输入数据,下面开始讲如何使用limma这个包来进行差异分析了!
##step1
fit <- lmFit(exp,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix) ##这一步很重要,大家可以自行看看效果
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
##step3
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
#write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
head(nrDEG)
save(exp,group_list,nrDEG,file = "DEGoutput.Rdata")
7 热图
# 热图的类似代码如下
library(pheatmap)
annotation_col=data.frame(group=Group)
rownames(annotation_col)=colnames(n)
pheatmap(n,
show_colnames =F,
show_rownames = F,
annotation_col=annotation_col
)
8 火山图
rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
load(file = "DEGoutput.Rdata")
colnames(nrDEG)
plot(nrDEG$logFC,-log10(nrDEG$P.Value))
DEG=nrDEG
logFC_cutoff <- with(DEG,mean(abs( logFC)) + 2*sd(abs( logFC)) )
DEG$change = as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
this_tile
head(DEG)
g = ggplot(data=DEG, aes(x=logFC, y=-log10(P.Value), color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
9 富集分析
9.1 准备工作:首先对差异表达矩阵nrDEG,进行加工
1.把行名变成SYMBOL列
rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
load(file = "DEGoutput.Rdata")
library(dplyr)
deg = nrDEG
deg <- mutate(deg,symbol = rownames(deg))
head(deg)
2.加change列:上调或下调,火山图要用
logFC_t = 1 #不同的阈值,筛选到的差异基因数量就不一样,后面的超几何分布检验结果就大相径庭。
change=ifelse(deg$P.Value>0.01,'stable',
ifelse( deg$logFC >logFC_t,'up',
ifelse( deg$logFC < -logFC_t,'down','stable') )
)
deg <- mutate(deg,change)
head(deg)
table(deg$change)
3.加ENTREZID列,后面富集分析要用
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
s2e <- bitr(unique(deg$symbol), fromType = "SYMBOL", #ID转换核心函数bitr
toType = c( "ENTREZID"),
OrgDb = org.Hs.eg.db)
head(s2e)
head(deg)
deg <- inner_join(deg,s2e,by=c("symbol"="SYMBOL"))
head(deg)
save(exp,group_list,deg,file = "enrich_input.Rdata")
9.2 富集分析
rm(list = ls())
options(stringsAsFactors = F)
load(file = 'enrich_input.Rdata')
## 1.KEGG pathway analysis
#上调、下调、差异、所有基因
9.2.1 clusterProfiler作kegg富集分析:
library(clusterProfiler)
gene_up= deg[deg$change == 'up','ENTREZID']
gene_down=deg[deg$change == 'down','ENTREZID']
gene_diff=c(gene_up,gene_down)
gene_all = deg[,'ENTREZID']
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.up)[,1:6]
dim(kk.up)
kk.down <- enrichKEGG(gene = gene_down,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.down)[,1:6]
dim(kk.down)
kk.diff <- enrichKEGG(gene = gene_diff,
organism = 'hsa',
pvalueCutoff = 0.05)
head(kk.diff)[,1:6]
class(kk.diff)
#提取出数据框
kegg_diff_dt <- kk.diff@result
#根据pvalue来选,用于可视化
down_kegg <- kk.down@result %>%
filter(pvalue<0.05) %>%
mutate(group=-1)
up_kegg <- kk.up@result %>%
filter(pvalue<0.05) %>%
mutate(group=1)
#可视化
kegg_plot <- function(up_kegg,down_kegg){
dat=rbind(up_kegg,down_kegg)
colnames(dat)
dat$pvalue = -log10(dat$pvalue)
dat$pvalue=dat$pvalue*dat$group
dat=dat[order(dat$pvalue,decreasing = F),]
g_kegg<- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) +
geom_bar(stat="identity") +
scale_fill_gradient(low="blue",high="red",guide = FALSE) +
scale_x_discrete(name ="Pathway names") +
scale_y_continuous(name ="log10P-value") +
coord_flip() + theme_bw()+theme(plot.title = element_text(hjust = 0.5))+
ggtitle("Pathway Enrichment")
}
g_kegg <- kegg_plot(up_kegg,down_kegg)
g_kegg
ggsave(g_kegg,filename = 'kegg_up_down.png')
9.2.2 gsea作kegg富集分析
data(geneList, package="DOSE")
head(geneList)
length(geneList)
names(geneList)
boxplot(geneList)
boxplot(deg$logFC)
geneList=deg$logFC
names(geneList)=deg$ENTREZID
geneList=sort(geneList,decreasing = T)
kk_gse <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 120,
pvalueCutoff = 0.9,
verbose = FALSE)
head(kk_gse)[,1:6]
gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
gse_kegg=kegg_plot(up_kegg,down_kegg)
print(gse_kegg)
ggsave(gse_kegg,filename ='kegg_up_down_gsea.png')
9.3 GO database analysis
9.3.1 富集分析
library(clusterProfiler)
#输入数据
gene_up= deg[deg$change == 'up','ENTREZID']
gene_down=deg[deg$change == 'down','ENTREZID']
gene_diff=c(gene_up,gene_down)
head(deg)
9.3.2 GO分析三大块
#细胞组分
ego_CC <- enrichGO(gene = gene_diff,
OrgDb= org.Hs.eg.db,
ont = "CC",
pAdjustMethod = "BH",
minGSSize = 1,
pvalueCutoff = 0.01,
qvalueCutoff = 0.01,
readable = TRUE)
#生物过程
ego_BP <- enrichGO(gene = gene_diff,
OrgDb= org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
minGSSize = 1,
pvalueCutoff = 0.01,
qvalueCutoff = 0.01,
readable = TRUE)
#分子功能:
ego_MF <- enrichGO(gene = gene_diff,
OrgDb= org.Hs.eg.db,
ont = "MF",
pAdjustMethod = "BH",
minGSSize = 1,
pvalueCutoff = 0.01,
qvalueCutoff = 0.01,
readable = TRUE)
save(ego_CC,ego_BP,ego_MF,file = "ego_GPL6244.Rdata")
rm(list = ls())
load(file = "ego_GPL6244.Rdata")
#第一种,条带图,按p从小到大排的
barplot(ego_CC, showCategory=20,title="EnrichmentGO_CC")
barplot(ego_BP, showCategory=20,title="EnrichmentGO_CC")
#如果运行了没出图,就dev.new()
#第二种,点图,按富集数从大到小的
dotplot(ego_CC,title="EnrichmentGO_BP_dot")
#保存
pdf(file = "dotplot_GPL6244.pdf")
dotplot(ego_CC,title="EnrichmentGO_BP_dot")
dev.off()