<- list.files("tcga_meso", pattern = ".tsv",
allfiles full.names = T,recursive = T) # 查看到最里面一层文件夹
head(allfiles)
## [1] "tcga_meso/gdc_download_20240915_113923.366242/01d53607-1a69-4ce8-9058-705db14adc4e/853dbd81-9079-4d4f-aa38-05b04b159667.rna_seq.augmented_star_gene_counts.tsv"
## [2] "tcga_meso/gdc_download_20240915_113923.366242/03902ef2-8a01-4d14-ae8b-efd0734e712e/7a1d80d7-d1ee-4fc0-8299-cbf119263b52.rna_seq.augmented_star_gene_counts.tsv"
## [3] "tcga_meso/gdc_download_20240915_113923.366242/0a3a4b5f-6bdd-4839-a4ea-eb3baeeb031f/835763c1-a525-401c-bb56-90db5918a621.rna_seq.augmented_star_gene_counts.tsv"
## [4] "tcga_meso/gdc_download_20240915_113923.366242/0ea8ca4e-505b-469c-806a-eee66288e9fd/f8fae9b4-d6ff-4dc9-9018-15d91a8ed36e.rna_seq.augmented_star_gene_counts.tsv"
## [5] "tcga_meso/gdc_download_20240915_113923.366242/113ff3c8-dddc-4794-acfe-1d024f1be05e/11864629-bce9-4e38-9bd1-432df0a54e5b.rna_seq.augmented_star_gene_counts.tsv"
## [6] "tcga_meso/gdc_download_20240915_113923.366242/138a7897-ddf8-41e4-9555-ce07d62f316d/c46e9df6-eba7-49c3-a339-2dc82e208a04.rna_seq.augmented_star_gene_counts.tsv"
length(allfiles)
## [1] 87
13 TCGA数据下载和整理
选择TCGA数据下载和整理,原因主要有以下几个:
- 够复杂但不超纲,能够用到非常多前面介绍的知识,这些内容也全部都是R语言的基础知识;
- 够实用,TCGA数据下载和整理是很多人都需要的知识,但是很多初学者不会做;
- 前后对比,先介绍一个很复杂的方法,再介绍一个很简单的方法,要善于使用工具!
13.1 从官网下载TCGA数据
请跟着视频操作。
TCGA官网:https://portal.gdc.cancer.gov/
13.2 表达矩阵整理
查看所有文件(注意,和视频中的路径不一样,千万要注意你自己的路径!!):
用文本编辑器打开其中1个看看情况。如果你用Linux就很简单,windows推荐使用VScode,免费下载,大厂软件,质量有保障。
现在版本的TCGA官网下载的文件,一个文件是1个样本的结果,里面包括60660个mrna(既有编码RNA也有非编码RNA),同时给出了count/tpm/fpkm3种格式。
先读取其中1个试试看:
<- read.table(allfiles[1],sep = "\t",header = F,skip = 6)
tmp dim(tmp)
## [1] 60660 9
head(tmp)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 ENSG00000000003.15 TSPAN6 protein_coding 506 247 259 5.6293 1.8997
## 2 ENSG00000000005.6 TNMD protein_coding 0 0 0 0.0000 0.0000
## 3 ENSG00000000419.13 DPM1 protein_coding 1763 874 890 73.7093 24.8747
## 4 ENSG00000000457.14 SCYL3 protein_coding 490 432 422 3.5925 1.2124
## 5 ENSG00000000460.17 C1orf112 protein_coding 170 280 284 1.4370 0.4849
## 6 ENSG00000000938.13 FGR protein_coding 876 441 435 13.0710 4.4111
## V9
## 1 2.2155
## 2 0.0000
## 3 29.0094
## 4 1.4139
## 5 0.5655
## 6 5.1443
第4列是count值:
# counts
1:4,c(1,2,4)]
tmp[## V1 V2 V4
## 1 ENSG00000000003.15 TSPAN6 506
## 2 ENSG00000000005.6 TNMD 0
## 3 ENSG00000000419.13 DPM1 1763
## 4 ENSG00000000457.14 SCYL3 490
前两列是ensembl_id和HGNC_gene_symbol(参考:生信初学者最佳实践):
<- tmp[,1:2]
gene_ids colnames(gene_ids) <- c("ensembl_id","gene_name")
head(gene_ids)
## ensembl_id gene_name
## 1 ENSG00000000003.15 TSPAN6
## 2 ENSG00000000005.6 TNMD
## 3 ENSG00000000419.13 DPM1
## 4 ENSG00000000457.14 SCYL3
## 5 ENSG00000000460.17 C1orf112
## 6 ENSG00000000938.13 FGR
批量读取,并按列拼接在一起(所有样本的基因顺序都是一样的,每个文件的结构都是一样的,所以才可以直接拼接):
<- do.call(cbind, lapply(allfiles, function(x){
meso_expr <- read.table(x,sep = "\t",header = F,skip = 6)
tmp <- tmp[,4] # 选count,你想要tpm就选tpm那一列
tmp
}))
dim(meso_expr)
## [1] 60660 87
1:4,1:4]
meso_expr[## [,1] [,2] [,3] [,4]
## [1,] 506 332 1055 821
## [2,] 0 0 1 2
## [3,] 1763 1034 1999 1337
## [4,] 490 520 856 344
13.3 添加行名和列名
添加行名,就是基因名,我选择HGNC_gene_symbol:
rownames(meso_expr) <- tmp[,2]
列名比较复杂,我这里就先用文件名作为列名。
查看下文件和表达矩阵列的对应关系对不对:
2]
allfiles[## [1] "tcga_meso/gdc_download_20240915_113923.366242/03902ef2-8a01-4d14-ae8b-efd0734e712e/7a1d80d7-d1ee-4fc0-8299-cbf119263b52.rna_seq.augmented_star_gene_counts.tsv"
我们不需要前面的路径,只需要xxx.tsv这个名字即可,所以分割字符串:
<- strsplit(allfiles, "/")
row_names <- sapply(row_names,function(x){x[4]})
row_names
length(row_names)
## [1] 87
head(row_names)
## [1] "853dbd81-9079-4d4f-aa38-05b04b159667.rna_seq.augmented_star_gene_counts.tsv"
## [2] "7a1d80d7-d1ee-4fc0-8299-cbf119263b52.rna_seq.augmented_star_gene_counts.tsv"
## [3] "835763c1-a525-401c-bb56-90db5918a621.rna_seq.augmented_star_gene_counts.tsv"
## [4] "f8fae9b4-d6ff-4dc9-9018-15d91a8ed36e.rna_seq.augmented_star_gene_counts.tsv"
## [5] "11864629-bce9-4e38-9bd1-432df0a54e5b.rna_seq.augmented_star_gene_counts.tsv"
## [6] "c46e9df6-eba7-49c3-a339-2dc82e208a04.rna_seq.augmented_star_gene_counts.tsv"
添加行名即可:
colnames(meso_expr) <- row_names
1:2,1:2]
meso_expr[## 853dbd81-9079-4d4f-aa38-05b04b159667.rna_seq.augmented_star_gene_counts.tsv
## TSPAN6 506
## TNMD 0
## 7a1d80d7-d1ee-4fc0-8299-cbf119263b52.rna_seq.augmented_star_gene_counts.tsv
## TSPAN6 332
## TNMD 0
变成数据框结构,并对列排个顺序:
<- as.data.frame(meso_expr)
meso_expr <- meso_expr[,order(colnames(meso_expr))]
meso_expr head(colnames(meso_expr))
## [1] "02204e67-61ec-4407-a8f4-daab9082498e.rna_seq.augmented_star_gene_counts.tsv"
## [2] "068fb718-0419-4ae0-a4e4-df86e352a0ff.rna_seq.augmented_star_gene_counts.tsv"
## [3] "06923905-91b3-45c0-8c05-67ee1ca1fa65.rna_seq.augmented_star_gene_counts.tsv"
## [4] "0a4a1bec-4376-490b-9abb-cfeb9dd6c74b.rna_seq.augmented_star_gene_counts.tsv"
## [5] "0c7e4640-0092-4931-b84f-fd1bef907348.rna_seq.augmented_star_gene_counts.tsv"
## [6] "0cf30abd-2531-4c1e-998e-3c6b7a89e5b8.rna_seq.augmented_star_gene_counts.tsv"
13.4 获取样本名和文件名的对应关系
正经的TCGA表达矩阵的列名很明显不是这样的,而是TCGA-XX-XX-XX这种,所以我们需要给它改一下。
这个名字在哪里呢?就在我们下载的JSON文件里。
如何读取这种文件呢?搜索一下即可。
library(rjson)
# 读取下试试看
<- fromJSON(file = "tcga_meso/metadata.cart.2024-09-03.json") aa
找到文件名和样本名的对应关系:
1]]$file_name
aa[[## [1] "aed00751-a16b-4452-a69b-af47d910b5b5.rna_seq.augmented_star_gene_counts.tsv"
1]]$associated_entities[[1]]$entity_submitter_id
aa[[## [1] "TCGA-XT-AASU-01A-11R-A40A-07"
批量提取:
<- lapply(aa, function(x){
meta_data data.frame(file_name = x$file_name,
sample_id = x$associated_entities[[1]]$entity_submitter_id)
})
1]]
meta_data[[## file_name
## 1 aed00751-a16b-4452-a69b-af47d910b5b5.rna_seq.augmented_star_gene_counts.tsv
## sample_id
## 1 TCGA-XT-AASU-01A-11R-A40A-07
<- do.call(rbind, meta_data)
meta_data 1:2,]
meta_data[## file_name
## 1 aed00751-a16b-4452-a69b-af47d910b5b5.rna_seq.augmented_star_gene_counts.tsv
## 2 f1183765-f532-499a-aed8-fdd39156a67b.rna_seq.augmented_star_gene_counts.tsv
## sample_id
## 1 TCGA-XT-AASU-01A-11R-A40A-07
## 2 TCGA-TS-A7P1-01A-41R-A40A-07
也按照文件名排个序:
<- meta_data[order(meta_data$file_name),]
meta_data head(meta_data$file_name)
## [1] "02204e67-61ec-4407-a8f4-daab9082498e.rna_seq.augmented_star_gene_counts.tsv"
## [2] "068fb718-0419-4ae0-a4e4-df86e352a0ff.rna_seq.augmented_star_gene_counts.tsv"
## [3] "06923905-91b3-45c0-8c05-67ee1ca1fa65.rna_seq.augmented_star_gene_counts.tsv"
## [4] "0a4a1bec-4376-490b-9abb-cfeb9dd6c74b.rna_seq.augmented_star_gene_counts.tsv"
## [5] "0c7e4640-0092-4931-b84f-fd1bef907348.rna_seq.augmented_star_gene_counts.tsv"
## [6] "0cf30abd-2531-4c1e-998e-3c6b7a89e5b8.rna_seq.augmented_star_gene_counts.tsv"
查看这个顺序和表达矩阵的列的顺序是否完全一样:
identical(meta_data$file_name, colnames(meso_expr))
## [1] TRUE
直接替换列名即可:
colnames(meso_expr) <- meta_data$sample_id
1:4,1:4]
meso_expr[## TCGA-ZN-A9VO-01A-11R-A40A-07 TCGA-LK-A4O7-01A-11R-A34F-07
## TSPAN6 1068 1165
## TNMD 1 0
## DPM1 2599 1883
## SCYL3 660 531
## TCGA-MQ-A6BL-01A-11R-A34F-07 TCGA-TS-A8AI-01A-11R-A40A-07
## TSPAN6 1307 1242
## TNMD 0 1
## DPM1 1924 2263
## SCYL3 336 520
13.5 easyTCGA
后续问题?
临床信息? 临床信息和表达矩阵匹配?
直接用easyTCGA
。
library(easyTCGA)
1行代码搞定一切(详细用法请看视频教程):
# 对网络有要求
getmrnaexpr("TCGA-MESO")
=> Querying data.
'dbplyr':
Registered S3 methods overwritten by
method from
print.tbl_lazy
print.tbl_sql --------------------------------------
: Searching in GDC database
o GDCquery--------------------------------------
: hg38
Genome of reference--------------------------------------------
oo Accessing GDC. This might take a while...--------------------------------------------
: TCGA-MESO
ooo Project--------------------
oo Filtering results--------------------
ooo By data.type
ooo By workflow.type----------------
oo Checking data----------------
if there are duplicated cases
ooo Checking if there are results for the query
ooo Checking -------------------
o Preparing output-------------------
=> Downloading data.
for project TCGA-MESO
Downloading data 87 files. A total of 368.350266 MB
GDCdownload will download 1 of 1 (87 files, size = 368.350266 MB) as Thu_Sep_12_13_36_31_2024_0.tar.gz
Downloading chunk : 89 MB
Downloading=> Preparing SummarizedExperiment.
|=================================|100% Completed after 8 s
Starting to add information to samples=> Add clinical information to samples
=> Adding TCGA molecular information from marker papers
=> Information will have prefix 'paper_'
in SummarizedExperiment :
Available assays => unstranded
=> stranded_first
=> stranded_second
=> tpm_unstrand
=> fpkm_unstrand
=> fpkm_uq_unstrand
=> Saving file: output_mRNA_lncRNA_expr/TCGA-MESO_SummarizedExperiment.rdata
=> File saved
=> Preparing mRNA and lncRNA.
=> Successful.