-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmake.coding.genes.R
executable file
·36 lines (26 loc) · 1.08 KB
/
make.coding.genes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env Rscript
argv <- commandArgs(trailingOnly = TRUE)
if(length(argv) < 3) {
q()
}
options(stringsAsFactors = FALSE)
library(dplyr)
library(rtracklayer)
## select coding genes in GTF
gtf.file <- argv[1] # e.g., gtf.file <- 'rawdata/references/gencode.v26.GRCh38.genes.gtf'
data.gene.file <- argv[2] # e.g., data.gene.file <- 'data/rnaseq.genes.txt.gz'
out.file <- argv[3]
gtf.tab <- readGFF(gtf.file, tags = c('gene_id', 'gene_name', 'transcript_name', 'gene_type'))
coding.genes <- gtf.tab %>% mutate(chr = seqid, ensg = gene_id) %>%
filter(gene_type == 'protein_coding', type == 'transcript') %>%
select(chr, start, end, strand, ensg, gene_name) %>%
arrange(chr, start)
data.genes <- read.table(data.gene.file)
colnames(data.genes) <- c('ensg', 'data.loc')
chr.names <- paste('chr',c(1:22),sep='')
out.tab <- coding.genes %>%
left_join(data.genes, by = 'ensg') %>%
na.omit() %>%
filter(chr %in% chr.names)
write.table(out.tab, file = gzfile(out.file), sep = '\t',
quote = FALSE, row.names = FALSE, col.names = FALSE)