#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
The scikit-ribo is design for analysis of yeast data, which does not have UTR annotation.
In order to run with other genomes, the boundaries of exons in gtf should be confined to CDS regions (including stop codons).
This script merges the start and stop position of the exon with the start and the stop codon of the CDS.
"""
import os
import argparse
import subprocess
from gtfparse import read_gtf
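# Note: gtfparse is a third-party dependency (pip install gtfparse); read_gtf() parses the GTF
# into a DataFrame whose columns include the standard GTF fields plus expanded attributes
# such as transcript_id, which is how it is used below.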

def main(args):
    # Read the gtf file
    df = read_gtf(args.gtf_file)
    file_name = str(args.gtf_file)[:-4]  # input path without the ".gtf" extension
    list_transcripts_ID = set(list(df['transcript_id']))
    dic_start_codons_pos = {}
    dic_stop_codons_pos = {}
    dic_strand = {}
    # Write a file listing the principal isoforms, filtering out every transcript
    # that has more than one annotated start/stop codon or no start/stop codon at all.
    outfile = open(file_name + ".txt", 'w')
    for ID in list_transcripts_ID:
        # Find all the features associated with a single transcript
        df_for_transcript_ID = df[df['transcript_id'] == ID]
        # Find the position of the start codon, the stop codon, and the strand of the transcript
        start_codon_feature = df_for_transcript_ID[df_for_transcript_ID['feature'] == 'start_codon']
        stop_codon_feature = df_for_transcript_ID[df_for_transcript_ID['feature'] == 'stop_codon']
        # Skip transcripts without a start codon or with more than one start codon
        if start_codon_feature.shape[0] == 0 or start_codon_feature.shape[0] > 1:
            continue
        # Skip transcripts without a stop codon or with more than one stop codon
        if stop_codon_feature.shape[0] == 0 or stop_codon_feature.shape[0] > 1:
            continue
        strand = str(start_codon_feature['strand'].values[0])
        outfile.write("%s\n" % (ID))
        # Positive strand: the start codon begins at 'start', the stop codon ends at 'end'
        if strand == '+':
            dic_start_codons_pos[ID] = int(start_codon_feature['start'].values[0])
            dic_stop_codons_pos[ID] = int(stop_codon_feature['end'].values[0])
            dic_strand[ID] = strand
        # Negative strand: coordinates are mirrored, so the start codon ends at 'end'
        # and the stop codon begins at 'start'
        if strand == '-':
            dic_start_codons_pos[ID] = int(start_codon_feature['end'].values[0])
            dic_stop_codons_pos[ID] = int(stop_codon_feature['start'].values[0])
            dic_strand[ID] = strand
    outfile.close()
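    # <file_name>.txt now contains one retained transcript ID per line; it is used
    # as a pattern file for grep below to subset the original GTF.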
    # Create a new gtf file restricted to the filtered principal isoforms
    subprocess.call("grep -f " + file_name + ".txt " + args.gtf_file + " > " + file_name + "_complete.gtf", shell=True)
    # Write a file with the corrected positions for each feature.
    # The input here is the subset GTF generated by the grep call above.
    # To run scikit-ribo on multicellular organisms we need to clip every region
    # that falls outside the start and the stop codon.
    # The output has two columns: corrected start position, corrected end position.
    infile = open(file_name + "_complete.gtf", 'r')
    outfile = open('correct_positions.txt', 'w')
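    # GTF lines are tab-separated with nine columns: seqname, source, feature, start, end,
    # score, strand, frame, attributes. correct_positions.txt ends up with one line per
    # GTF line, e.g. "65565<TAB>70005" (illustrative coordinates, not from a real annotation).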
    for line in infile:
        # Columns are tab-separated; transcript_id is the second attribute in column 9
        transcript_ID = line.split('\t')[8].split('; ')[1].split(' ')[1][1:-1]
        # Positive strand (+)
        if dic_strand[transcript_ID] == '+':
            feature_start = int(line.split('\t')[3])
            feature_end = int(line.split('\t')[4])
            # Feature start < start codon
            if feature_start < dic_start_codons_pos[transcript_ID]:
                new_start = dic_start_codons_pos[transcript_ID]
            # Feature start > stop codon
            elif feature_start > dic_stop_codons_pos[transcript_ID]:
                new_start = dic_stop_codons_pos[transcript_ID]
            else:
                new_start = feature_start
            # Feature end < start codon
            if feature_end < dic_start_codons_pos[transcript_ID]:
                new_end = dic_start_codons_pos[transcript_ID]
            # Feature end > stop codon
            elif feature_end > dic_stop_codons_pos[transcript_ID]:
                new_end = dic_stop_codons_pos[transcript_ID]
            else:
                new_end = feature_end
            # Tab-separated so the awk commands below can read the columns with FS='\t'
            outfile.write("%s\t%s\n" % (new_start, new_end))
        # Negative strand (-)
        if dic_strand[transcript_ID] == '-':
            # On the negative strand column 5 is the biological start and column 4 the biological end
            feature_start = int(line.split('\t')[4])
            feature_end = int(line.split('\t')[3])
            # Feature start > start codon
            if feature_start > dic_start_codons_pos[transcript_ID]:
                new_start = dic_start_codons_pos[transcript_ID]
            # Feature start < stop codon
            elif feature_start < dic_stop_codons_pos[transcript_ID]:
                new_start = dic_stop_codons_pos[transcript_ID]
            else:
                new_start = feature_start
            # Feature end > start codon
            if feature_end > dic_start_codons_pos[transcript_ID]:
                new_end = dic_start_codons_pos[transcript_ID]
            # Feature end < stop codon
            elif feature_end < dic_stop_codons_pos[transcript_ID]:
                new_end = dic_stop_codons_pos[transcript_ID]
            else:
                new_end = feature_end
            # Written back in genomic order (column 4, column 5)
            outfile.write("%s\t%s\n" % (new_end, new_start))
    infile.close()
    outfile.close()
    # Replace column 4 (start) with the corrected start positions using awk (OFS='\t' keeps the output tab-separated)
    subprocess.call("awk -F'\t' 'FNR==NR{a[NR]=$1;next}{$4=a[FNR]}1' OFS='\t' correct_positions.txt " + file_name + "_complete.gtf" + " > " + file_name + "_corrected_temp_1.gtf", shell=True)
    # Replace column 5 (end) with the corrected end positions using awk (OFS='\t' keeps the output tab-separated)
    subprocess.call("awk -F'\t' 'FNR==NR{a[NR]=$2;next}{$5=a[FNR]}1' OFS='\t' correct_positions.txt " + file_name + "_corrected_temp_1.gtf" + " > " + file_name + "_corrected_temp_2.gtf", shell=True)
    # Remove all the features that do not overlap the coding region (start == end after clipping)
    subprocess.call("awk '$4!=$5' " + file_name + "_corrected_temp_2.gtf" + " > " + file_name + "_corrected_temp_3.gtf", shell=True)
    # Remove all the 'transcript' and 'UTR' features
    subprocess.call("""awk 'BEGIN { OFS=FS="\t" } $3 !~ /^(transcript|UTR)/' """ + file_name + """_corrected_temp_3.gtf > """ + file_name + """_corrected_temp_4.gtf""", shell=True)
    # Drop every attribute in the 9th column after gene_id and transcript_id to reduce memory usage
    subprocess.call("""awk 'BEGIN{FS=OFS="\t"} {sub(/gene_type.*/,"",$9)} 1' """ + file_name + """_corrected_temp_4.gtf > """ + file_name + """.corrected_annotation.gtf""", shell=True)
    # Remove the intermediate files
    os.remove("correct_positions.txt")
    os.remove(file_name + "_corrected_temp_1.gtf")
    os.remove(file_name + "_corrected_temp_2.gtf")
    os.remove(file_name + "_corrected_temp_3.gtf")
    os.remove(file_name + "_corrected_temp_4.gtf")

# Standard boilerplate to call the main() function to begin the program.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Remove UTR regions from ORFs in a GTF file, keeping only features that overlap coding regions.")
    parser.add_argument("gtf_file",
                        help="Source GTF file",
                        metavar="FILE")
    args = parser.parse_args()
    main(args)