Gets start and stop positions of a gene locus from a GFF file given a Gene ID.
PRstartstopfinderASF.py — 1.4 KB
File contents
# This script is used to retrieve start and stop positions of genes from a GFF
# file, using gene IDs of verified PR genes of interest as a query.
#
# Files used (edit the constants below so they point at the real files):
#   PR_FILE  - tab-separated file containing the list of PR gene IDs
#   GFF_FILE - GFF annotation; can be downloaded from phytozome or the genome
#              database of interest
#   OUT_FILE - new tab-separated file with gene ID, start and stop positions

PR_FILE = "PRfile"
# NOTE: the original literal was "GFFfile'" (stray apostrophe inside the
# quotes); treated as a typo in the placeholder name and removed here.
GFF_FILE = "GFFfile"
OUT_FILE = "newfile"

# Zero-based column of the PR file that holds the gene IDs. Ensure that this
# numeral matches the appropriate column, and that the gene ID format matches
# the one used in the GFF file.
GENE_ID_COLUMN = 1

# Zero-based GFF columns read by this script.
GFF_TYPE_COLUMN = 2        # feature type (third column)
GFF_START_COLUMN = 3       # start position (fourth column)
GFF_STOP_COLUMN = 4        # stop position (fifth column)
GFF_ATTRIBUTES_COLUMN = 8  # attributes holding the gene ID (ninth column)
GFF_COLUMN_COUNT = 9


def read_gene_ids(lines, column=GENE_ID_COLUMN):
    """Return the gene IDs stored in one column of tab-separated lines.

    Args:
        lines: iterable of text lines (an open file works).
        column: zero-based index of the column holding the gene ID.

    Returns:
        List of gene IDs in input order, with surrounding whitespace and the
        line terminator removed. Blank lines and empty ID fields are skipped.

    Raises:
        IndexError: if a non-blank line has no such column.
    """
    gene_ids = []
    for line_number, line in enumerate(lines, 1):
        if not line.strip():
            continue  # e.g. an empty line at the end of the file
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) <= column:
            raise IndexError(
                "line {0} of the gene list has no column {1}: {2!r}".format(
                    line_number, column + 1, line
                )
            )
        # Strip so an ID in the last column does not keep its "\n"; otherwise
        # it could only match at the very end of a GFF attributes field.
        gene_id = fields[column].strip()
        # An empty ID would match every gene, because "" is a substring of
        # any attributes string.
        if gene_id:
            gene_ids.append(gene_id)
    return gene_ids


def find_gene_positions(gff_lines, gene_ids):
    """Collect (gene ID, start, stop) for every gene feature matching an ID.

    A feature matches when its type column contains "gene" (this prevents
    duplicates from transcript and polypeptide annotations) and its
    attributes column contains the gene ID as a substring.

    Args:
        gff_lines: iterable of GFF text lines (an open file works).
        gene_ids: sequence of gene IDs to look for.

    Returns:
        List of (gene_id, start, stop) tuples of strings, ordered by GFF line
        and then by position of the ID in gene_ids.
    """
    hits = []
    for line in gff_lines:
        # Header, directive and comment lines all start with "#". Skipping
        # them replaces the original unconditional skip of the first line,
        # which lost a feature when the file had no header.
        if line.startswith("#"):
            continue
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) < GFF_COLUMN_COUNT:
            continue  # blank line, embedded FASTA or otherwise not a feature
        if "gene" not in fields[GFF_TYPE_COLUMN]:
            continue
        attributes = fields[GFF_ATTRIBUTES_COLUMN]
        start = fields[GFF_START_COLUMN]
        stop = fields[GFF_STOP_COLUMN]
        for gene_id in gene_ids:
            # NOTE(review): substring match, as in the original; an ID that is
            # a prefix of a longer ID will match that gene too.
            if gene_id in attributes:
                hits.append((gene_id, start, stop))  # saves gene ID again
    return hits


def write_positions(rows, out_path):
    """Write one "gene ID<TAB>start<TAB>stop" line per row to out_path."""
    with open(out_path, "w") as handle:
        handle.writelines("\t".join(row) + "\n" for row in rows)


def main(pr_path=PR_FILE, gff_path=GFF_FILE, out_path=OUT_FILE):
    """Read the gene list, scan the GFF file and write the positions file."""
    # "with" closes every file even if parsing fails part-way through.
    with open(pr_path) as pr_handle:
        gene_ids = read_gene_ids(pr_handle)
    with open(gff_path) as gff_handle:
        hits = find_gene_positions(gff_handle, gene_ids)
    write_positions(hits, out_path)


if __name__ == "__main__":
    main()