Gets start and stop positions of a gene locus from a GFF file given a Gene ID.

Python Source icon — Python Source, 1 KB (1440 bytes)

File contents

# This script retrieves the start and stop positions of genes from a GFF file,
# using the gene IDs of verified PR genes of interest as the query.
#
# Edit the constants below so they point at the real files, then run the
# script. The output file receives one line per matching gene:
#     gene ID <TAB> start <TAB> stop

PR_FILE = "PRfile"    # should be tab separated and contain the list of PR gene IDs
GFF_FILE = "GFFfile"  # can be downloaded from phytozome or genome database of interest
OUT_FILE = "newfile"  # new file with gene ID and start and stop positions

# Zero-based column holding the gene IDs in PR_FILE. Ensure that the numeral
# matches the appropriate column, and that the gene ID format matches the one
# used in the GFF file.
GENE_ID_COLUMN = 1

# Zero-based GFF columns read by this script.
GFF_TYPE = 2        # feature type (third column)
GFF_START = 3       # start position (fourth column)
GFF_STOP = 4        # stop position (fifth column)
GFF_ATTRIBUTES = 8  # attributes containing the gene ID (ninth column)


def _read_gene_ids(pr_path, id_column=GENE_ID_COLUMN):
    """Return the gene IDs listed in column *id_column* of the file *pr_path*.

    Every line is split on tabs. Lines with too few columns (including blank
    lines) are skipped instead of raising IndexError. IDs are stripped of
    surrounding whitespace so that a trailing newline (ID in the last column)
    does not become part of the ID. IDs are returned in file order and
    duplicates are kept.
    """
    gene_ids = []
    with open(pr_path) as pr_file:
        for line in pr_file:
            fields = line.rstrip("\n").split("\t")
            if len(fields) <= id_column:
                continue
            gene_id = fields[id_column].strip()
            # An empty ID is a substring of every attribute column and would
            # match every gene in the GFF file, so it is dropped here.
            if gene_id:
                gene_ids.append(gene_id)
    return gene_ids


def _find_gene_positions(gff_path, gene_ids):
    """Return (gene_id, start, stop) tuples for the GFF lines matching *gene_ids*.

    A line matches when its feature type contains "gene" (this prevents
    duplicates from transcript and polypeptide annotations) and a gene ID
    occurs as a substring of its ninth column. Start and stop are returned as
    the strings found in the file. Lines starting with "#" and lines with
    fewer than nine tab-separated columns are skipped. Results follow the
    order of the GFF file, then the order of *gene_ids*.
    """
    positions = []
    with open(gff_path) as gff_file:
        for line in gff_file:
            if line.startswith("#"):
                continue
            # Split once per line rather than once per gene ID.
            fields = line.rstrip("\n").split("\t")
            if len(fields) <= GFF_ATTRIBUTES:
                continue
            if "gene" not in fields[GFF_TYPE]:
                continue
            attributes = fields[GFF_ATTRIBUTES]
            # NOTE: substring matching is kept from the original script, so an
            # ID that is a prefix of a longer ID also matches the longer one.
            positions.extend(
                (gene_id, fields[GFF_START], fields[GFF_STOP])
                for gene_id in gene_ids
                if gene_id in attributes
            )
    return positions


def _write_positions(out_path, positions):
    """Write one "gene ID<TAB>start<TAB>stop" line per entry of *positions*."""
    with open(out_path, "w") as out_file:
        out_file.writelines("\t".join(record) + "\n" for record in positions)


def main(pr_path=PR_FILE, gff_path=GFF_FILE, out_path=OUT_FILE):
    """Look up the genes listed in *pr_path* in *gff_path* and write *out_path*.

    Returns the number of lines written to *out_path*.
    """
    gene_ids = _read_gene_ids(pr_path)
    positions = _find_gene_positions(gff_path, gene_ids)
    _write_positions(out_path, positions)
    return len(positions)


if __name__ == "__main__":
    main()