Share

PRaminoacidgetterASF.py

Retrieves AA sequence from GFF file given gene id.

Python Source icon PRaminoacidgetterASF.py — Python Source, 1 KB (1558 bytes)

File contents

#This script retrieves amino acid sequences from a GFF file, extracting only those sequences whose gene IDs are present in another file

from Bio import SeqIO #Imports a package for dealing with FASTA format

# Read the gene IDs of interest from a tab-separated file.
# "PRfile" is a placeholder: replace it with the path to a tab-separated file
# in which one column contains gene IDs.
PR_ID_COLUMN = 1  # zero-based index of the gene ID column; change to match your file
PRIDlist = []  # gene IDs collected from the file, in file order

with open("PRfile") as PRlist:  # the with-block closes the file once it has been read
    for line in PRlist:
        # Drop the line terminator before splitting; otherwise an ID sitting in
        # the last column keeps its trailing newline and can never match.
        words = line.rstrip("\r\n").split("\t")
        if words == [""]:
            continue  # skip blank lines (e.g. an empty line at the end of the file)
        gene_id = words[PR_ID_COLUMN]  # still raises IndexError on a line with too few columns
        if gene_id:
            # Empty fields are skipped: the later search is a substring test,
            # and an empty string is a substring of every sequence ID.
            PRIDlist.append(gene_id)


# Scan the sequence file and keep every record whose ID contains one of the
# gene IDs in PRIDlist.
# "GFFfile" is a placeholder: replace it with the path to the amino acid
# sequence file, which must be in FASTA format (it is parsed as "fasta" below).
GFFIDlist = []  # IDs of the matching records
seqlist = []  # sequences of the matching records, parallel to GFFIDlist

with open("GFFfile") as aagff:  # the with-block closes the file after parsing
    for seq_record in SeqIO.parse(aagff, "fasta"):
        # This is a substring test, so a listed ID also matches any longer
        # record ID that contains it. any() stops at the first hit, so a record
        # that matches several listed IDs is stored once, not once per match.
        if any(gene_id in seq_record.id for gene_id in PRIDlist):
            GFFIDlist.append(seq_record.id)
            seqlist.append(seq_record.seq)

# Write the selected records to a new file in FASTA format: a ">"-prefixed ID
# line followed by the sequence on the next line.
# "newfile" is a placeholder: change it to the desired output name.
with open("newfile", "w") as PRaaseqlist:  # closed even if a write fails part-way
    # GFFIDlist and seqlist are parallel lists, so walk them in lockstep.
    for gene_id, sequence in zip(GFFIDlist, seqlist):
        PRaaseqlist.write(">" + gene_id + "\n")
        PRaaseqlist.write(str(sequence) + "\n")