Retrieves amino acid (AA) sequences from a GFF file, given a list of gene IDs.
PRaminoacidgetterASF.py — 1.5 KB
File contents
# This script retrieves amino acid sequences from a GFF file (in FASTA format),
# extracting only those sequences whose record ID contains a gene ID present in
# another, tab-separated file. The selected records are written to a new
# FASTA file.
#
# Edit the four settings below before running.

PR_FILE = "PRfile"  # tab-separated file in which one column contains gene IDs
GENE_ID_COLUMN = 1  # zero-based index of the column containing gene IDs
GFF_FILE = "GFFfile"  # should be an amino acid GFF file in FASTA format
OUTPUT_FILE = "newfile"  # change newfile to desired name


def read_gene_ids(path, column=GENE_ID_COLUMN):
    """Return the unique gene IDs found in one column of a tab-separated file.

    Args:
        path: Path of the tab-separated table.
        column: Zero-based index of the column that holds the gene IDs.

    Returns:
        A list of gene IDs in order of first appearance, without duplicates
        and without empty entries.

    Raises:
        ValueError: If a non-blank line does not have the requested column.
    """
    gene_ids = []
    seen = set()
    with open(path) as table:
        for line_number, line in enumerate(table, start=1):
            # Drop the line ending first: otherwise an ID taken from the last
            # column keeps its "\n" and can never match a record ID.
            line = line.rstrip("\r\n")
            if not line.strip():
                continue  # blank lines (e.g. at the end of the file) carry no ID
            fields = line.split("\t")
            if not -len(fields) <= column < len(fields):
                raise ValueError(
                    "{}: line {} has {} column(s); no column {}".format(
                        path, line_number, len(fields), column
                    )
                )
            gene_id = fields[column].strip()
            # An empty ID is a substring of every record ID and would select
            # the whole file, so it is ignored.
            if gene_id and gene_id not in seen:
                seen.add(gene_id)
                gene_ids.append(gene_id)
    return gene_ids


def select_records(records, gene_ids):
    """Return the records whose ``id`` contains at least one gene ID.

    Matching is by substring (``gene_id in record.id``), so a gene ID also
    selects records whose ID merely embeds it (a transcript suffix, a database
    prefix, ...). Each record is returned at most once, in input order, even
    when several gene IDs match it.

    Args:
        records: Iterable of objects with an ``id`` attribute, such as the
            records produced by ``SeqIO.parse``.
        gene_ids: Iterable of gene ID strings to look for.

    Returns:
        A list of the matching records.
    """
    gene_ids = list(gene_ids)  # iterated once per record, so it must be re-usable
    return [
        record
        for record in records
        if any(gene_id in record.id for gene_id in gene_ids)
    ]


def write_fasta(records, path):
    """Write *records* to *path*: a ``>id`` line, then the sequence on one line."""
    with open(path, "w") as output:
        for record in records:
            output.write(">" + record.id + "\n")  # makes ID FASTA format
            output.write(str(record.seq) + "\n")  # places sequence on line below


def main():
    """Filter GFF_FILE down to the gene IDs listed in PR_FILE; write OUTPUT_FILE."""
    # Biopython is only needed to parse the FASTA input; importing it here
    # keeps the helper functions above importable without it.
    from Bio import SeqIO  # package for dealing with FASTA format

    gene_ids = read_gene_ids(PR_FILE)
    with open(GFF_FILE) as aagff:
        # The parser reads lazily, so the selection must finish while the
        # input file is still open.
        matches = select_records(SeqIO.parse(aagff, "fasta"), gene_ids)
    # The output file is only created once the input has been read completely.
    write_fasta(matches, OUTPUT_FILE)


if __name__ == "__main__":
    main()