Sorts the output of an NCBI Conserved Domain search (http://www.ncbi.nlm.nih.gov/Structure/cdd/wrpsb.cgi) to group all of the domains in a PR gene with its Gene ID.
PRdomainsorterASF.py
— 1.7 KB
File contents
# Condenses the list of domains outputted by CD-Search and combines it with
# the E-values from a BLAST search.
#
# Inputs (both tab separated):
#   "PRfile"     - gene ID in the second column, BLAST E-value in the third.
#   "domainfile" - data outputted from CD-Search, with headspace removed;
#                  gene ID in the first column, domain in the ninth.
# Output:
#   "newfile"    - one row per PR gene: gene ID, E-value, and the list of all
#                  domains detected for that gene.

PR_FILE = "PRfile"  # ensure gene ID is column 2 and E-value is column 3
DOMAIN_FILE = "domainfile"  # ensure gene ID is column 1 and domain is column 9
OUTPUT_FILE = "newfile"  # change 'newfile' to desired filename


def read_pr_genes(lines):
    """Parse the PR gene table into parallel lists of gene IDs and E-values.

    ``lines`` is any iterable of tab-separated text lines (an open file
    works).  The gene ID is read from the second column and the E-value from
    the third.  Completely empty lines (e.g. a trailing blank line) are
    skipped; a non-empty line with fewer than three columns raises
    ``IndexError``.

    Returns a ``(gene_ids, evalues)`` tuple of equally long lists of strings,
    in input order.
    """
    gene_ids = []
    evalues = []
    for line in lines:
        line = line.rstrip("\n")
        if not line:
            continue
        fields = line.split("\t")
        gene_ids.append(fields[1])
        evalues.append(fields[2])
    return gene_ids, evalues


def collect_domains(lines, gene_ids):
    """Group the domains found in the CD-Search rows by PR gene ID.

    ``lines`` is any iterable of tab-separated text lines whose first column
    is a gene ID and whose ninth column is the detected domain.  Rows whose
    gene ID is not in ``gene_ids`` are ignored, so they may be shorter than
    nine columns; a matching row with fewer than nine columns raises
    ``IndexError``.

    Returns a dict mapping every gene ID in ``gene_ids`` to the list of its
    domains in input order (an empty list when none were detected).
    """
    # A dict lookup per row replaces rescanning the whole gene list for
    # every domain row.
    domains_by_gene = {gene_id: [] for gene_id in gene_ids}
    for line in lines:
        # Strip the newline first so a domain in the last column is stored
        # without a trailing "\n".
        fields = line.rstrip("\n").split("\t")
        hits = domains_by_gene.get(fields[0])
        if hits is not None:
            hits.append(fields[8])
    return domains_by_gene


def format_rows(gene_ids, evalues, domains_by_gene):
    """Build the output rows: gene ID, E-value and domain list, tab separated.

    One newline-terminated row is produced per entry of ``gene_ids``, in
    order; the domain list is written with ``str()`` of the Python list.
    """
    return [
        gene_id + "\t" + evalue + "\t" + str(domains_by_gene[gene_id]) + "\n"
        for gene_id, evalue in zip(gene_ids, evalues)
    ]


def main():
    """Read PR_FILE and DOMAIN_FILE and write the condensed OUTPUT_FILE."""
    # 'with' closes each file even when parsing fails part-way through.
    with open(PR_FILE) as pr_handle:
        gene_ids, evalues = read_pr_genes(pr_handle)

    # Every time a gene ID in the CD-Search output matches a gene ID in the
    # PR file, the detected domain is added to that gene's list.
    with open(DOMAIN_FILE) as domain_handle:
        domains_by_gene = collect_domains(domain_handle, gene_ids)

    # Create the new file containing gene ID, E-value from the BLAST search,
    # and a list of all domains detected for each PR gene.
    with open(OUTPUT_FILE, "w") as out_handle:
        out_handle.writelines(format_rows(gene_ids, evalues, domains_by_gene))


if __name__ == "__main__":
    main()