Team:TU-Eindhoven/Code:PDBQuery

From 2013.igem.org

Contents

PDB Querying

General Information

Dependencies Python, numpy, pylab, wget
License GPL
Example Usage python queryPDB.py -q query.xml -f 10
Output freq.csv, which is a comma seperated file containing various properties of the different proteins. The most important value is the frequency of the aminoacids Lysing (K) or Arginine (R). Which on is taken can be changed in the source code of queryPDB.py changing the K in the following line: freq = L.getFrequencies(proteins, 'K', 'freq.csv')

query.xml

<orgPdbQuery>
    <version>head</version>
    <queryType>org.pdb.query.simple.ChainTypeQuery</queryType>
    <description>Chain Type: there is a Protein chain</description>
    <containsProtein>Y</containsProtein>
    <containsDna>?</containsDna>
    <containsRna>?</containsRna>
    <containsHybrid>?</containsHybrid>
 </orgPdbQuery>

PDB.py

#!/usr/bin/env python

"""
Module to read and process PDB data. (http://www.pdb.org)

@author: Pascal Pieters
@copyright: Copyright 2013, TU-Eindhoven iGEM Team
@credits: TU-Eindhoven iGEM Team 2013
@license: GPL
@version: 0.0.1
@maintainer: Pascal Pieters
@email: p.a.pieters@student.tue.nl
@status: Testing
"""

import urllib2
import os
import pylab as P
import numpy as N
import csv
import shutil

class RemoteInstance:
	"""General PDB RemoteInstance that can be used to retrieve PDB data"""
	def __init__(self):
		"""Initiate the RemoteInstance"""
		self.searchURL = "http://www.rcsb.org/pdb/rest/search"
		self.query = ""
		self.result = ""
		return
	
	def loadQuery(self, filename):
		f = open(filename, 'r')
		self.query = ''.join(f.readlines())
		f.close()
		return
	
	def setQuery(self,q):
		self.query = q
		return
	
	def executeQuery(self):
		req = urllib2.Request(self.searchURL, data=self.query)
		r = urllib2.urlopen(req)
		self.result = r.read()
		self.result = self.result.split('\n')[0:-1]
		return len(self.result)
	
	def getBasicInformation(self, filename):
		u = ["http://www.rcsb.org/pdb/rest/customReport?pdbids=", "&customReportColumns=structureId,chainId,entityId,sequence,chainLength,db_id,db_name,molecularWeight,secondaryStructure,entityMacromoleculeType&service=wsdisplay&format=csv"]
		ft = open(filename, 'w')
		
		for i in range(0, len(self.result), 100):
			print "# Downloading %d-%d" % (i, i+100)
			url = u[0] + (', '.join((self.result[i:i+100] if i+100 < len(self.result) else self.result[i:]))) + u[1]
			os.system(("wget -q -O out.csv \"%s\"" % url))
		
			f = open('out.csv', 'r')
			data = ''.join(f.readlines())
			print "# - Data Size: %d" % len(data)
			f.close()
			os.remove('out.csv')
		
			data = data.replace('<br />', '\n')
			if i > 0:
				[temp, data] = data.split('\n',1)
			ft.write(data)
		ft.close()
		return
	

class LocalInstance:
	def __init__(self, path):
		self.filename = path
		return
	
	def filterBasics(self, minChainLength=1):
		fin = open(self.filename, 'r')
		fout = open("temp.csv", 'w')
		
		dh = ['chainLength']
		headertext = fin.readline()
		fout.write(headertext)
		header = headertext.strip().split(',')
		d = {i:header.index(i) for i in dh}
		print "Chain Length: %d" % tuple(d.values())
				
		for line in fin:
			l = line.strip().split(',')
			l = [i.strip('"') for i in l]
			if int(l[d['chainLength']]) > minChainLength:
				fout.write(line)
		
		fout.close()
		fin.close()
		os.remove(self.filename)
		shutil.move("temp.csv", self.filename)
		return
	
	def getProteinSequences(self):
		f = open(self.filename, 'r')
		p = {}
		
		dh = ['structureId', 'chainId', 'entityId', 'sequence', 'entityMacromoleculeType']
		header = f.readline().strip().split(',')
		d = {i:header.index(i) for i in dh}
		print "# Parsing '%s'\n# - Header: " % (self.filename),
		for i in d.keys():
			print "%s: %d" % (i, d[i]),
		print " "		

		for line in f:
			l = line.strip().split(',')
			l = [i.strip('"') for i in l]
			if 'Polypeptide' in l[d['entityMacromoleculeType']]:
				p[(l[d['structureId']], l[d['chainId']], int(l[d['entityId']]))] = l[d['sequence']]
		
		f.close()
		return p
	
	def getFrequencies(self, p, aa, filename):
		f = {}
		max_cnt = 0; max_rat = 0
		for k, v in p.iteritems():
			cnt = v.count(aa)
			lev = len(v)
			rat = float(cnt)/float(lev)
			f[k] = (cnt, lev, rat)
			max_cnt = max(max_cnt, cnt)
			max_rat = max(max_rat, rat)
		
		f = {k:(v[0], v[1], v[2], (float(v[0])/float(max_cnt)), (float(v[2])/float(max_rat)), (((float(v[0])/float(max_cnt))**2)+((float(v[2])/float(max_rat))**2))) for k,v in f.iteritems()}
		
		self.writeToCSV(filename, ['structureId', 'chainId', 'entityId', 'lysineCount', 'chainLength', 'lysineRatio', 'normalizedLysineCount', 'normalizedLysineRatio', 'suitabilityIndexA'], [[k[0], k[1], k[2], v[0], v[1], v[2], v[3], v[4], v[5]] for k,v in f.iteritems()])

		return f
	
	def plotFrequencies(self, f):
		x = []; y = []; d = []; keys = []
		for k,v in f.iteritems():
			x.append(v[2])
			y.append(float(v[0]))
			d.append(v[5])
			keys.append(k)
		
		npd = N.array(d)
		r_mean = N.mean(npd)
		r_std = N.std(npd)
		del npd
		
		xs = []; ys = []; ds = []; keyss = []
		t = []
		i = 0;
		while i < len(x):
			if d[i] > (r_mean+(3*r_std)):
				ds.append(d.pop(i))
				xs.append(x.pop(i))
				ys.append(y.pop(i))
				keyss.append(keys.pop(i))
				t.append((xs[-1], ys[-1], ds[-1], keyss[-1]))
			else:
				i += 1
		del i
		
		fig = P.figure(1)
		ax = fig.add_subplot(111)
		
		P.scatter(x,y)
		P.scatter(xs, ys, c='r')
		
		P.ylabel('Lysine Count')
		P.xlabel('Lysine Ratio')
		
		t = sorted(t, key=lambda st: st[2])
		
		P.show()
	
	def writeToCSV(self, filename, header, data):
		with open(filename, 'wb') as csvfile:
			cwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
			cwriter.writerow(header)
			for i in data:
				cwriter.writerow(i)
		return

queryPDB.py

#!/usr/bin/env python

"""
Script to execute a simple PDB query

@author: Pascal Pieters
@copyright: Copyright 2013, TU-Eindhoven iGEM Team
@credits: TU-Eindhoven iGEM Team 2013
@license: GPL
@version: 0.0.1
@maintainer: Pascal Pieters
@email: p.a.pieters@student.tue.nl
@status: Testing
"""

import PDB
import getopt, sys

if __name__ == "__main__":
	opts, args = getopt.getopt(sys.argv[1:], "q:f:pv", ["query=", "filterlength=", "plot"])
	
	query = ""
	filterlength = 0
	plot = False
	verbose = False
	
	for o, a in opts:
		if o == "-v":
			verbose = True
		elif o in ("-q", "--query"):
			query = (a if a else "query.xml") 
		elif o in ("-f", "--filterlength"):
			filterlength = (int(a) if a else 10)
		elif o in ("-p", "--plot"):
			plot = True
		else:
			assert False, "unhandled option"
	
	if query:
		P = PDB.RemoteInstance()
		P.loadQuery("query.xml")
		if verbose: print "Structures Found: ", P.executeQuery()
		else: P.executeQuery()
		P.getBasicInformation("basic.csv")
	
	if filterlength or plot:
		L = PDB.LocalInstance("basic.csv")
	if filterlength:
		L.filterBasics(minChainLength=filterlength)
	if plot:
		proteins = L.getProteinSequences()
		if verbose:
			for k,v in proteins.iteritems():
				print k, ": ", v
		freq = L.getFrequencies(proteins, 'K', 'freq.csv')
		L.plotFrequencies(freq)