Team:TU-Eindhoven/Code:PDBQuery

From 2013.igem.org

(Difference between revisions)
(PDB Querying)
Line 13: Line 13:
     <containsHybrid>?</containsHybrid>
     <containsHybrid>?</containsHybrid>
  </orgPdbQuery>
  </orgPdbQuery>
 +
</nowiki>{{:Team:TU-Eindhoven/Template:CodeEnd }}
 +
===PDB.py===
 +
{{:Team:TU-Eindhoven/Template:Code }}<nowiki>
 +
#!/usr/bin/env python
 +
 +
"""
 +
Module to read and process PDB data. (http://www.pdb.org)
 +
 +
@author: Pascal Pieters
 +
@copyright: Copyright 2013, TU-Eindhoven iGEM Team
 +
@credits: TU-Eindhoven iGEM Team 2013
 +
@license: GPL
 +
@version: 0.0.1
 +
@maintainer: Pascal Pieters
 +
@email: p.a.pieters@student.tue.nl
 +
@status: Testing
 +
"""
 +
 +
import urllib2
 +
import os
 +
import pylab as P
 +
import numpy as N
 +
import csv
 +
import shutil
 +
 +
class RemoteInstance:
 +
"""General PDB RemoteInstance that can be used to retrieve PDB data"""
 +
def __init__(self):
 +
"""Initiate the RemoteInstance"""
 +
self.searchURL = "http://www.rcsb.org/pdb/rest/search"
 +
self.query = ""
 +
self.result = ""
 +
return
 +
 +
def loadQuery(self, filename):
 +
f = open(filename, 'r')
 +
self.query = ''.join(f.readlines())
 +
f.close()
 +
return
 +
 +
def setQuery(self,q):
 +
self.query = q
 +
return
 +
 +
def executeQuery(self):
 +
req = urllib2.Request(self.searchURL, data=self.query)
 +
r = urllib2.urlopen(req)
 +
self.result = r.read()
 +
self.result = self.result.split('\n')[0:-1]
 +
return len(self.result)
 +
 +
def getBasicInformation(self, filename):
 +
u = ["http://www.rcsb.org/pdb/rest/customReport?pdbids=", "&customReportColumns=structureId,chainId,entityId,sequence,chainLength,db_id,db_name,molecularWeight,secondaryStructure,entityMacromoleculeType&service=wsdisplay&format=csv"]
 +
ft = open(filename, 'w')
 +
 +
for i in range(0, len(self.result), 100):
 +
print "# Downloading %d-%d" % (i, i+100)
 +
url = u[0] + (', '.join((self.result[i:i+100] if i+100 < len(self.result) else self.result[i:]))) + u[1]
 +
os.system(("wget -q -O out.csv \"%s\"" % url))
 +
 +
f = open('out.csv', 'r')
 +
data = ''.join(f.readlines())
 +
print "# - Data Size: %d" % len(data)
 +
f.close()
 +
os.remove('out.csv')
 +
 +
data = data.replace('<br />', '\n')
 +
if i > 0:
 +
[temp, data] = data.split('\n',1)
 +
ft.write(data)
 +
ft.close()
 +
return
 +
 +
 +
class LocalInstance:
 +
def __init__(self, path):
 +
self.filename = path
 +
return
 +
 +
def filterBasics(self, minChainLength=1):
 +
fin = open(self.filename, 'r')
 +
fout = open("temp.csv", 'w')
 +
 +
dh = ['chainLength']
 +
headertext = fin.readline()
 +
fout.write(headertext)
 +
header = headertext.strip().split(',')
 +
d = {i:header.index(i) for i in dh}
 +
print "Chain Length: %d" % tuple(d.values())
 +
 +
for line in fin:
 +
l = line.strip().split(',')
 +
l = [i.strip('"') for i in l]
 +
if int(l[d['chainLength']]) > minChainLength:
 +
fout.write(line)
 +
 +
fout.close()
 +
fin.close()
 +
os.remove(self.filename)
 +
shutil.move("temp.csv", self.filename)
 +
return
 +
 +
def getProteinSequences(self):
 +
f = open(self.filename, 'r')
 +
p = {}
 +
 +
dh = ['structureId', 'chainId', 'entityId', 'sequence', 'entityMacromoleculeType']
 +
header = f.readline().strip().split(',')
 +
d = {i:header.index(i) for i in dh}
 +
print "# Parsing '%s'\n# - Header: " % (self.filename),
 +
for i in d.keys():
 +
print "%s: %d" % (i, d[i]),
 +
print " "
 +
 +
for line in f:
 +
l = line.strip().split(',')
 +
l = [i.strip('"') for i in l]
 +
if 'Polypeptide' in l[d['entityMacromoleculeType']]:
 +
p[(l[d['structureId']], l[d['chainId']], int(l[d['entityId']]))] = l[d['sequence']]
 +
 +
f.close()
 +
return p
 +
 +
def getFrequencies(self, p, aa, filename):
 +
f = {}
 +
max_cnt = 0; max_rat = 0
 +
for k, v in p.iteritems():
 +
cnt = v.count(aa)
 +
lev = len(v)
 +
rat = float(cnt)/float(lev)
 +
f[k] = (cnt, lev, rat)
 +
max_cnt = max(max_cnt, cnt)
 +
max_rat = max(max_rat, rat)
 +
 +
f = {k:(v[0], v[1], v[2], (float(v[0])/float(max_cnt)), (float(v[2])/float(max_rat)), (((float(v[0])/float(max_cnt))**2)+((float(v[2])/float(max_rat))**2))) for k,v in f.iteritems()}
 +
 +
self.writeToCSV(filename, ['structureId', 'chainId', 'entityId', 'lysineCount', 'chainLength', 'lysineRatio', 'normalizedLysineCount', 'normalizedLysineRatio', 'suitabilityIndexA'], [[k[0], k[1], k[2], v[0], v[1], v[2], v[3], v[4], v[5]] for k,v in f.iteritems()])
 +
 +
return f
 +
 +
def plotFrequencies(self, f):
 +
x = []; y = []; d = []; keys = []
 +
for k,v in f.iteritems():
 +
x.append(v[2])
 +
y.append(float(v[0]))
 +
d.append(v[5])
 +
keys.append(k)
 +
 +
npd = N.array(d)
 +
r_mean = N.mean(npd)
 +
r_std = N.std(npd)
 +
del npd
 +
 +
xs = []; ys = []; ds = []; keyss = []
 +
t = []
 +
i = 0;
 +
while i < len(x):
 +
if d[i] > (r_mean+(3*r_std)):
 +
ds.append(d.pop(i))
 +
xs.append(x.pop(i))
 +
ys.append(y.pop(i))
 +
keyss.append(keys.pop(i))
 +
t.append((xs[-1], ys[-1], ds[-1], keyss[-1]))
 +
else:
 +
i += 1
 +
del i
 +
 +
fig = P.figure(1)
 +
ax = fig.add_subplot(111)
 +
 +
P.scatter(x,y)
 +
P.scatter(xs, ys, c='r')
 +
 +
P.ylabel('Lysine Count')
 +
P.xlabel('Lysine Ratio')
 +
 +
t = sorted(t, key=lambda st: st[2])
 +
 +
#for i in t:
 +
#ax.annotate(keyss[i], xy=(xs[i], ys[i]), xycoords='data', size=8)
 +
# print i[3], ": Lysine Ratio=", i[0], ", Lysine Count=", i[1], ", Suitability Index A=", i[2]
 +
 +
P.show()
 +
 +
def writeToCSV(self, filename, header, data):
 +
with open(filename, 'wb') as csvfile:
 +
cwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
 +
cwriter.writerow(header)
 +
for i in data:
 +
cwriter.writerow(i)
 +
return
</nowiki>{{:Team:TU-Eindhoven/Template:CodeEnd }}
</nowiki>{{:Team:TU-Eindhoven/Template:CodeEnd }}

Revision as of 12:05, 2 July 2013

PDB Querying

query.xml

<orgPdbQuery>
    <version>head</version>
    <queryType>org.pdb.query.simple.ChainTypeQuery</queryType>
    <description>Chain Type: there is a Protein chain</description>
    <containsProtein>Y</containsProtein>
    <containsDna>?</containsDna>
    <containsRna>?</containsRna>
    <containsHybrid>?</containsHybrid>
 </orgPdbQuery>

PDB.py

#!/usr/bin/env python

"""
Module to read and process PDB data. (http://www.pdb.org)

@author: Pascal Pieters
@copyright: Copyright 2013, TU-Eindhoven iGEM Team
@credits: TU-Eindhoven iGEM Team 2013
@license: GPL
@version: 0.0.1
@maintainer: Pascal Pieters
@email: p.a.pieters@student.tue.nl
@status: Testing
"""

import urllib2
import os
import pylab as P
import numpy as N
import csv
import shutil

class RemoteInstance:
	"""General PDB RemoteInstance that can be used to retrieve PDB data"""
	def __init__(self):
		"""Initiate the RemoteInstance"""
		self.searchURL = "http://www.rcsb.org/pdb/rest/search"
		self.query = ""
		self.result = ""
		return
	
	def loadQuery(self, filename):
		f = open(filename, 'r')
		self.query = ''.join(f.readlines())
		f.close()
		return
	
	def setQuery(self,q):
		self.query = q
		return
	
	def executeQuery(self):
		req = urllib2.Request(self.searchURL, data=self.query)
		r = urllib2.urlopen(req)
		self.result = r.read()
		self.result = self.result.split('\n')[0:-1]
		return len(self.result)
	
	def getBasicInformation(self, filename):
		u = ["http://www.rcsb.org/pdb/rest/customReport?pdbids=", "&customReportColumns=structureId,chainId,entityId,sequence,chainLength,db_id,db_name,molecularWeight,secondaryStructure,entityMacromoleculeType&service=wsdisplay&format=csv"]
		ft = open(filename, 'w')
		
		for i in range(0, len(self.result), 100):
			print "# Downloading %d-%d" % (i, i+100)
			url = u[0] + (', '.join((self.result[i:i+100] if i+100 < len(self.result) else self.result[i:]))) + u[1]
			os.system(("wget -q -O out.csv \"%s\"" % url))
		
			f = open('out.csv', 'r')
			data = ''.join(f.readlines())
			print "# - Data Size: %d" % len(data)
			f.close()
			os.remove('out.csv')
		
			data = data.replace('<br />', '\n')
			if i > 0:
				[temp, data] = data.split('\n',1)
			ft.write(data)
		ft.close()
		return
	

class LocalInstance:
	def __init__(self, path):
		self.filename = path
		return
	
	def filterBasics(self, minChainLength=1):
		fin = open(self.filename, 'r')
		fout = open("temp.csv", 'w')
		
		dh = ['chainLength']
		headertext = fin.readline()
		fout.write(headertext)
		header = headertext.strip().split(',')
		d = {i:header.index(i) for i in dh}
		print "Chain Length: %d" % tuple(d.values())
				
		for line in fin:
			l = line.strip().split(',')
			l = [i.strip('"') for i in l]
			if int(l[d['chainLength']]) > minChainLength:
				fout.write(line)
		
		fout.close()
		fin.close()
		os.remove(self.filename)
		shutil.move("temp.csv", self.filename)
		return
	
	def getProteinSequences(self):
		f = open(self.filename, 'r')
		p = {}
		
		dh = ['structureId', 'chainId', 'entityId', 'sequence', 'entityMacromoleculeType']
		header = f.readline().strip().split(',')
		d = {i:header.index(i) for i in dh}
		print "# Parsing '%s'\n# - Header: " % (self.filename),
		for i in d.keys():
			print "%s: %d" % (i, d[i]),
		print " "		

		for line in f:
			l = line.strip().split(',')
			l = [i.strip('"') for i in l]
			if 'Polypeptide' in l[d['entityMacromoleculeType']]:
				p[(l[d['structureId']], l[d['chainId']], int(l[d['entityId']]))] = l[d['sequence']]
		
		f.close()
		return p
	
	def getFrequencies(self, p, aa, filename):
		f = {}
		max_cnt = 0; max_rat = 0
		for k, v in p.iteritems():
			cnt = v.count(aa)
			lev = len(v)
			rat = float(cnt)/float(lev)
			f[k] = (cnt, lev, rat)
			max_cnt = max(max_cnt, cnt)
			max_rat = max(max_rat, rat)
		
		f = {k:(v[0], v[1], v[2], (float(v[0])/float(max_cnt)), (float(v[2])/float(max_rat)), (((float(v[0])/float(max_cnt))**2)+((float(v[2])/float(max_rat))**2))) for k,v in f.iteritems()}
		
		self.writeToCSV(filename, ['structureId', 'chainId', 'entityId', 'lysineCount', 'chainLength', 'lysineRatio', 'normalizedLysineCount', 'normalizedLysineRatio', 'suitabilityIndexA'], [[k[0], k[1], k[2], v[0], v[1], v[2], v[3], v[4], v[5]] for k,v in f.iteritems()])

		return f
	
	def plotFrequencies(self, f):
		x = []; y = []; d = []; keys = []
		for k,v in f.iteritems():
			x.append(v[2])
			y.append(float(v[0]))
			d.append(v[5])
			keys.append(k)
		
		npd = N.array(d)
		r_mean = N.mean(npd)
		r_std = N.std(npd)
		del npd
		
		xs = []; ys = []; ds = []; keyss = []
		t = []
		i = 0;
		while i < len(x):
			if d[i] > (r_mean+(3*r_std)):
				ds.append(d.pop(i))
				xs.append(x.pop(i))
				ys.append(y.pop(i))
				keyss.append(keys.pop(i))
				t.append((xs[-1], ys[-1], ds[-1], keyss[-1]))
			else:
				i += 1
		del i
		
		fig = P.figure(1)
		ax = fig.add_subplot(111)
		
		P.scatter(x,y)
		P.scatter(xs, ys, c='r')
		
		P.ylabel('Lysine Count')
		P.xlabel('Lysine Ratio')
		
		t = sorted(t, key=lambda st: st[2])
		
		#for i in t:
			#ax.annotate(keyss[i], xy=(xs[i], ys[i]), xycoords='data', size=8)
		#	print i[3], ": Lysine Ratio=", i[0], ", Lysine Count=", i[1], ", Suitability Index A=", i[2]
				
		P.show()
	
	def writeToCSV(self, filename, header, data):
		with open(filename, 'wb') as csvfile:
			cwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
			cwriter.writerow(header)
			for i in data:
				cwriter.writerow(i)
		return