PDB Querying
General Information
Dependencies
| Python, numpy, pylab, wget
|
License
| GPL
|
Example Usage
| python queryPDB.py -q query.xml -f 10
|
Output
| freq.csv, which is a comma seperated file containing various properties of the different proteins. The most important value is the frequency of the aminoacids Lysing (K) or Arginine (R). Which on is taken can be changed in the source code of queryPDB.py changing the K in the following line: freq = L.getFrequencies(proteins, 'K', 'freq.csv')
|
query.xml
<orgPdbQuery>
<version>head</version>
<queryType>org.pdb.query.simple.ChainTypeQuery</queryType>
<description>Chain Type: there is a Protein chain</description>
<containsProtein>Y</containsProtein>
<containsDna>?</containsDna>
<containsRna>?</containsRna>
<containsHybrid>?</containsHybrid>
</orgPdbQuery>
PDB.py
#!/usr/bin/env python
"""
Module to read and process PDB data. (http://www.pdb.org)
@author: Pascal Pieters
@copyright: Copyright 2013, TU-Eindhoven iGEM Team
@credits: TU-Eindhoven iGEM Team 2013
@license: GPL
@version: 0.0.1
@maintainer: Pascal Pieters
@email: p.a.pieters@student.tue.nl
@status: Testing
"""
import urllib2
import os
import pylab as P
import numpy as N
import csv
import shutil
class RemoteInstance:
"""General PDB RemoteInstance that can be used to retrieve PDB data"""
def __init__(self):
"""Initiate the RemoteInstance"""
self.searchURL = "http://www.rcsb.org/pdb/rest/search"
self.query = ""
self.result = ""
return
def loadQuery(self, filename):
f = open(filename, 'r')
self.query = ''.join(f.readlines())
f.close()
return
def setQuery(self,q):
self.query = q
return
def executeQuery(self):
req = urllib2.Request(self.searchURL, data=self.query)
r = urllib2.urlopen(req)
self.result = r.read()
self.result = self.result.split('\n')[0:-1]
return len(self.result)
def getBasicInformation(self, filename):
u = ["http://www.rcsb.org/pdb/rest/customReport?pdbids=", "&customReportColumns=structureId,chainId,entityId,sequence,chainLength,db_id,db_name,molecularWeight,secondaryStructure,entityMacromoleculeType&service=wsdisplay&format=csv"]
ft = open(filename, 'w')
for i in range(0, len(self.result), 100):
print "# Downloading %d-%d" % (i, i+100)
url = u[0] + (', '.join((self.result[i:i+100] if i+100 < len(self.result) else self.result[i:]))) + u[1]
os.system(("wget -q -O out.csv \"%s\"" % url))
f = open('out.csv', 'r')
data = ''.join(f.readlines())
print "# - Data Size: %d" % len(data)
f.close()
os.remove('out.csv')
data = data.replace('<br />', '\n')
if i > 0:
[temp, data] = data.split('\n',1)
ft.write(data)
ft.close()
return
class LocalInstance:
def __init__(self, path):
self.filename = path
return
def filterBasics(self, minChainLength=1):
fin = open(self.filename, 'r')
fout = open("temp.csv", 'w')
dh = ['chainLength']
headertext = fin.readline()
fout.write(headertext)
header = headertext.strip().split(',')
d = {i:header.index(i) for i in dh}
print "Chain Length: %d" % tuple(d.values())
for line in fin:
l = line.strip().split(',')
l = [i.strip('"') for i in l]
if int(l[d['chainLength']]) > minChainLength:
fout.write(line)
fout.close()
fin.close()
os.remove(self.filename)
shutil.move("temp.csv", self.filename)
return
def getProteinSequences(self):
f = open(self.filename, 'r')
p = {}
dh = ['structureId', 'chainId', 'entityId', 'sequence', 'entityMacromoleculeType']
header = f.readline().strip().split(',')
d = {i:header.index(i) for i in dh}
print "# Parsing '%s'\n# - Header: " % (self.filename),
for i in d.keys():
print "%s: %d" % (i, d[i]),
print " "
for line in f:
l = line.strip().split(',')
l = [i.strip('"') for i in l]
if 'Polypeptide' in l[d['entityMacromoleculeType']]:
p[(l[d['structureId']], l[d['chainId']], int(l[d['entityId']]))] = l[d['sequence']]
f.close()
return p
def getFrequencies(self, p, aa, filename):
f = {}
max_cnt = 0; max_rat = 0
for k, v in p.iteritems():
cnt = v.count(aa)
lev = len(v)
rat = float(cnt)/float(lev)
f[k] = (cnt, lev, rat)
max_cnt = max(max_cnt, cnt)
max_rat = max(max_rat, rat)
f = {k:(v[0], v[1], v[2], (float(v[0])/float(max_cnt)), (float(v[2])/float(max_rat)), (((float(v[0])/float(max_cnt))**2)+((float(v[2])/float(max_rat))**2))) for k,v in f.iteritems()}
self.writeToCSV(filename, ['structureId', 'chainId', 'entityId', 'lysineCount', 'chainLength', 'lysineRatio', 'normalizedLysineCount', 'normalizedLysineRatio', 'suitabilityIndexA'], [[k[0], k[1], k[2], v[0], v[1], v[2], v[3], v[4], v[5]] for k,v in f.iteritems()])
return f
def plotFrequencies(self, f):
x = []; y = []; d = []; keys = []
for k,v in f.iteritems():
x.append(v[2])
y.append(float(v[0]))
d.append(v[5])
keys.append(k)
npd = N.array(d)
r_mean = N.mean(npd)
r_std = N.std(npd)
del npd
xs = []; ys = []; ds = []; keyss = []
t = []
i = 0;
while i < len(x):
if d[i] > (r_mean+(3*r_std)):
ds.append(d.pop(i))
xs.append(x.pop(i))
ys.append(y.pop(i))
keyss.append(keys.pop(i))
t.append((xs[-1], ys[-1], ds[-1], keyss[-1]))
else:
i += 1
del i
fig = P.figure(1)
ax = fig.add_subplot(111)
P.scatter(x,y)
P.scatter(xs, ys, c='r')
P.ylabel('Lysine Count')
P.xlabel('Lysine Ratio')
t = sorted(t, key=lambda st: st[2])
P.show()
def writeToCSV(self, filename, header, data):
with open(filename, 'wb') as csvfile:
cwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
cwriter.writerow(header)
for i in data:
cwriter.writerow(i)
return
queryPDB.py
#!/usr/bin/env python
"""
Script to execute a simple PDB query
@author: Pascal Pieters
@copyright: Copyright 2013, TU-Eindhoven iGEM Team
@credits: TU-Eindhoven iGEM Team 2013
@license: GPL
@version: 0.0.1
@maintainer: Pascal Pieters
@email: p.a.pieters@student.tue.nl
@status: Testing
"""
import PDB
import getopt, sys
if __name__ == "__main__":
opts, args = getopt.getopt(sys.argv[1:], "q:f:pv", ["query=", "filterlength=", "plot"])
query = ""
filterlength = 0
plot = False
verbose = False
for o, a in opts:
if o == "-v":
verbose = True
elif o in ("-q", "--query"):
query = (a if a else "query.xml")
elif o in ("-f", "--filterlength"):
filterlength = (int(a) if a else 10)
elif o in ("-p", "--plot"):
plot = True
else:
assert False, "unhandled option"
if query:
P = PDB.RemoteInstance()
P.loadQuery("query.xml")
if verbose: print "Structures Found: ", P.executeQuery()
else: P.executeQuery()
P.getBasicInformation("basic.csv")
if filterlength or plot:
L = PDB.LocalInstance("basic.csv")
if filterlength:
L.filterBasics(minChainLength=filterlength)
if plot:
proteins = L.getProteinSequences()
if verbose:
for k,v in proteins.iteritems():
print k, ": ", v
freq = L.getFrequencies(proteins, 'K', 'freq.csv')
L.plotFrequencies(freq)