Team:TU-Eindhoven/Code:PDBQuery
From 2013.igem.org
Revision as of 12:12, 2 July 2013 by Pascalaldo (Talk | contribs)
Contents |
PDB Querying
General Information
Dependencies | Python, numpy, pylab, wget |
License | GPL |
Example Usage | python queryPDB.py -q query.xml -f 10 |
query.xml
<orgPdbQuery> <version>head</version> <queryType>org.pdb.query.simple.ChainTypeQuery</queryType> <description>Chain Type: there is a Protein chain</description> <containsProtein>Y</containsProtein> <containsDna>?</containsDna> <containsRna>?</containsRna> <containsHybrid>?</containsHybrid> </orgPdbQuery>
PDB.py
#!/usr/bin/env python """ Module to read and process PDB data. (http://www.pdb.org) @author: Pascal Pieters @copyright: Copyright 2013, TU-Eindhoven iGEM Team @credits: TU-Eindhoven iGEM Team 2013 @license: GPL @version: 0.0.1 @maintainer: Pascal Pieters @email: p.a.pieters@student.tue.nl @status: Testing """ import urllib2 import os import pylab as P import numpy as N import csv import shutil class RemoteInstance: """General PDB RemoteInstance that can be used to retrieve PDB data""" def __init__(self): """Initiate the RemoteInstance""" self.searchURL = "http://www.rcsb.org/pdb/rest/search" self.query = "" self.result = "" return def loadQuery(self, filename): f = open(filename, 'r') self.query = ''.join(f.readlines()) f.close() return def setQuery(self,q): self.query = q return def executeQuery(self): req = urllib2.Request(self.searchURL, data=self.query) r = urllib2.urlopen(req) self.result = r.read() self.result = self.result.split('\n')[0:-1] return len(self.result) def getBasicInformation(self, filename): u = ["http://www.rcsb.org/pdb/rest/customReport?pdbids=", "&customReportColumns=structureId,chainId,entityId,sequence,chainLength,db_id,db_name,molecularWeight,secondaryStructure,entityMacromoleculeType&service=wsdisplay&format=csv"] ft = open(filename, 'w') for i in range(0, len(self.result), 100): print "# Downloading %d-%d" % (i, i+100) url = u[0] + (', '.join((self.result[i:i+100] if i+100 < len(self.result) else self.result[i:]))) + u[1] os.system(("wget -q -O out.csv \"%s\"" % url)) f = open('out.csv', 'r') data = ''.join(f.readlines()) print "# - Data Size: %d" % len(data) f.close() os.remove('out.csv') data = data.replace('<br />', '\n') if i > 0: [temp, data] = data.split('\n',1) ft.write(data) ft.close() return class LocalInstance: def __init__(self, path): self.filename = path return def filterBasics(self, minChainLength=1): fin = open(self.filename, 'r') fout = open("temp.csv", 'w') dh = ['chainLength'] headertext = fin.readline() fout.write(headertext) header = headertext.strip().split(',') d = {i:header.index(i) for i in dh} print "Chain Length: %d" % tuple(d.values()) for line in fin: l = line.strip().split(',') l = [i.strip('"') for i in l] if int(l[d['chainLength']]) > minChainLength: fout.write(line) fout.close() fin.close() os.remove(self.filename) shutil.move("temp.csv", self.filename) return def getProteinSequences(self): f = open(self.filename, 'r') p = {} dh = ['structureId', 'chainId', 'entityId', 'sequence', 'entityMacromoleculeType'] header = f.readline().strip().split(',') d = {i:header.index(i) for i in dh} print "# Parsing '%s'\n# - Header: " % (self.filename), for i in d.keys(): print "%s: %d" % (i, d[i]), print " " for line in f: l = line.strip().split(',') l = [i.strip('"') for i in l] if 'Polypeptide' in l[d['entityMacromoleculeType']]: p[(l[d['structureId']], l[d['chainId']], int(l[d['entityId']]))] = l[d['sequence']] f.close() return p def getFrequencies(self, p, aa, filename): f = {} max_cnt = 0; max_rat = 0 for k, v in p.iteritems(): cnt = v.count(aa) lev = len(v) rat = float(cnt)/float(lev) f[k] = (cnt, lev, rat) max_cnt = max(max_cnt, cnt) max_rat = max(max_rat, rat) f = {k:(v[0], v[1], v[2], (float(v[0])/float(max_cnt)), (float(v[2])/float(max_rat)), (((float(v[0])/float(max_cnt))**2)+((float(v[2])/float(max_rat))**2))) for k,v in f.iteritems()} self.writeToCSV(filename, ['structureId', 'chainId', 'entityId', 'lysineCount', 'chainLength', 'lysineRatio', 'normalizedLysineCount', 'normalizedLysineRatio', 'suitabilityIndexA'], [[k[0], k[1], k[2], v[0], v[1], v[2], v[3], v[4], v[5]] for k,v in f.iteritems()]) return f def plotFrequencies(self, f): x = []; y = []; d = []; keys = [] for k,v in f.iteritems(): x.append(v[2]) y.append(float(v[0])) d.append(v[5]) keys.append(k) npd = N.array(d) r_mean = N.mean(npd) r_std = N.std(npd) del npd xs = []; ys = []; ds = []; keyss = [] t = [] i = 0; while i < len(x): if d[i] > (r_mean+(3*r_std)): ds.append(d.pop(i)) xs.append(x.pop(i)) ys.append(y.pop(i)) keyss.append(keys.pop(i)) t.append((xs[-1], ys[-1], ds[-1], keyss[-1])) else: i += 1 del i fig = P.figure(1) ax = fig.add_subplot(111) P.scatter(x,y) P.scatter(xs, ys, c='r') P.ylabel('Lysine Count') P.xlabel('Lysine Ratio') t = sorted(t, key=lambda st: st[2]) P.show() def writeToCSV(self, filename, header, data): with open(filename, 'wb') as csvfile: cwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) cwriter.writerow(header) for i in data: cwriter.writerow(i) return
queryPDB.py
#!/usr/bin/env python """ Script to execute a simple PDB query @author: Pascal Pieters @copyright: Copyright 2013, TU-Eindhoven iGEM Team @credits: TU-Eindhoven iGEM Team 2013 @license: GPL @version: 0.0.1 @maintainer: Pascal Pieters @email: p.a.pieters@student.tue.nl @status: Testing """ import PDB import getopt, sys if __name__ == "__main__": opts, args = getopt.getopt(sys.argv[1:], "q:f:pv", ["query=", "filterlength=", "plot"]) query = "" filterlength = 0 plot = False verbose = False for o, a in opts: if o == "-v": verbose = True elif o in ("-q", "--query"): query = (a if a else "query.xml") elif o in ("-f", "--filterlength"): filterlength = (int(a) if a else 10) elif o in ("-p", "--plot"): plot = True else: assert False, "unhandled option" if query: P = PDB.RemoteInstance() P.loadQuery("query.xml") if verbose: print "Structures Found: ", P.executeQuery() else: P.executeQuery() P.getBasicInformation("basic.csv") if filterlength or plot: L = PDB.LocalInstance("basic.csv") if filterlength: L.filterBasics(minChainLength=filterlength) if plot: proteins = L.getProteinSequences() if verbose: for k,v in proteins.iteritems(): print k, ": ", v freq = L.getFrequencies(proteins, 'R', 'freq.csv') L.plotFrequencies(freq)