Team:TU-Eindhoven/Code:PDBQuery
From 2013.igem.org
(Difference between revisions)
Pascalaldo (Talk | contribs) (Created page with "{{:Team:TU-Eindhoven/Template:Base}} {{:Team:TU-Eindhoven/Template:MenuBar}} {{:Team:TU-Eindhoven/Template:Code }}<nowiki> <orgPdbQuery> <version>head</version> <queryTy...") |
Pascalaldo (Talk | contribs) |
||
(9 intermediate revisions not shown) | |||
Line 1: | Line 1: | ||
{{:Team:TU-Eindhoven/Template:Base}} | {{:Team:TU-Eindhoven/Template:Base}} | ||
{{:Team:TU-Eindhoven/Template:MenuBar}} | {{:Team:TU-Eindhoven/Template:MenuBar}} | ||
+ | ==PDB Querying== | ||
+ | ===General Information=== | ||
+ | {|class="table table-striped" style="width: 600px;" | ||
+ | | '''Dependencies''' | ||
+ | | Python, numpy, pylab, wget | ||
+ | |- | ||
+ | | '''License''' | ||
+ | | GPL | ||
+ | |- | ||
+ | | '''Example Usage''' | ||
+ | | python ''queryPDB.py'' -q ''query.xml'' -f 10 | ||
+ | |- | ||
+ | | '''Output''' | ||
+ | | ''freq.csv'', which is a comma seperated file containing various properties of the different proteins. The most important value is the frequency of the aminoacids Lysing (K) or Arginine (R). Which on is taken can be changed in the source code of ''queryPDB.py'' changing the K in the following line: <code>freq = L.getFrequencies(proteins, 'K', 'freq.csv')</code> | ||
+ | |} | ||
+ | ===query.xml=== | ||
{{:Team:TU-Eindhoven/Template:Code }}<nowiki> | {{:Team:TU-Eindhoven/Template:Code }}<nowiki> | ||
<orgPdbQuery> | <orgPdbQuery> | ||
Line 12: | Line 28: | ||
<containsHybrid>?</containsHybrid> | <containsHybrid>?</containsHybrid> | ||
</orgPdbQuery> | </orgPdbQuery> | ||
+ | </nowiki>{{:Team:TU-Eindhoven/Template:CodeEnd }} | ||
+ | ===PDB.py=== | ||
+ | {{:Team:TU-Eindhoven/Template:Code }}<nowiki> | ||
+ | #!/usr/bin/env python | ||
+ | |||
+ | """ | ||
+ | Module to read and process PDB data. (http://www.pdb.org) | ||
+ | |||
+ | @author: Pascal Pieters | ||
+ | @copyright: Copyright 2013, TU-Eindhoven iGEM Team | ||
+ | @credits: TU-Eindhoven iGEM Team 2013 | ||
+ | @license: GPL | ||
+ | @version: 0.0.1 | ||
+ | @maintainer: Pascal Pieters | ||
+ | @email: p.a.pieters@student.tue.nl | ||
+ | @status: Testing | ||
+ | """ | ||
+ | |||
+ | import urllib2 | ||
+ | import os | ||
+ | import pylab as P | ||
+ | import numpy as N | ||
+ | import csv | ||
+ | import shutil | ||
+ | |||
+ | class RemoteInstance: | ||
+ | """General PDB RemoteInstance that can be used to retrieve PDB data""" | ||
+ | def __init__(self): | ||
+ | """Initiate the RemoteInstance""" | ||
+ | self.searchURL = "http://www.rcsb.org/pdb/rest/search" | ||
+ | self.query = "" | ||
+ | self.result = "" | ||
+ | return | ||
+ | |||
+ | def loadQuery(self, filename): | ||
+ | f = open(filename, 'r') | ||
+ | self.query = ''.join(f.readlines()) | ||
+ | f.close() | ||
+ | return | ||
+ | |||
+ | def setQuery(self,q): | ||
+ | self.query = q | ||
+ | return | ||
+ | |||
+ | def executeQuery(self): | ||
+ | req = urllib2.Request(self.searchURL, data=self.query) | ||
+ | r = urllib2.urlopen(req) | ||
+ | self.result = r.read() | ||
+ | self.result = self.result.split('\n')[0:-1] | ||
+ | return len(self.result) | ||
+ | |||
+ | def getBasicInformation(self, filename): | ||
+ | u = ["http://www.rcsb.org/pdb/rest/customReport?pdbids=", "&customReportColumns=structureId,chainId,entityId,sequence,chainLength,db_id,db_name,molecularWeight,secondaryStructure,entityMacromoleculeType&service=wsdisplay&format=csv"] | ||
+ | ft = open(filename, 'w') | ||
+ | |||
+ | for i in range(0, len(self.result), 100): | ||
+ | print "# Downloading %d-%d" % (i, i+100) | ||
+ | url = u[0] + (', '.join((self.result[i:i+100] if i+100 < len(self.result) else self.result[i:]))) + u[1] | ||
+ | os.system(("wget -q -O out.csv \"%s\"" % url)) | ||
+ | |||
+ | f = open('out.csv', 'r') | ||
+ | data = ''.join(f.readlines()) | ||
+ | print "# - Data Size: %d" % len(data) | ||
+ | f.close() | ||
+ | os.remove('out.csv') | ||
+ | |||
+ | data = data.replace('<br />', '\n') | ||
+ | if i > 0: | ||
+ | [temp, data] = data.split('\n',1) | ||
+ | ft.write(data) | ||
+ | ft.close() | ||
+ | return | ||
+ | |||
+ | |||
+ | class LocalInstance: | ||
+ | def __init__(self, path): | ||
+ | self.filename = path | ||
+ | return | ||
+ | |||
+ | def filterBasics(self, minChainLength=1): | ||
+ | fin = open(self.filename, 'r') | ||
+ | fout = open("temp.csv", 'w') | ||
+ | |||
+ | dh = ['chainLength'] | ||
+ | headertext = fin.readline() | ||
+ | fout.write(headertext) | ||
+ | header = headertext.strip().split(',') | ||
+ | d = {i:header.index(i) for i in dh} | ||
+ | print "Chain Length: %d" % tuple(d.values()) | ||
+ | |||
+ | for line in fin: | ||
+ | l = line.strip().split(',') | ||
+ | l = [i.strip('"') for i in l] | ||
+ | if int(l[d['chainLength']]) > minChainLength: | ||
+ | fout.write(line) | ||
+ | |||
+ | fout.close() | ||
+ | fin.close() | ||
+ | os.remove(self.filename) | ||
+ | shutil.move("temp.csv", self.filename) | ||
+ | return | ||
+ | |||
+ | def getProteinSequences(self): | ||
+ | f = open(self.filename, 'r') | ||
+ | p = {} | ||
+ | |||
+ | dh = ['structureId', 'chainId', 'entityId', 'sequence', 'entityMacromoleculeType'] | ||
+ | header = f.readline().strip().split(',') | ||
+ | d = {i:header.index(i) for i in dh} | ||
+ | print "# Parsing '%s'\n# - Header: " % (self.filename), | ||
+ | for i in d.keys(): | ||
+ | print "%s: %d" % (i, d[i]), | ||
+ | print " " | ||
+ | |||
+ | for line in f: | ||
+ | l = line.strip().split(',') | ||
+ | l = [i.strip('"') for i in l] | ||
+ | if 'Polypeptide' in l[d['entityMacromoleculeType']]: | ||
+ | p[(l[d['structureId']], l[d['chainId']], int(l[d['entityId']]))] = l[d['sequence']] | ||
+ | |||
+ | f.close() | ||
+ | return p | ||
+ | |||
+ | def getFrequencies(self, p, aa, filename): | ||
+ | f = {} | ||
+ | max_cnt = 0; max_rat = 0 | ||
+ | for k, v in p.iteritems(): | ||
+ | cnt = v.count(aa) | ||
+ | lev = len(v) | ||
+ | rat = float(cnt)/float(lev) | ||
+ | f[k] = (cnt, lev, rat) | ||
+ | max_cnt = max(max_cnt, cnt) | ||
+ | max_rat = max(max_rat, rat) | ||
+ | |||
+ | f = {k:(v[0], v[1], v[2], (float(v[0])/float(max_cnt)), (float(v[2])/float(max_rat)), (((float(v[0])/float(max_cnt))**2)+((float(v[2])/float(max_rat))**2))) for k,v in f.iteritems()} | ||
+ | |||
+ | self.writeToCSV(filename, ['structureId', 'chainId', 'entityId', 'lysineCount', 'chainLength', 'lysineRatio', 'normalizedLysineCount', 'normalizedLysineRatio', 'suitabilityIndexA'], [[k[0], k[1], k[2], v[0], v[1], v[2], v[3], v[4], v[5]] for k,v in f.iteritems()]) | ||
+ | |||
+ | return f | ||
+ | |||
+ | def plotFrequencies(self, f): | ||
+ | x = []; y = []; d = []; keys = [] | ||
+ | for k,v in f.iteritems(): | ||
+ | x.append(v[2]) | ||
+ | y.append(float(v[0])) | ||
+ | d.append(v[5]) | ||
+ | keys.append(k) | ||
+ | |||
+ | npd = N.array(d) | ||
+ | r_mean = N.mean(npd) | ||
+ | r_std = N.std(npd) | ||
+ | del npd | ||
+ | |||
+ | xs = []; ys = []; ds = []; keyss = [] | ||
+ | t = [] | ||
+ | i = 0; | ||
+ | while i < len(x): | ||
+ | if d[i] > (r_mean+(3*r_std)): | ||
+ | ds.append(d.pop(i)) | ||
+ | xs.append(x.pop(i)) | ||
+ | ys.append(y.pop(i)) | ||
+ | keyss.append(keys.pop(i)) | ||
+ | t.append((xs[-1], ys[-1], ds[-1], keyss[-1])) | ||
+ | else: | ||
+ | i += 1 | ||
+ | del i | ||
+ | |||
+ | fig = P.figure(1) | ||
+ | ax = fig.add_subplot(111) | ||
+ | |||
+ | P.scatter(x,y) | ||
+ | P.scatter(xs, ys, c='r') | ||
+ | |||
+ | P.ylabel('Lysine Count') | ||
+ | P.xlabel('Lysine Ratio') | ||
+ | |||
+ | t = sorted(t, key=lambda st: st[2]) | ||
+ | |||
+ | P.show() | ||
+ | |||
+ | def writeToCSV(self, filename, header, data): | ||
+ | with open(filename, 'wb') as csvfile: | ||
+ | cwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) | ||
+ | cwriter.writerow(header) | ||
+ | for i in data: | ||
+ | cwriter.writerow(i) | ||
+ | return | ||
+ | </nowiki>{{:Team:TU-Eindhoven/Template:CodeEnd }} | ||
+ | ===queryPDB.py=== | ||
+ | {{:Team:TU-Eindhoven/Template:Code }}<nowiki> | ||
+ | #!/usr/bin/env python | ||
+ | |||
+ | """ | ||
+ | Script to execute a simple PDB query | ||
+ | |||
+ | @author: Pascal Pieters | ||
+ | @copyright: Copyright 2013, TU-Eindhoven iGEM Team | ||
+ | @credits: TU-Eindhoven iGEM Team 2013 | ||
+ | @license: GPL | ||
+ | @version: 0.0.1 | ||
+ | @maintainer: Pascal Pieters | ||
+ | @email: p.a.pieters@student.tue.nl | ||
+ | @status: Testing | ||
+ | """ | ||
+ | |||
+ | import PDB | ||
+ | import getopt, sys | ||
+ | |||
+ | if __name__ == "__main__": | ||
+ | opts, args = getopt.getopt(sys.argv[1:], "q:f:pv", ["query=", "filterlength=", "plot"]) | ||
+ | |||
+ | query = "" | ||
+ | filterlength = 0 | ||
+ | plot = False | ||
+ | verbose = False | ||
+ | |||
+ | for o, a in opts: | ||
+ | if o == "-v": | ||
+ | verbose = True | ||
+ | elif o in ("-q", "--query"): | ||
+ | query = (a if a else "query.xml") | ||
+ | elif o in ("-f", "--filterlength"): | ||
+ | filterlength = (int(a) if a else 10) | ||
+ | elif o in ("-p", "--plot"): | ||
+ | plot = True | ||
+ | else: | ||
+ | assert False, "unhandled option" | ||
+ | |||
+ | if query: | ||
+ | P = PDB.RemoteInstance() | ||
+ | P.loadQuery("query.xml") | ||
+ | if verbose: print "Structures Found: ", P.executeQuery() | ||
+ | else: P.executeQuery() | ||
+ | P.getBasicInformation("basic.csv") | ||
+ | |||
+ | if filterlength or plot: | ||
+ | L = PDB.LocalInstance("basic.csv") | ||
+ | if filterlength: | ||
+ | L.filterBasics(minChainLength=filterlength) | ||
+ | if plot: | ||
+ | proteins = L.getProteinSequences() | ||
+ | if verbose: | ||
+ | for k,v in proteins.iteritems(): | ||
+ | print k, ": ", v | ||
+ | freq = L.getFrequencies(proteins, 'K', 'freq.csv') | ||
+ | L.plotFrequencies(freq) | ||
+ | |||
</nowiki>{{:Team:TU-Eindhoven/Template:CodeEnd }} | </nowiki>{{:Team:TU-Eindhoven/Template:CodeEnd }} | ||
{{:Team:TU-Eindhoven/Template:BaseFooter}} | {{:Team:TU-Eindhoven/Template:BaseFooter}} | ||
+ | {{:Team:TU-Eindhoven/Template:Sponsors}} | ||
{{:Team:TU-Eindhoven/Template:SetTitle | menu=drylab | page=Code: PDB Querying }} | {{:Team:TU-Eindhoven/Template:SetTitle | menu=drylab | page=Code: PDB Querying }} | ||
{{:Team:TU-Eindhoven/Template:UseSyntaxHighlighting}} | {{:Team:TU-Eindhoven/Template:UseSyntaxHighlighting}} |
Latest revision as of 00:04, 5 October 2013
Contents |
PDB Querying
General Information
Dependencies | Python, numpy, pylab, wget |
License | GPL |
Example Usage | python queryPDB.py -q query.xml -f 10 |
Output | freq.csv, which is a comma seperated file containing various properties of the different proteins. The most important value is the frequency of the aminoacids Lysing (K) or Arginine (R). Which on is taken can be changed in the source code of queryPDB.py changing the K in the following line: freq = L.getFrequencies(proteins, 'K', 'freq.csv')
|
query.xml
<orgPdbQuery> <version>head</version> <queryType>org.pdb.query.simple.ChainTypeQuery</queryType> <description>Chain Type: there is a Protein chain</description> <containsProtein>Y</containsProtein> <containsDna>?</containsDna> <containsRna>?</containsRna> <containsHybrid>?</containsHybrid> </orgPdbQuery>
PDB.py
#!/usr/bin/env python """ Module to read and process PDB data. (http://www.pdb.org) @author: Pascal Pieters @copyright: Copyright 2013, TU-Eindhoven iGEM Team @credits: TU-Eindhoven iGEM Team 2013 @license: GPL @version: 0.0.1 @maintainer: Pascal Pieters @email: p.a.pieters@student.tue.nl @status: Testing """ import urllib2 import os import pylab as P import numpy as N import csv import shutil class RemoteInstance: """General PDB RemoteInstance that can be used to retrieve PDB data""" def __init__(self): """Initiate the RemoteInstance""" self.searchURL = "http://www.rcsb.org/pdb/rest/search" self.query = "" self.result = "" return def loadQuery(self, filename): f = open(filename, 'r') self.query = ''.join(f.readlines()) f.close() return def setQuery(self,q): self.query = q return def executeQuery(self): req = urllib2.Request(self.searchURL, data=self.query) r = urllib2.urlopen(req) self.result = r.read() self.result = self.result.split('\n')[0:-1] return len(self.result) def getBasicInformation(self, filename): u = ["http://www.rcsb.org/pdb/rest/customReport?pdbids=", "&customReportColumns=structureId,chainId,entityId,sequence,chainLength,db_id,db_name,molecularWeight,secondaryStructure,entityMacromoleculeType&service=wsdisplay&format=csv"] ft = open(filename, 'w') for i in range(0, len(self.result), 100): print "# Downloading %d-%d" % (i, i+100) url = u[0] + (', '.join((self.result[i:i+100] if i+100 < len(self.result) else self.result[i:]))) + u[1] os.system(("wget -q -O out.csv \"%s\"" % url)) f = open('out.csv', 'r') data = ''.join(f.readlines()) print "# - Data Size: %d" % len(data) f.close() os.remove('out.csv') data = data.replace('<br />', '\n') if i > 0: [temp, data] = data.split('\n',1) ft.write(data) ft.close() return class LocalInstance: def __init__(self, path): self.filename = path return def filterBasics(self, minChainLength=1): fin = open(self.filename, 'r') fout = open("temp.csv", 'w') dh = ['chainLength'] headertext = fin.readline() fout.write(headertext) header = headertext.strip().split(',') d = {i:header.index(i) for i in dh} print "Chain Length: %d" % tuple(d.values()) for line in fin: l = line.strip().split(',') l = [i.strip('"') for i in l] if int(l[d['chainLength']]) > minChainLength: fout.write(line) fout.close() fin.close() os.remove(self.filename) shutil.move("temp.csv", self.filename) return def getProteinSequences(self): f = open(self.filename, 'r') p = {} dh = ['structureId', 'chainId', 'entityId', 'sequence', 'entityMacromoleculeType'] header = f.readline().strip().split(',') d = {i:header.index(i) for i in dh} print "# Parsing '%s'\n# - Header: " % (self.filename), for i in d.keys(): print "%s: %d" % (i, d[i]), print " " for line in f: l = line.strip().split(',') l = [i.strip('"') for i in l] if 'Polypeptide' in l[d['entityMacromoleculeType']]: p[(l[d['structureId']], l[d['chainId']], int(l[d['entityId']]))] = l[d['sequence']] f.close() return p def getFrequencies(self, p, aa, filename): f = {} max_cnt = 0; max_rat = 0 for k, v in p.iteritems(): cnt = v.count(aa) lev = len(v) rat = float(cnt)/float(lev) f[k] = (cnt, lev, rat) max_cnt = max(max_cnt, cnt) max_rat = max(max_rat, rat) f = {k:(v[0], v[1], v[2], (float(v[0])/float(max_cnt)), (float(v[2])/float(max_rat)), (((float(v[0])/float(max_cnt))**2)+((float(v[2])/float(max_rat))**2))) for k,v in f.iteritems()} self.writeToCSV(filename, ['structureId', 'chainId', 'entityId', 'lysineCount', 'chainLength', 'lysineRatio', 'normalizedLysineCount', 'normalizedLysineRatio', 'suitabilityIndexA'], [[k[0], k[1], k[2], v[0], v[1], v[2], v[3], v[4], v[5]] for k,v in f.iteritems()]) return f def plotFrequencies(self, f): x = []; y = []; d = []; keys = [] for k,v in f.iteritems(): x.append(v[2]) y.append(float(v[0])) d.append(v[5]) keys.append(k) npd = N.array(d) r_mean = N.mean(npd) r_std = N.std(npd) del npd xs = []; ys = []; ds = []; keyss = [] t = [] i = 0; while i < len(x): if d[i] > (r_mean+(3*r_std)): ds.append(d.pop(i)) xs.append(x.pop(i)) ys.append(y.pop(i)) keyss.append(keys.pop(i)) t.append((xs[-1], ys[-1], ds[-1], keyss[-1])) else: i += 1 del i fig = P.figure(1) ax = fig.add_subplot(111) P.scatter(x,y) P.scatter(xs, ys, c='r') P.ylabel('Lysine Count') P.xlabel('Lysine Ratio') t = sorted(t, key=lambda st: st[2]) P.show() def writeToCSV(self, filename, header, data): with open(filename, 'wb') as csvfile: cwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) cwriter.writerow(header) for i in data: cwriter.writerow(i) return
queryPDB.py
#!/usr/bin/env python """ Script to execute a simple PDB query @author: Pascal Pieters @copyright: Copyright 2013, TU-Eindhoven iGEM Team @credits: TU-Eindhoven iGEM Team 2013 @license: GPL @version: 0.0.1 @maintainer: Pascal Pieters @email: p.a.pieters@student.tue.nl @status: Testing """ import PDB import getopt, sys if __name__ == "__main__": opts, args = getopt.getopt(sys.argv[1:], "q:f:pv", ["query=", "filterlength=", "plot"]) query = "" filterlength = 0 plot = False verbose = False for o, a in opts: if o == "-v": verbose = True elif o in ("-q", "--query"): query = (a if a else "query.xml") elif o in ("-f", "--filterlength"): filterlength = (int(a) if a else 10) elif o in ("-p", "--plot"): plot = True else: assert False, "unhandled option" if query: P = PDB.RemoteInstance() P.loadQuery("query.xml") if verbose: print "Structures Found: ", P.executeQuery() else: P.executeQuery() P.getBasicInformation("basic.csv") if filterlength or plot: L = PDB.LocalInstance("basic.csv") if filterlength: L.filterBasics(minChainLength=filterlength) if plot: proteins = L.getProteinSequences() if verbose: for k,v in proteins.iteritems(): print k, ": ", v freq = L.getFrequencies(proteins, 'K', 'freq.csv') L.plotFrequencies(freq)