Team:TU-Eindhoven/Code:PDBQuery
From 2013.igem.org
(Difference between revisions)
Pascalaldo (Talk | contribs) |
Pascalaldo (Talk | contribs) |
||
(5 intermediate revisions not shown) | |||
Line 11: | Line 11: | ||
|- | |- | ||
| '''Example Usage''' | | '''Example Usage''' | ||
- | | python queryPDB.py -q query.xml -f 10 | + | | python ''queryPDB.py'' -q ''query.xml'' -f 10 |
+ | |- | ||
+ | | '''Output''' | ||
+ | | ''freq.csv'', which is a comma seperated file containing various properties of the different proteins. The most important value is the frequency of the aminoacids Lysing (K) or Arginine (R). Which on is taken can be changed in the source code of ''queryPDB.py'' changing the K in the following line: <code>freq = L.getFrequencies(proteins, 'K', 'freq.csv')</code> | ||
|} | |} | ||
Line 269: | Line 272: | ||
for k,v in proteins.iteritems(): | for k,v in proteins.iteritems(): | ||
print k, ": ", v | print k, ": ", v | ||
- | freq = L.getFrequencies(proteins, ' | + | freq = L.getFrequencies(proteins, 'K', 'freq.csv') |
L.plotFrequencies(freq) | L.plotFrequencies(freq) | ||
Line 275: | Line 278: | ||
{{:Team:TU-Eindhoven/Template:BaseFooter}} | {{:Team:TU-Eindhoven/Template:BaseFooter}} | ||
+ | {{:Team:TU-Eindhoven/Template:Sponsors}} | ||
{{:Team:TU-Eindhoven/Template:SetTitle | menu=drylab | page=Code: PDB Querying }} | {{:Team:TU-Eindhoven/Template:SetTitle | menu=drylab | page=Code: PDB Querying }} | ||
{{:Team:TU-Eindhoven/Template:UseSyntaxHighlighting}} | {{:Team:TU-Eindhoven/Template:UseSyntaxHighlighting}} |
Latest revision as of 00:04, 5 October 2013
Contents |
PDB Querying
General Information
Dependencies | Python, numpy, pylab, wget |
License | GPL |
Example Usage | python queryPDB.py -q query.xml -f 10 |
Output | freq.csv, which is a comma seperated file containing various properties of the different proteins. The most important value is the frequency of the aminoacids Lysing (K) or Arginine (R). Which on is taken can be changed in the source code of queryPDB.py changing the K in the following line: freq = L.getFrequencies(proteins, 'K', 'freq.csv')
|
query.xml
<orgPdbQuery> <version>head</version> <queryType>org.pdb.query.simple.ChainTypeQuery</queryType> <description>Chain Type: there is a Protein chain</description> <containsProtein>Y</containsProtein> <containsDna>?</containsDna> <containsRna>?</containsRna> <containsHybrid>?</containsHybrid> </orgPdbQuery>
PDB.py
#!/usr/bin/env python """ Module to read and process PDB data. (http://www.pdb.org) @author: Pascal Pieters @copyright: Copyright 2013, TU-Eindhoven iGEM Team @credits: TU-Eindhoven iGEM Team 2013 @license: GPL @version: 0.0.1 @maintainer: Pascal Pieters @email: p.a.pieters@student.tue.nl @status: Testing """ import urllib2 import os import pylab as P import numpy as N import csv import shutil class RemoteInstance: """General PDB RemoteInstance that can be used to retrieve PDB data""" def __init__(self): """Initiate the RemoteInstance""" self.searchURL = "http://www.rcsb.org/pdb/rest/search" self.query = "" self.result = "" return def loadQuery(self, filename): f = open(filename, 'r') self.query = ''.join(f.readlines()) f.close() return def setQuery(self,q): self.query = q return def executeQuery(self): req = urllib2.Request(self.searchURL, data=self.query) r = urllib2.urlopen(req) self.result = r.read() self.result = self.result.split('\n')[0:-1] return len(self.result) def getBasicInformation(self, filename): u = ["http://www.rcsb.org/pdb/rest/customReport?pdbids=", "&customReportColumns=structureId,chainId,entityId,sequence,chainLength,db_id,db_name,molecularWeight,secondaryStructure,entityMacromoleculeType&service=wsdisplay&format=csv"] ft = open(filename, 'w') for i in range(0, len(self.result), 100): print "# Downloading %d-%d" % (i, i+100) url = u[0] + (', '.join((self.result[i:i+100] if i+100 < len(self.result) else self.result[i:]))) + u[1] os.system(("wget -q -O out.csv \"%s\"" % url)) f = open('out.csv', 'r') data = ''.join(f.readlines()) print "# - Data Size: %d" % len(data) f.close() os.remove('out.csv') data = data.replace('<br />', '\n') if i > 0: [temp, data] = data.split('\n',1) ft.write(data) ft.close() return class LocalInstance: def __init__(self, path): self.filename = path return def filterBasics(self, minChainLength=1): fin = open(self.filename, 'r') fout = open("temp.csv", 'w') dh = ['chainLength'] headertext = fin.readline() fout.write(headertext) header = headertext.strip().split(',') d = {i:header.index(i) for i in dh} print "Chain Length: %d" % tuple(d.values()) for line in fin: l = line.strip().split(',') l = [i.strip('"') for i in l] if int(l[d['chainLength']]) > minChainLength: fout.write(line) fout.close() fin.close() os.remove(self.filename) shutil.move("temp.csv", self.filename) return def getProteinSequences(self): f = open(self.filename, 'r') p = {} dh = ['structureId', 'chainId', 'entityId', 'sequence', 'entityMacromoleculeType'] header = f.readline().strip().split(',') d = {i:header.index(i) for i in dh} print "# Parsing '%s'\n# - Header: " % (self.filename), for i in d.keys(): print "%s: %d" % (i, d[i]), print " " for line in f: l = line.strip().split(',') l = [i.strip('"') for i in l] if 'Polypeptide' in l[d['entityMacromoleculeType']]: p[(l[d['structureId']], l[d['chainId']], int(l[d['entityId']]))] = l[d['sequence']] f.close() return p def getFrequencies(self, p, aa, filename): f = {} max_cnt = 0; max_rat = 0 for k, v in p.iteritems(): cnt = v.count(aa) lev = len(v) rat = float(cnt)/float(lev) f[k] = (cnt, lev, rat) max_cnt = max(max_cnt, cnt) max_rat = max(max_rat, rat) f = {k:(v[0], v[1], v[2], (float(v[0])/float(max_cnt)), (float(v[2])/float(max_rat)), (((float(v[0])/float(max_cnt))**2)+((float(v[2])/float(max_rat))**2))) for k,v in f.iteritems()} self.writeToCSV(filename, ['structureId', 'chainId', 'entityId', 'lysineCount', 'chainLength', 'lysineRatio', 'normalizedLysineCount', 'normalizedLysineRatio', 'suitabilityIndexA'], [[k[0], k[1], k[2], v[0], v[1], v[2], v[3], v[4], v[5]] for k,v in f.iteritems()]) return f def plotFrequencies(self, f): x = []; y = []; d = []; keys = [] for k,v in f.iteritems(): x.append(v[2]) y.append(float(v[0])) d.append(v[5]) keys.append(k) npd = N.array(d) r_mean = N.mean(npd) r_std = N.std(npd) del npd xs = []; ys = []; ds = []; keyss = [] t = [] i = 0; while i < len(x): if d[i] > (r_mean+(3*r_std)): ds.append(d.pop(i)) xs.append(x.pop(i)) ys.append(y.pop(i)) keyss.append(keys.pop(i)) t.append((xs[-1], ys[-1], ds[-1], keyss[-1])) else: i += 1 del i fig = P.figure(1) ax = fig.add_subplot(111) P.scatter(x,y) P.scatter(xs, ys, c='r') P.ylabel('Lysine Count') P.xlabel('Lysine Ratio') t = sorted(t, key=lambda st: st[2]) P.show() def writeToCSV(self, filename, header, data): with open(filename, 'wb') as csvfile: cwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) cwriter.writerow(header) for i in data: cwriter.writerow(i) return
queryPDB.py
#!/usr/bin/env python """ Script to execute a simple PDB query @author: Pascal Pieters @copyright: Copyright 2013, TU-Eindhoven iGEM Team @credits: TU-Eindhoven iGEM Team 2013 @license: GPL @version: 0.0.1 @maintainer: Pascal Pieters @email: p.a.pieters@student.tue.nl @status: Testing """ import PDB import getopt, sys if __name__ == "__main__": opts, args = getopt.getopt(sys.argv[1:], "q:f:pv", ["query=", "filterlength=", "plot"]) query = "" filterlength = 0 plot = False verbose = False for o, a in opts: if o == "-v": verbose = True elif o in ("-q", "--query"): query = (a if a else "query.xml") elif o in ("-f", "--filterlength"): filterlength = (int(a) if a else 10) elif o in ("-p", "--plot"): plot = True else: assert False, "unhandled option" if query: P = PDB.RemoteInstance() P.loadQuery("query.xml") if verbose: print "Structures Found: ", P.executeQuery() else: P.executeQuery() P.getBasicInformation("basic.csv") if filterlength or plot: L = PDB.LocalInstance("basic.csv") if filterlength: L.filterBasics(minChainLength=filterlength) if plot: proteins = L.getProteinSequences() if verbose: for k,v in proteins.iteritems(): print k, ": ", v freq = L.getFrequencies(proteins, 'K', 'freq.csv') L.plotFrequencies(freq)