Team:TU-Eindhoven/Code:PDBQuery
From 2013.igem.org
(Difference between revisions)
Pascalaldo (Talk | contribs) (→General Information) |
Pascalaldo (Talk | contribs) (→General Information) |
||
Line 14: | Line 14: | ||
|- | |- | ||
| '''Output''' | | '''Output''' | ||
- | | ''freq.csv'', which is a comma seperated file containing various properties of the different proteins. The most important value is the frequency of the aminoacids Lysing (K) or Arganine (R). Which on is taken can be changed in the source code of ''queryPDB.py''. | + | | ''freq.csv'', which is a comma seperated file containing various properties of the different proteins. The most important value is the frequency of the aminoacids Lysing (K) or Arganine (R). Which on is taken can be changed in the source code of ''queryPDB.py'' changing the R in <code>freq = L.getFrequencies(proteins, 'R', 'freq.csv')</code>. |
|} | |} | ||
Revision as of 12:24, 2 July 2013
Contents |
PDB Querying
General Information
Dependencies | Python, numpy, pylab, wget |
License | GPL |
Example Usage | python queryPDB.py -q query.xml -f 10 |
Output | freq.csv, which is a comma seperated file containing various properties of the different proteins. The most important value is the frequency of the aminoacids Lysing (K) or Arganine (R). Which on is taken can be changed in the source code of queryPDB.py changing the R in freq = L.getFrequencies(proteins, 'R', 'freq.csv') .
|
query.xml
<orgPdbQuery> <version>head</version> <queryType>org.pdb.query.simple.ChainTypeQuery</queryType> <description>Chain Type: there is a Protein chain</description> <containsProtein>Y</containsProtein> <containsDna>?</containsDna> <containsRna>?</containsRna> <containsHybrid>?</containsHybrid> </orgPdbQuery>
PDB.py
#!/usr/bin/env python """ Module to read and process PDB data. (http://www.pdb.org) @author: Pascal Pieters @copyright: Copyright 2013, TU-Eindhoven iGEM Team @credits: TU-Eindhoven iGEM Team 2013 @license: GPL @version: 0.0.1 @maintainer: Pascal Pieters @email: p.a.pieters@student.tue.nl @status: Testing """ import urllib2 import os import pylab as P import numpy as N import csv import shutil class RemoteInstance: """General PDB RemoteInstance that can be used to retrieve PDB data""" def __init__(self): """Initiate the RemoteInstance""" self.searchURL = "http://www.rcsb.org/pdb/rest/search" self.query = "" self.result = "" return def loadQuery(self, filename): f = open(filename, 'r') self.query = ''.join(f.readlines()) f.close() return def setQuery(self,q): self.query = q return def executeQuery(self): req = urllib2.Request(self.searchURL, data=self.query) r = urllib2.urlopen(req) self.result = r.read() self.result = self.result.split('\n')[0:-1] return len(self.result) def getBasicInformation(self, filename): u = ["http://www.rcsb.org/pdb/rest/customReport?pdbids=", "&customReportColumns=structureId,chainId,entityId,sequence,chainLength,db_id,db_name,molecularWeight,secondaryStructure,entityMacromoleculeType&service=wsdisplay&format=csv"] ft = open(filename, 'w') for i in range(0, len(self.result), 100): print "# Downloading %d-%d" % (i, i+100) url = u[0] + (', '.join((self.result[i:i+100] if i+100 < len(self.result) else self.result[i:]))) + u[1] os.system(("wget -q -O out.csv \"%s\"" % url)) f = open('out.csv', 'r') data = ''.join(f.readlines()) print "# - Data Size: %d" % len(data) f.close() os.remove('out.csv') data = data.replace('<br />', '\n') if i > 0: [temp, data] = data.split('\n',1) ft.write(data) ft.close() return class LocalInstance: def __init__(self, path): self.filename = path return def filterBasics(self, minChainLength=1): fin = open(self.filename, 'r') fout = open("temp.csv", 'w') dh = ['chainLength'] headertext = fin.readline() fout.write(headertext) header = headertext.strip().split(',') d = {i:header.index(i) for i in dh} print "Chain Length: %d" % tuple(d.values()) for line in fin: l = line.strip().split(',') l = [i.strip('"') for i in l] if int(l[d['chainLength']]) > minChainLength: fout.write(line) fout.close() fin.close() os.remove(self.filename) shutil.move("temp.csv", self.filename) return def getProteinSequences(self): f = open(self.filename, 'r') p = {} dh = ['structureId', 'chainId', 'entityId', 'sequence', 'entityMacromoleculeType'] header = f.readline().strip().split(',') d = {i:header.index(i) for i in dh} print "# Parsing '%s'\n# - Header: " % (self.filename), for i in d.keys(): print "%s: %d" % (i, d[i]), print " " for line in f: l = line.strip().split(',') l = [i.strip('"') for i in l] if 'Polypeptide' in l[d['entityMacromoleculeType']]: p[(l[d['structureId']], l[d['chainId']], int(l[d['entityId']]))] = l[d['sequence']] f.close() return p def getFrequencies(self, p, aa, filename): f = {} max_cnt = 0; max_rat = 0 for k, v in p.iteritems(): cnt = v.count(aa) lev = len(v) rat = float(cnt)/float(lev) f[k] = (cnt, lev, rat) max_cnt = max(max_cnt, cnt) max_rat = max(max_rat, rat) f = {k:(v[0], v[1], v[2], (float(v[0])/float(max_cnt)), (float(v[2])/float(max_rat)), (((float(v[0])/float(max_cnt))**2)+((float(v[2])/float(max_rat))**2))) for k,v in f.iteritems()} self.writeToCSV(filename, ['structureId', 'chainId', 'entityId', 'lysineCount', 'chainLength', 'lysineRatio', 'normalizedLysineCount', 'normalizedLysineRatio', 'suitabilityIndexA'], [[k[0], k[1], k[2], v[0], v[1], v[2], v[3], v[4], v[5]] for k,v in f.iteritems()]) return f def plotFrequencies(self, f): x = []; y = []; d = []; keys = [] for k,v in f.iteritems(): x.append(v[2]) y.append(float(v[0])) d.append(v[5]) keys.append(k) npd = N.array(d) r_mean = N.mean(npd) r_std = N.std(npd) del npd xs = []; ys = []; ds = []; keyss = [] t = [] i = 0; while i < len(x): if d[i] > (r_mean+(3*r_std)): ds.append(d.pop(i)) xs.append(x.pop(i)) ys.append(y.pop(i)) keyss.append(keys.pop(i)) t.append((xs[-1], ys[-1], ds[-1], keyss[-1])) else: i += 1 del i fig = P.figure(1) ax = fig.add_subplot(111) P.scatter(x,y) P.scatter(xs, ys, c='r') P.ylabel('Lysine Count') P.xlabel('Lysine Ratio') t = sorted(t, key=lambda st: st[2]) P.show() def writeToCSV(self, filename, header, data): with open(filename, 'wb') as csvfile: cwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) cwriter.writerow(header) for i in data: cwriter.writerow(i) return
queryPDB.py
#!/usr/bin/env python """ Script to execute a simple PDB query @author: Pascal Pieters @copyright: Copyright 2013, TU-Eindhoven iGEM Team @credits: TU-Eindhoven iGEM Team 2013 @license: GPL @version: 0.0.1 @maintainer: Pascal Pieters @email: p.a.pieters@student.tue.nl @status: Testing """ import PDB import getopt, sys if __name__ == "__main__": opts, args = getopt.getopt(sys.argv[1:], "q:f:pv", ["query=", "filterlength=", "plot"]) query = "" filterlength = 0 plot = False verbose = False for o, a in opts: if o == "-v": verbose = True elif o in ("-q", "--query"): query = (a if a else "query.xml") elif o in ("-f", "--filterlength"): filterlength = (int(a) if a else 10) elif o in ("-p", "--plot"): plot = True else: assert False, "unhandled option" if query: P = PDB.RemoteInstance() P.loadQuery("query.xml") if verbose: print "Structures Found: ", P.executeQuery() else: P.executeQuery() P.getBasicInformation("basic.csv") if filterlength or plot: L = PDB.LocalInstance("basic.csv") if filterlength: L.filterBasics(minChainLength=filterlength) if plot: proteins = L.getProteinSequences() if verbose: for k,v in proteins.iteritems(): print k, ": ", v freq = L.getFrequencies(proteins, 'R', 'freq.csv') L.plotFrequencies(freq)