Team:TU-Eindhoven/Code:PDBQuery
From 2013.igem.org
(Difference between revisions)
Pascalaldo (Talk | contribs) |
Pascalaldo (Talk | contribs) (→PDB Querying) |
||
Line 13: | Line 13: | ||
<containsHybrid>?</containsHybrid> | <containsHybrid>?</containsHybrid> | ||
</orgPdbQuery> | </orgPdbQuery> | ||
+ | </nowiki>{{:Team:TU-Eindhoven/Template:CodeEnd }} | ||
+ | ===PDB.py=== | ||
+ | {{:Team:TU-Eindhoven/Template:Code }}<nowiki> | ||
+ | #!/usr/bin/env python | ||
+ | |||
+ | """ | ||
+ | Module to read and process PDB data. (http://www.pdb.org) | ||
+ | |||
+ | @author: Pascal Pieters | ||
+ | @copyright: Copyright 2013, TU-Eindhoven iGEM Team | ||
+ | @credits: TU-Eindhoven iGEM Team 2013 | ||
+ | @license: GPL | ||
+ | @version: 0.0.1 | ||
+ | @maintainer: Pascal Pieters | ||
+ | @email: p.a.pieters@student.tue.nl | ||
+ | @status: Testing | ||
+ | """ | ||
+ | |||
+ | import urllib2 | ||
+ | import os | ||
+ | import pylab as P | ||
+ | import numpy as N | ||
+ | import csv | ||
+ | import shutil | ||
+ | |||
+ | class RemoteInstance: | ||
+ | """General PDB RemoteInstance that can be used to retrieve PDB data""" | ||
+ | def __init__(self): | ||
+ | """Initiate the RemoteInstance""" | ||
+ | self.searchURL = "http://www.rcsb.org/pdb/rest/search" | ||
+ | self.query = "" | ||
+ | self.result = "" | ||
+ | return | ||
+ | |||
+ | def loadQuery(self, filename): | ||
+ | f = open(filename, 'r') | ||
+ | self.query = ''.join(f.readlines()) | ||
+ | f.close() | ||
+ | return | ||
+ | |||
+ | def setQuery(self,q): | ||
+ | self.query = q | ||
+ | return | ||
+ | |||
+ | def executeQuery(self): | ||
+ | req = urllib2.Request(self.searchURL, data=self.query) | ||
+ | r = urllib2.urlopen(req) | ||
+ | self.result = r.read() | ||
+ | self.result = self.result.split('\n')[0:-1] | ||
+ | return len(self.result) | ||
+ | |||
+ | def getBasicInformation(self, filename): | ||
+ | u = ["http://www.rcsb.org/pdb/rest/customReport?pdbids=", "&customReportColumns=structureId,chainId,entityId,sequence,chainLength,db_id,db_name,molecularWeight,secondaryStructure,entityMacromoleculeType&service=wsdisplay&format=csv"] | ||
+ | ft = open(filename, 'w') | ||
+ | |||
+ | for i in range(0, len(self.result), 100): | ||
+ | print "# Downloading %d-%d" % (i, i+100) | ||
+ | url = u[0] + (', '.join((self.result[i:i+100] if i+100 < len(self.result) else self.result[i:]))) + u[1] | ||
+ | os.system(("wget -q -O out.csv \"%s\"" % url)) | ||
+ | |||
+ | f = open('out.csv', 'r') | ||
+ | data = ''.join(f.readlines()) | ||
+ | print "# - Data Size: %d" % len(data) | ||
+ | f.close() | ||
+ | os.remove('out.csv') | ||
+ | |||
+ | data = data.replace('<br />', '\n') | ||
+ | if i > 0: | ||
+ | [temp, data] = data.split('\n',1) | ||
+ | ft.write(data) | ||
+ | ft.close() | ||
+ | return | ||
+ | |||
+ | |||
+ | class LocalInstance: | ||
+ | def __init__(self, path): | ||
+ | self.filename = path | ||
+ | return | ||
+ | |||
+ | def filterBasics(self, minChainLength=1): | ||
+ | fin = open(self.filename, 'r') | ||
+ | fout = open("temp.csv", 'w') | ||
+ | |||
+ | dh = ['chainLength'] | ||
+ | headertext = fin.readline() | ||
+ | fout.write(headertext) | ||
+ | header = headertext.strip().split(',') | ||
+ | d = {i:header.index(i) for i in dh} | ||
+ | print "Chain Length: %d" % tuple(d.values()) | ||
+ | |||
+ | for line in fin: | ||
+ | l = line.strip().split(',') | ||
+ | l = [i.strip('"') for i in l] | ||
+ | if int(l[d['chainLength']]) > minChainLength: | ||
+ | fout.write(line) | ||
+ | |||
+ | fout.close() | ||
+ | fin.close() | ||
+ | os.remove(self.filename) | ||
+ | shutil.move("temp.csv", self.filename) | ||
+ | return | ||
+ | |||
+ | def getProteinSequences(self): | ||
+ | f = open(self.filename, 'r') | ||
+ | p = {} | ||
+ | |||
+ | dh = ['structureId', 'chainId', 'entityId', 'sequence', 'entityMacromoleculeType'] | ||
+ | header = f.readline().strip().split(',') | ||
+ | d = {i:header.index(i) for i in dh} | ||
+ | print "# Parsing '%s'\n# - Header: " % (self.filename), | ||
+ | for i in d.keys(): | ||
+ | print "%s: %d" % (i, d[i]), | ||
+ | print " " | ||
+ | |||
+ | for line in f: | ||
+ | l = line.strip().split(',') | ||
+ | l = [i.strip('"') for i in l] | ||
+ | if 'Polypeptide' in l[d['entityMacromoleculeType']]: | ||
+ | p[(l[d['structureId']], l[d['chainId']], int(l[d['entityId']]))] = l[d['sequence']] | ||
+ | |||
+ | f.close() | ||
+ | return p | ||
+ | |||
+ | def getFrequencies(self, p, aa, filename): | ||
+ | f = {} | ||
+ | max_cnt = 0; max_rat = 0 | ||
+ | for k, v in p.iteritems(): | ||
+ | cnt = v.count(aa) | ||
+ | lev = len(v) | ||
+ | rat = float(cnt)/float(lev) | ||
+ | f[k] = (cnt, lev, rat) | ||
+ | max_cnt = max(max_cnt, cnt) | ||
+ | max_rat = max(max_rat, rat) | ||
+ | |||
+ | f = {k:(v[0], v[1], v[2], (float(v[0])/float(max_cnt)), (float(v[2])/float(max_rat)), (((float(v[0])/float(max_cnt))**2)+((float(v[2])/float(max_rat))**2))) for k,v in f.iteritems()} | ||
+ | |||
+ | self.writeToCSV(filename, ['structureId', 'chainId', 'entityId', 'lysineCount', 'chainLength', 'lysineRatio', 'normalizedLysineCount', 'normalizedLysineRatio', 'suitabilityIndexA'], [[k[0], k[1], k[2], v[0], v[1], v[2], v[3], v[4], v[5]] for k,v in f.iteritems()]) | ||
+ | |||
+ | return f | ||
+ | |||
+ | def plotFrequencies(self, f): | ||
+ | x = []; y = []; d = []; keys = [] | ||
+ | for k,v in f.iteritems(): | ||
+ | x.append(v[2]) | ||
+ | y.append(float(v[0])) | ||
+ | d.append(v[5]) | ||
+ | keys.append(k) | ||
+ | |||
+ | npd = N.array(d) | ||
+ | r_mean = N.mean(npd) | ||
+ | r_std = N.std(npd) | ||
+ | del npd | ||
+ | |||
+ | xs = []; ys = []; ds = []; keyss = [] | ||
+ | t = [] | ||
+ | i = 0; | ||
+ | while i < len(x): | ||
+ | if d[i] > (r_mean+(3*r_std)): | ||
+ | ds.append(d.pop(i)) | ||
+ | xs.append(x.pop(i)) | ||
+ | ys.append(y.pop(i)) | ||
+ | keyss.append(keys.pop(i)) | ||
+ | t.append((xs[-1], ys[-1], ds[-1], keyss[-1])) | ||
+ | else: | ||
+ | i += 1 | ||
+ | del i | ||
+ | |||
+ | fig = P.figure(1) | ||
+ | ax = fig.add_subplot(111) | ||
+ | |||
+ | P.scatter(x,y) | ||
+ | P.scatter(xs, ys, c='r') | ||
+ | |||
+ | P.ylabel('Lysine Count') | ||
+ | P.xlabel('Lysine Ratio') | ||
+ | |||
+ | t = sorted(t, key=lambda st: st[2]) | ||
+ | |||
+ | #for i in t: | ||
+ | #ax.annotate(keyss[i], xy=(xs[i], ys[i]), xycoords='data', size=8) | ||
+ | # print i[3], ": Lysine Ratio=", i[0], ", Lysine Count=", i[1], ", Suitability Index A=", i[2] | ||
+ | |||
+ | P.show() | ||
+ | |||
+ | def writeToCSV(self, filename, header, data): | ||
+ | with open(filename, 'wb') as csvfile: | ||
+ | cwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) | ||
+ | cwriter.writerow(header) | ||
+ | for i in data: | ||
+ | cwriter.writerow(i) | ||
+ | return | ||
</nowiki>{{:Team:TU-Eindhoven/Template:CodeEnd }} | </nowiki>{{:Team:TU-Eindhoven/Template:CodeEnd }} | ||
Revision as of 12:05, 2 July 2013
PDB Querying
query.xml
<orgPdbQuery> <version>head</version> <queryType>org.pdb.query.simple.ChainTypeQuery</queryType> <description>Chain Type: there is a Protein chain</description> <containsProtein>Y</containsProtein> <containsDna>?</containsDna> <containsRna>?</containsRna> <containsHybrid>?</containsHybrid> </orgPdbQuery>
PDB.py
#!/usr/bin/env python """ Module to read and process PDB data. (http://www.pdb.org) @author: Pascal Pieters @copyright: Copyright 2013, TU-Eindhoven iGEM Team @credits: TU-Eindhoven iGEM Team 2013 @license: GPL @version: 0.0.1 @maintainer: Pascal Pieters @email: p.a.pieters@student.tue.nl @status: Testing """ import urllib2 import os import pylab as P import numpy as N import csv import shutil class RemoteInstance: """General PDB RemoteInstance that can be used to retrieve PDB data""" def __init__(self): """Initiate the RemoteInstance""" self.searchURL = "http://www.rcsb.org/pdb/rest/search" self.query = "" self.result = "" return def loadQuery(self, filename): f = open(filename, 'r') self.query = ''.join(f.readlines()) f.close() return def setQuery(self,q): self.query = q return def executeQuery(self): req = urllib2.Request(self.searchURL, data=self.query) r = urllib2.urlopen(req) self.result = r.read() self.result = self.result.split('\n')[0:-1] return len(self.result) def getBasicInformation(self, filename): u = ["http://www.rcsb.org/pdb/rest/customReport?pdbids=", "&customReportColumns=structureId,chainId,entityId,sequence,chainLength,db_id,db_name,molecularWeight,secondaryStructure,entityMacromoleculeType&service=wsdisplay&format=csv"] ft = open(filename, 'w') for i in range(0, len(self.result), 100): print "# Downloading %d-%d" % (i, i+100) url = u[0] + (', '.join((self.result[i:i+100] if i+100 < len(self.result) else self.result[i:]))) + u[1] os.system(("wget -q -O out.csv \"%s\"" % url)) f = open('out.csv', 'r') data = ''.join(f.readlines()) print "# - Data Size: %d" % len(data) f.close() os.remove('out.csv') data = data.replace('<br />', '\n') if i > 0: [temp, data] = data.split('\n',1) ft.write(data) ft.close() return class LocalInstance: def __init__(self, path): self.filename = path return def filterBasics(self, minChainLength=1): fin = open(self.filename, 'r') fout = open("temp.csv", 'w') dh = ['chainLength'] headertext = fin.readline() fout.write(headertext) header = headertext.strip().split(',') d = {i:header.index(i) for i in dh} print "Chain Length: %d" % tuple(d.values()) for line in fin: l = line.strip().split(',') l = [i.strip('"') for i in l] if int(l[d['chainLength']]) > minChainLength: fout.write(line) fout.close() fin.close() os.remove(self.filename) shutil.move("temp.csv", self.filename) return def getProteinSequences(self): f = open(self.filename, 'r') p = {} dh = ['structureId', 'chainId', 'entityId', 'sequence', 'entityMacromoleculeType'] header = f.readline().strip().split(',') d = {i:header.index(i) for i in dh} print "# Parsing '%s'\n# - Header: " % (self.filename), for i in d.keys(): print "%s: %d" % (i, d[i]), print " " for line in f: l = line.strip().split(',') l = [i.strip('"') for i in l] if 'Polypeptide' in l[d['entityMacromoleculeType']]: p[(l[d['structureId']], l[d['chainId']], int(l[d['entityId']]))] = l[d['sequence']] f.close() return p def getFrequencies(self, p, aa, filename): f = {} max_cnt = 0; max_rat = 0 for k, v in p.iteritems(): cnt = v.count(aa) lev = len(v) rat = float(cnt)/float(lev) f[k] = (cnt, lev, rat) max_cnt = max(max_cnt, cnt) max_rat = max(max_rat, rat) f = {k:(v[0], v[1], v[2], (float(v[0])/float(max_cnt)), (float(v[2])/float(max_rat)), (((float(v[0])/float(max_cnt))**2)+((float(v[2])/float(max_rat))**2))) for k,v in f.iteritems()} self.writeToCSV(filename, ['structureId', 'chainId', 'entityId', 'lysineCount', 'chainLength', 'lysineRatio', 'normalizedLysineCount', 'normalizedLysineRatio', 'suitabilityIndexA'], [[k[0], k[1], k[2], v[0], v[1], v[2], v[3], v[4], v[5]] for k,v in f.iteritems()]) return f def plotFrequencies(self, f): x = []; y = []; d = []; keys = [] for k,v in f.iteritems(): x.append(v[2]) y.append(float(v[0])) d.append(v[5]) keys.append(k) npd = N.array(d) r_mean = N.mean(npd) r_std = N.std(npd) del npd xs = []; ys = []; ds = []; keyss = [] t = [] i = 0; while i < len(x): if d[i] > (r_mean+(3*r_std)): ds.append(d.pop(i)) xs.append(x.pop(i)) ys.append(y.pop(i)) keyss.append(keys.pop(i)) t.append((xs[-1], ys[-1], ds[-1], keyss[-1])) else: i += 1 del i fig = P.figure(1) ax = fig.add_subplot(111) P.scatter(x,y) P.scatter(xs, ys, c='r') P.ylabel('Lysine Count') P.xlabel('Lysine Ratio') t = sorted(t, key=lambda st: st[2]) #for i in t: #ax.annotate(keyss[i], xy=(xs[i], ys[i]), xycoords='data', size=8) # print i[3], ": Lysine Ratio=", i[0], ", Lysine Count=", i[1], ", Suitability Index A=", i[2] P.show() def writeToCSV(self, filename, header, data): with open(filename, 'wb') as csvfile: cwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) cwriter.writerow(header) for i in data: cwriter.writerow(i) return