Team:TU-Eindhoven/Code:PDBQuery

From 2013.igem.org

Revision as of 12:15, 2 July 2013 by Pascalaldo (Talk | contribs)

Contents

PDB Querying

General Information

Dependencies Python, numpy, pylab, wget
License GPL
Example Usage python queryPDB.py -q query.xml -f 10

query.xml

<orgPdbQuery>
    <version>head</version>
    <queryType>org.pdb.query.simple.ChainTypeQuery</queryType>
    <description>Chain Type: there is a Protein chain</description>
    <containsProtein>Y</containsProtein>
    <containsDna>?</containsDna>
    <containsRna>?</containsRna>
    <containsHybrid>?</containsHybrid>
 </orgPdbQuery>

PDB.py

#!/usr/bin/env python

"""
Module to read and process PDB data. (http://www.pdb.org)

@author: Pascal Pieters
@copyright: Copyright 2013, TU-Eindhoven iGEM Team
@credits: TU-Eindhoven iGEM Team 2013
@license: GPL
@version: 0.0.1
@maintainer: Pascal Pieters
@email: p.a.pieters@student.tue.nl
@status: Testing
"""

import urllib2
import os
import pylab as P
import numpy as N
import csv
import shutil

class RemoteInstance:
	"""General PDB RemoteInstance that can be used to retrieve PDB data"""
	def __init__(self):
		"""Initiate the RemoteInstance"""
		self.searchURL = "http://www.rcsb.org/pdb/rest/search"
		self.query = ""
		self.result = ""
		return
	
	def loadQuery(self, filename):
		f = open(filename, 'r')
		self.query = ''.join(f.readlines())
		f.close()
		return
	
	def setQuery(self,q):
		self.query = q
		return
	
	def executeQuery(self):
		req = urllib2.Request(self.searchURL, data=self.query)
		r = urllib2.urlopen(req)
		self.result = r.read()
		self.result = self.result.split('\n')[0:-1]
		return len(self.result)
	
	def getBasicInformation(self, filename):
		u = ["http://www.rcsb.org/pdb/rest/customReport?pdbids=", "&customReportColumns=structureId,chainId,entityId,sequence,chainLength,db_id,db_name,molecularWeight,secondaryStructure,entityMacromoleculeType&service=wsdisplay&format=csv"]
		ft = open(filename, 'w')
		
		for i in range(0, len(self.result), 100):
			print "# Downloading %d-%d" % (i, i+100)
			url = u[0] + (', '.join((self.result[i:i+100] if i+100 < len(self.result) else self.result[i:]))) + u[1]
			os.system(("wget -q -O out.csv \"%s\"" % url))
		
			f = open('out.csv', 'r')
			data = ''.join(f.readlines())
			print "# - Data Size: %d" % len(data)
			f.close()
			os.remove('out.csv')
		
			data = data.replace('<br />', '\n')
			if i > 0:
				[temp, data] = data.split('\n',1)
			ft.write(data)
		ft.close()
		return
	

class LocalInstance:
	def __init__(self, path):
		self.filename = path
		return
	
	def filterBasics(self, minChainLength=1):
		fin = open(self.filename, 'r')
		fout = open("temp.csv", 'w')
		
		dh = ['chainLength']
		headertext = fin.readline()
		fout.write(headertext)
		header = headertext.strip().split(',')
		d = {i:header.index(i) for i in dh}
		print "Chain Length: %d" % tuple(d.values())
				
		for line in fin:
			l = line.strip().split(',')
			l = [i.strip('"') for i in l]
			if int(l[d['chainLength']]) > minChainLength:
				fout.write(line)
		
		fout.close()
		fin.close()
		os.remove(self.filename)
		shutil.move("temp.csv", self.filename)
		return
	
	def getProteinSequences(self):
		f = open(self.filename, 'r')
		p = {}
		
		dh = ['structureId', 'chainId', 'entityId', 'sequence', 'entityMacromoleculeType']
		header = f.readline().strip().split(',')
		d = {i:header.index(i) for i in dh}
		print "# Parsing '%s'\n# - Header: " % (self.filename),
		for i in d.keys():
			print "%s: %d" % (i, d[i]),
		print " "		

		for line in f:
			l = line.strip().split(',')
			l = [i.strip('"') for i in l]
			if 'Polypeptide' in l[d['entityMacromoleculeType']]:
				p[(l[d['structureId']], l[d['chainId']], int(l[d['entityId']]))] = l[d['sequence']]
		
		f.close()
		return p
	
	def getFrequencies(self, p, aa, filename):
		f = {}
		max_cnt = 0; max_rat = 0
		for k, v in p.iteritems():
			cnt = v.count(aa)
			lev = len(v)
			rat = float(cnt)/float(lev)
			f[k] = (cnt, lev, rat)
			max_cnt = max(max_cnt, cnt)
			max_rat = max(max_rat, rat)
		
		f = {k:(v[0], v[1], v[2], (float(v[0])/float(max_cnt)), (float(v[2])/float(max_rat)), (((float(v[0])/float(max_cnt))**2)+((float(v[2])/float(max_rat))**2))) for k,v in f.iteritems()}
		
		self.writeToCSV(filename, ['structureId', 'chainId', 'entityId', 'lysineCount', 'chainLength', 'lysineRatio', 'normalizedLysineCount', 'normalizedLysineRatio', 'suitabilityIndexA'], [[k[0], k[1], k[2], v[0], v[1], v[2], v[3], v[4], v[5]] for k,v in f.iteritems()])

		return f
	
	def plotFrequencies(self, f):
		x = []; y = []; d = []; keys = []
		for k,v in f.iteritems():
			x.append(v[2])
			y.append(float(v[0]))
			d.append(v[5])
			keys.append(k)
		
		npd = N.array(d)
		r_mean = N.mean(npd)
		r_std = N.std(npd)
		del npd
		
		xs = []; ys = []; ds = []; keyss = []
		t = []
		i = 0;
		while i < len(x):
			if d[i] > (r_mean+(3*r_std)):
				ds.append(d.pop(i))
				xs.append(x.pop(i))
				ys.append(y.pop(i))
				keyss.append(keys.pop(i))
				t.append((xs[-1], ys[-1], ds[-1], keyss[-1]))
			else:
				i += 1
		del i
		
		fig = P.figure(1)
		ax = fig.add_subplot(111)
		
		P.scatter(x,y)
		P.scatter(xs, ys, c='r')
		
		P.ylabel('Lysine Count')
		P.xlabel('Lysine Ratio')
		
		t = sorted(t, key=lambda st: st[2])
		
		P.show()
	
	def writeToCSV(self, filename, header, data):
		with open(filename, 'wb') as csvfile:
			cwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
			cwriter.writerow(header)
			for i in data:
				cwriter.writerow(i)
		return

queryPDB.py

#!/usr/bin/env python

"""
Script to execute a simple PDB query

@author: Pascal Pieters
@copyright: Copyright 2013, TU-Eindhoven iGEM Team
@credits: TU-Eindhoven iGEM Team 2013
@license: GPL
@version: 0.0.1
@maintainer: Pascal Pieters
@email: p.a.pieters@student.tue.nl
@status: Testing
"""

import PDB
import getopt, sys

if __name__ == "__main__":
	opts, args = getopt.getopt(sys.argv[1:], "q:f:pv", ["query=", "filterlength=", "plot"])
	
	query = ""
	filterlength = 0
	plot = False
	verbose = False
	
	for o, a in opts:
		if o == "-v":
			verbose = True
		elif o in ("-q", "--query"):
			query = (a if a else "query.xml") 
		elif o in ("-f", "--filterlength"):
			filterlength = (int(a) if a else 10)
		elif o in ("-p", "--plot"):
			plot = True
		else:
			assert False, "unhandled option"
	
	if query:
		P = PDB.RemoteInstance()
		P.loadQuery("query.xml")
		if verbose: print "Structures Found: ", P.executeQuery()
		else: P.executeQuery()
		P.getBasicInformation("basic.csv")
	
	if filterlength or plot:
		L = PDB.LocalInstance("basic.csv")
	if filterlength:
		L.filterBasics(minChainLength=filterlength)
	if plot:
		proteins = L.getProteinSequences()
		if verbose:
			for k,v in proteins.iteritems():
				print k, ": ", v
		freq = L.getFrequencies(proteins, 'R', 'freq.csv')
		L.plotFrequencies(freq)