OpenStructure
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
blast.py
Go to the documentation of this file.
1 from ost import settings
2 import subprocess
3 from xml.dom import minidom
4 from ost import io, seq
5 import ost
6 import os, re, sys
7 sys.path.append('PYMODULES')
8 
9 
11  """
12  An aligned patch, aka. HSP
13 
14  .. attribute:: aln
15 
16  The local alignment. Sequence offset of both sequences in the alignment are
17  set to the starting position in the query and target sequence, respectively.
18 
19  :type: :class:`~ost.seq.AlignmentHandle`
20 
21  .. attribute:: bit_score
22 
23  The bit score of the HSP
24 
25  .. attribute:: score
26 
27  The score of the HSP
28 
29  .. attribute:: evalue
30 
31  The E-value of the HSP
32 
33  .. attribute:: seqid
34 
35  The sequence identity of the HSP
36  """
37  def __init__(self, aln, bit_score, score, evalue, seqid):
38  self.aln=aln
39  self.bit_score=bit_score
40  self.score=score
41  self.evalue=evalue
42  self.seqid=seqid
43 
44 class BlastHit:
45  """
46  A positive match found by BLAST.
47 
48  Each blast hit consists of one or more HSPs, encoded by the
49  :class:`AlignedPatch` class.
50 
51  .. attribute:: identifier
52 
53  The identifier of the matching sequence
54 
55  .. attribute:: aligned_patches
56 
57  list of :class:`AlignedPatch` instances holding the actual HSPs.
58  """
59  def __init__(self, identifier, aligned_patches):
60  self.identifier=identifier
61  self.aligned_patches=aligned_patches
62 
63 
64 def ParseBlastOutput(string, seqid_thres=0, evalue_thres=float("infinity")):
65  """
66  Parses the blast output and returns a list of BlastHits
67  setting no seqid_thres or evalue_thres, restores default behaviour without filtering
68  """
69  def _GetText(node):
70  rc=''
71  for child in node.childNodes:
72  if child.nodeType==child.TEXT_NODE:
73  rc+=child.data
74  return rc
75 
76  def _GetValue(node, tag_name):
77  tag=node.getElementsByTagName(tag_name)
78  assert len(tag)==1
79  return _GetText(tag[0])
80 
81  def _GetInt(node, tag_name):
82  return int(_GetValue(node, tag_name))
83 
84  def _ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres=0, evalue_thres=float("infinity")):
85  bit_score=float(_GetValue(hsp, 'Hsp_bit-score'))
86  score=float(_GetValue(hsp, 'Hsp_score'))
87  evalue=float(_GetValue(hsp, 'Hsp_evalue'))
88  identity=float(_GetValue(hsp, 'Hsp_identity'))
89  hsp_align_len=float(_GetValue(hsp, 'Hsp_align-len'))
90  seqid=identity/hsp_align_len
91  query_offset=_GetInt(hsp, 'Hsp_query-from')-1
92  hit_offset=_GetInt(hsp, 'Hsp_hit-from')-1
93  query_seq=seq.CreateSequence(str(query_id), str(_GetValue(hsp, 'Hsp_qseq')))
94  query_seq.offset=query_offset
95  hit_seq=seq.CreateSequence(str(hit_id), str(_GetValue(hsp, 'Hsp_hseq')))
96  hit_seq.offset=hit_offset
97  try:
98  if seqid > float(seqid_thres) and evalue < evalue_thres:
99  aln=seq.CreateAlignment(query_seq, hit_seq)
100  return AlignedPatch(aln, bit_score, score, evalue, seqid)
101 
102  except Exception, e:
103  print str(e), query_seq, hit_seq
104 
105  try:
106  doc=minidom.parseString(string)
107  except Exception, e:
108  ost.LogError('Error while parsing BLAST output: %s' % str(e))
109  return None
110  hits=[]
111  query_id=_GetValue(doc, 'BlastOutput_query-def')
112  tot_query_len=_GetValue(doc, 'BlastOutput_query-len')
113  for hit in doc.getElementsByTagName('Hit'):
114  hit_id=_GetValue(hit, 'Hit_def')
115  hsps=hit.getElementsByTagName('Hsp')
116  aligned_patches=[_ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres, evalue_thres) for hsp in hsps]
117  hits.append(BlastHit(hit_id, aligned_patches))
118  return hits
119 
120 
121 
122 class BlastError(RuntimeError):
123  def __init__(self, brief, details):
124  self.brief=brief
125  self.details=details
126 
127  def __str__(self):
128  if self.details:
129  return '%s\n%s' % (self.brief, self.details)
130  else:
131  return self.brief
132 
133 def CreateDB(infasta, dbout, mkdb_cmd=None):
134  """
135  Create a blast DB from a fasta file
136 
137  :param infasta: the pdb fasta from which the database will be created
138  :type infasta: :class:`string`
139 
140  :param dbout: output location for blastDB file
141  :type infasta: :class:`string`
142 
143 
144  """
145  if mkdb_cmd==None:
146  try:
147  exe=settings.Locate('formatdb')
148  args=[exe, '-i', infasta, '-v',str(10000),'-n',dbout]
149  except:
150  try:
151  exe=settings.Locate('makeblastdb')
152  args=[exe, '-in', infasta, ' -max_file_sz 10GB -out', dbout,'-dbtype','prot']
153 
154  except:
155  raise RuntimeError('could not find makeblastdb/formatdb executable')
156  else:
157  if os.path.basename(mkdb_cmd)=='makeblastdb':
158  exe=settings.Locate('makeblastdb',explicit_file_name=mkdb_cmd)
159  args=[exe, '-in', infasta, ' -max_file_sz 10GB -out', dbout,'-dbtype','prot']
160  elif os.path.basename(mkdb_cmd)=='formatdb':
161  exe=settings.Locate('formatdb',explicit_filename=mkdb_cmd)
162  args=[exe, '-i', infasta, '-v',str(10000),'-n',dbout]
163  else:
164  raise IOError('mkdb command must either be the path to formatdb or makeblastdb!')
165 
166  cmd=' '.join(args)
167  ost.LogInfo('creating blast DB (%s)' % cmd)
168  os.system(cmd)
169 
170 def BlastVersion(blast_location=None):
171  """
172  Returns the version of the BLAST executable, e.g. 2.2.24 as a string
173  """
174 
175  try:
176  blast_exe=settings.Locate('blastall',explicit_file_name=blast_location)
177  except:
178  try:
179  blast_exe=settings.Locate('blastp', explicit_file_name=blast_location)
180  except:
181  raise RuntimeError('could not find blast executable')
182 
183  if os.path.basename(blast_exe)=='blastall':
184  args=[blast_exe]
185  pattern=re.compile(r'blastall (\d+\.\d+\.\d+)\s+arguments:\s*')
186 
187  else:
188  args=[blast_exe, '-version']
189  pattern=re.compile(r'Package: blast (\d+\.\d+\.\d+),\s+')
190 
191  blast_pipe=subprocess.Popen(args, stdout=subprocess.PIPE,
192  stderr=subprocess.PIPE)
193  lines=blast_pipe.stdout.readlines()
194 
195  for line in lines:
196  m=pattern.match(line)
197  if m:
198  return m.group(1)
199  raise IOError("could not determine blast version for '%s'" % blast_exe)
200 
201 
202 
203 def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62',
204  blast_location=None, outfmt=0, filter_low_complexity=True):
205  """
206  Runs a protein vs. protein blast search. The results are returned as a
207  list of :class:`BlastHit` instances.
208 
209  :param query: the query sequence
210  :type query: :class:`seq.ConstSequenceHandle`
211 
212  :param database: The filename of the sequence database. Make sure that
213  formatdb has been run on the database and the <database>.pin file exists.
214  :param matrix: The substitution matrix to be used. Must be one of 'BLOSUM45',
215  'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70'.
216  :param gap_open: Gap opening penalty. Note that only a subset of gap opening
217  penalties is supported for each substitutition matrix. Consult the blast
218  docs for more information.
219  :param gap_ext: Gap extension penalty. Only a subset of gap extension
220  penalties are supported for each of the substitution matrices. Consult the
221  blast docs for more information.
222  :param outfmt: output format, where '0' corresponds to default output (parsed blast output and 1 to raw output)
223  :param filter_low_complexity: Mask off segments of the query sequence that
224  have low compositional complexity, as determined by the SEG program of
225  Wootton & Federhen (Computers and Chemistry, 1993)
226  """
227  subst_mats=('BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70',)
228  if matrix not in subst_mats:
229  raise ValueError('matrix must be one of %s' % ', '.join(subst_mats))
230  if not os.path.exists('%s.pin' % database) and not os.path.exists('%s.pal' % database):
231  raise IOError("Database %s does not exist" % database)
232  if blast_location!=None and not os.path.exists(blast_location):
233  ost.LogScript('Could not find %s' %blast_location)
234 
235  if blast_location==None:
236  try:
237  blast_exe=settings.Locate('blastall')
238  except:
239  try:
240  blast_exe=settings.Locate('blastp')
241  except:
242  raise RuntimeError('could not find blast executable')
243  else:
244  blast_exe=settings.Locate(os.path.basename(blast_location),explicit_file_name=blast_location)
245 
246  if os.path.basename(blast_exe)=='blastall':
247  args=[blast_exe, '-d', database, '-p', 'blastp',
248  '-m', '7', '-M', matrix, '-G', str(gap_open), '-E', str(gap_ext)]
249  if filter_low_complexity==False:
250  args.append('-F')
251  args.append('F')
252 
253  else:
254  complexity_opt='-seg'
255  if filter_low_complexity==True:
256  complexity_arg='yes'
257  else:
258  complexity_arg='no'
259  args=[blast_exe, '-db', database, '-matrix', matrix,
260  '-gapopen', str(gap_open), '-gapextend', str(gap_ext), '-outfmt', '5', complexity_opt, complexity_arg ]
261 
262  ost.LogInfo('running BLAST (%s)' % ' '.join(args))
263  blast_pipe=subprocess.Popen(args, stderr=subprocess.PIPE,
264  stdout=subprocess.PIPE, stdin=subprocess.PIPE)
265  if isinstance(query, str):
266  stdout, stderr=blast_pipe.communicate(query)
267  else:
268  stdout, stderr=blast_pipe.communicate(io.SequenceToString(query, 'fasta'))
269 
270  if len(stderr)>0:
271  pattern=re.compile(r'^\[.*\]\s+ERROR:\s+(.*)')
272  lines=stderr.split('\n')
273  error_message=pattern.match(lines[0])
274  if error_message:
275  raise BlastError(error_message.group(1), '\n'.join(lines[1:]))
276  if outfmt==0:
277  return ParseBlastOutput(stdout)
278  else:
279  return stdout