OpenStructure
blast.py
Go to the documentation of this file.
1 from ost import settings
2 import subprocess
3 from xml.dom import minidom
4 from ost import io, seq
5 import ost
6 import os, re, sys
7 
9  """
10  An aligned patch, aka. HSP
11 
12  .. attribute:: aln
13 
14  The local alignment. Sequence offset of both sequences in the alignment are
15  set to the starting position in the query and target sequence, respectively.
16 
17  :type: :class:`~ost.seq.AlignmentHandle`
18 
19  .. attribute:: bit_score
20 
21  The bit score of the HSP
22 
23  .. attribute:: score
24 
25  The score of the HSP
26 
27  .. attribute:: evalue
28 
29  The E-value of the HSP
30 
31  .. attribute:: seqid
32 
33  The sequence identity of the HSP
34  """
35  def __init__(self, aln, bit_score, score, evalue, seqid):
36  self.alnaln=aln
37  self.bit_scorebit_score=bit_score
38  self.scorescore=score
39  self.evalueevalue=evalue
40  self.seqidseqid=seqid
41 
42 class BlastHit:
43  """
44  A positive match found by BLAST.
45 
46  Each blast hit consists of one or more HSPs, encoded by the
47  :class:`AlignedPatch` class.
48 
49  .. attribute:: identifier
50 
51  The identifier of the matching sequence
52 
53  .. attribute:: aligned_patches
54 
55  list of :class:`AlignedPatch` instances holding the actual HSPs.
56  """
57  def __init__(self, identifier, aligned_patches):
58  self.identifieridentifier=identifier
59  self.aligned_patchesaligned_patches=aligned_patches
60 
61 
62 def ParseBlastOutput(string, seqid_thres=0, evalue_thres=float("infinity")):
63  """
64  Parses the blast output and returns a list of BlastHits
65  setting no seqid_thres or evalue_thres, restores default behaviour without filtering
66  """
67  def _GetText(node):
68  rc=''
69  for child in node.childNodes:
70  if child.nodeType==child.TEXT_NODE:
71  rc+=child.data
72  return rc
73 
74  def _GetValue(node, tag_name):
75  tag=node.getElementsByTagName(tag_name)
76  assert len(tag)==1
77  return _GetText(tag[0])
78 
79  def _GetInt(node, tag_name):
80  return int(_GetValue(node, tag_name))
81 
82  def _ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres=0, evalue_thres=float("infinity")):
83  bit_score=float(_GetValue(hsp, 'Hsp_bit-score'))
84  score=float(_GetValue(hsp, 'Hsp_score'))
85  evalue=float(_GetValue(hsp, 'Hsp_evalue'))
86  try:
87  identity=float(_GetValue(hsp, 'Hsp_identity'))
88  except AssertionError:
89  # The Hsp_identity tag is not a 'must' in the BLAST XML format. It
90  # describes the number of matching characters. Hence we assume, if it is
91  # missing, there are 0 matches.
92  identity=0
93  hsp_align_len=float(_GetValue(hsp, 'Hsp_align-len'))
94  seqid=identity/hsp_align_len
95  query_offset=_GetInt(hsp, 'Hsp_query-from')-1
96  hit_offset=_GetInt(hsp, 'Hsp_hit-from')-1
97  query_seq=seq.CreateSequence(str(query_id), str(_GetValue(hsp, 'Hsp_qseq')))
98  query_seq.offset=query_offset
99  hit_seq=seq.CreateSequence(str(hit_id), str(_GetValue(hsp, 'Hsp_hseq')))
100  hit_seq.offset=hit_offset
101  try:
102  if seqid > float(seqid_thres) and evalue < evalue_thres:
103  aln=seq.CreateAlignment(query_seq, hit_seq)
104  return AlignedPatch(aln, bit_score, score, evalue, seqid)
105 
106  except Exception as e:
107  print(str(e), query_seq, hit_seq)
108 
109  try:
110  doc=minidom.parseString(string)
111  except Exception as e:
112  ost.LogError('Error while parsing BLAST output: %s' % str(e))
113  return None
114  hits=[]
115  query_id=_GetValue(doc, 'BlastOutput_query-def')
116  tot_query_len=_GetValue(doc, 'BlastOutput_query-len')
117  for hit in doc.getElementsByTagName('Hit'):
118  hit_id=_GetValue(hit, 'Hit_def')
119  hsps=hit.getElementsByTagName('Hsp')
120  aligned_patches=[_ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres, evalue_thres) for hsp in hsps]
121  hits.append(BlastHit(hit_id, aligned_patches))
122  return hits
123 
124 
125 
126 class BlastError(RuntimeError):
127  def __init__(self, brief, details):
128  self.briefbrief=brief
129  self.detailsdetails=details
130 
131  def __str__(self):
132  if self.detailsdetails:
133  return '%s\n%s' % (self.briefbrief, self.detailsdetails)
134  else:
135  return self.briefbrief
136 
137 def CreateDB(infasta, dbout, mkdb_cmd=None):
138  """
139  Create a blast DB from a fasta file
140 
141  :param infasta: the pdb fasta from which the database will be created
142  :type infasta: :class:`string`
143 
144  :param dbout: output location for blastDB file
145  :type dbout: :class:`string`
146 
147 
148  """
149  if mkdb_cmd==None:
150  try:
151  exe=settings.Locate('makeblastdb')
152  args=[exe, '-in', infasta, '-out', dbout, '-dbtype', 'prot']
153  except:
154  try:
155  exe=settings.Locate('formatdb')
156  args=[exe, '-i', infasta, '-n', dbout]
157  except:
158  raise RuntimeError('could not find makeblastdb/formatdb executable')
159  else:
160  if os.path.basename(mkdb_cmd)=='makeblastdb':
161  exe=settings.Locate('makeblastdb',explicit_file_name=mkdb_cmd)
162  args=[exe, '-in', infasta, '-out', dbout, '-dbtype', 'prot']
163  elif os.path.basename(mkdb_cmd)=='formatdb':
164  exe=settings.Locate('formatdb',explicit_filename=mkdb_cmd)
165  args=[exe, '-i', infasta, '-n', dbout]
166  else:
167  raise IOError('mkdb command must either be the path to formatdb or makeblastdb!')
168 
169  ost.LogInfo('creating blast DB (%s)' % ' '.join(args))
170  blast_pipe=subprocess.Popen(args, stdout=subprocess.PIPE,
171  stderr=subprocess.PIPE)
172  blast_pipe.communicate()
173 
174 def BlastVersion(blast_location=None):
175  """
176  Returns the version of the BLAST executable, e.g. 2.2.24 as a string
177  """
178 
179  try:
180  blast_exe=settings.Locate('blastp',explicit_file_name=blast_location)
181  except:
182  try:
183  blast_exe=settings.Locate('blastall', explicit_file_name=blast_location)
184  except:
185  raise RuntimeError('could not find blast executable')
186 
187  if os.path.basename(blast_exe)=='blastall':
188  args=[blast_exe]
189  pattern=re.compile(r'\s*blastall (\d+\.\d+\.\d+)\s+arguments:\s*')
190 
191  else:
192  args=[blast_exe, '-version']
193  pattern=re.compile(r'\s*Package: blast (\d+\.\d+\.\d+),\s+')
194 
195  blast_pipe=subprocess.Popen(args, stdout=subprocess.PIPE,
196  stderr=subprocess.PIPE)
197  stdout, _ = blast_pipe.communicate()
198  lines=stdout.decode().splitlines()
199 
200  for line in lines:
201  m=pattern.match(line)
202  if m:
203  return m.group(1)
204  raise IOError("could not determine blast version for '%s'" % blast_exe)
205 
206 
207 
208 def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62',
209  blast_location=None, outfmt=0, filter_low_complexity=True):
210  """
211  Runs a protein vs. protein blast search. The results are returned
212  according to the value of the ``outfmt`` parameter.
213 
214  :param query: the query sequence
215  :type query: :class:`seq.ConstSequenceHandle`
216 
217  :param database: The filename of the sequence database. Make sure that
218  formatdb has been run on the database and the <database>.pin file exists.
219  :param matrix: The substitution matrix to be used. Must be one of 'BLOSUM45',
220  'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70'.
221  :param gap_open: Gap opening penalty. Note that only a subset of gap opening
222  penalties is supported for each substitutition matrix. Consult the blast
223  docs for more information.
224  :param gap_ext: Gap extension penalty. Only a subset of gap extension
225  penalties are supported for each of the substitution matrices. Consult the
226  blast docs for more information.
227  :param outfmt: output format, where '0' corresponds to default output (parsed
228  blast output and 1 to raw string output).
229  :param filter_low_complexity: Mask off segments of the query sequence that
230  have low compositional complexity, as determined by the SEG program of
231  Wootton & Federhen (Computers and Chemistry, 1993)
232  :rtype: :class:`BlastHit` (with ``outfmt=0``) or :class:`str`
233  (with ``outfmt=1``)
234  """
235  subst_mats=('BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70',)
236  if matrix not in subst_mats:
237  raise ValueError('matrix must be one of %s' % ', '.join(subst_mats))
238  if not os.path.exists('%s.pin' % database) and not os.path.exists('%s.pal' % database):
239  raise IOError("Database %s does not exist" % database)
240  if blast_location!=None and not os.path.exists(blast_location):
241  ost.LogScript('Could not find %s' %blast_location)
242 
243  if blast_location==None:
244  try:
245  blast_exe=settings.Locate('blastp')
246  except:
247  try:
248  blast_exe=settings.Locate('blastall')
249  except:
250  raise RuntimeError('could not find blast executable')
251  else:
252  blast_exe=settings.Locate(os.path.basename(blast_location),explicit_file_name=blast_location)
253 
254  if os.path.basename(blast_exe)=='blastall':
255  args=[blast_exe, '-d', database, '-p', 'blastp',
256  '-m', '7', '-M', matrix, '-G', str(gap_open), '-E', str(gap_ext)]
257  if filter_low_complexity==False:
258  args.append('-F')
259  args.append('F')
260 
261  else:
262  complexity_opt='-seg'
263  if filter_low_complexity==True:
264  complexity_arg='yes'
265  else:
266  complexity_arg='no'
267  args=[blast_exe, '-db', database, '-matrix', matrix,
268  '-gapopen', str(gap_open), '-gapextend', str(gap_ext), '-outfmt', '5', complexity_opt, complexity_arg ]
269 
270  ost.LogInfo('running BLAST (%s)' % ' '.join(args))
271  blast_pipe=subprocess.Popen(args, stderr=subprocess.PIPE,
272  stdout=subprocess.PIPE, stdin=subprocess.PIPE)
273  if isinstance(query, str):
274  stdout, stderr=blast_pipe.communicate(query.encode())
275  else:
276  stdout, stderr=blast_pipe.communicate(io.SequenceToString(query, 'fasta').encode())
277 
278  if len(stderr)>0:
279  pattern=re.compile(r'^\[.*\]\s+ERROR:\s+(.*)')
280  lines=stderr.decode().split('\n')
281  error_message=pattern.match(lines[0])
282  if error_message:
283  raise BlastError(error_message.group(1), '\n'.join(lines[1:]))
284  if outfmt==0:
285  return ParseBlastOutput(stdout.decode())
286  else:
287  return stdout.decode()
def __init__(self, aln, bit_score, score, evalue, seqid)
Definition: blast.py:35
def __init__(self, brief, details)
Definition: blast.py:127
def __init__(self, identifier, aligned_patches)
Definition: blast.py:57
def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62', blast_location=None, outfmt=0, filter_low_complexity=True)
Definition: blast.py:209
def ParseBlastOutput(string, seqid_thres=0, evalue_thres=float("infinity"))
Definition: blast.py:62
def CreateDB(infasta, dbout, mkdb_cmd=None)
Definition: blast.py:137
def BlastVersion(blast_location=None)
Definition: blast.py:174