OpenStructure
chain_mapping.py
Go to the documentation of this file.
1 """
2 Chain mapping aims to identify a one-to-one relationship between chains in a
3 reference structure and a model.
4 """
5 
6 import itertools
7 import copy
8 
9 import numpy as np
10 
11 from scipy.special import factorial
12 from scipy.special import binom # as of Python 3.8, the math module implements
13  # comb, i.e. n choose k
14 
15 import ost
16 from ost import seq
17 from ost import mol
18 from ost import geom
19 
20 from ost.mol.alg import lddt
21 from ost.mol.alg import qsscore
22 
23 def _CSel(ent, cnames):
24  """ Returns view with specified chains
25 
26  Ensures that quotation marks are around chain names to not confuse
27  OST query language with weird special characters.
28  """
29  query = "cname=" + ','.join([mol.QueryQuoteName(cname) for cname in cnames])
30  return ent.Select(query)
31 
33  """ Result object for the chain mapping functions in :class:`ChainMapper`
34 
35  Constructor is directly called within the functions, no need to construct
36  such objects yourself.
37  """
38  def __init__(self, target, model, chem_groups, chem_mapping, mapping, alns,
39  opt_score=None):
40  self._target_target = target
41  self._model_model = model
42  self._chem_groups_chem_groups = chem_groups
43  self._chem_mapping_chem_mapping = chem_mapping
44  self._mapping_mapping = mapping
45  self._alns_alns = alns
46  self._opt_score_opt_score = opt_score
47 
48  @property
49  def target(self):
50  """ Target/reference structure, i.e. :attr:`ChainMapper.target`
51 
52  :type: :class:`ost.mol.EntityView`
53  """
54  return self._target_target
55 
56  @property
57  def model(self):
58  """ Model structure that gets mapped onto :attr:`~target`
59 
60  Underwent same processing as :attr:`ChainMapper.target`, i.e.
61  only contains peptide/nucleotide chains of sufficient size.
62 
63  :type: :class:`ost.mol.EntityView`
64  """
65  return self._model_model
66 
67  @property
68  def chem_groups(self):
69  """ Groups of chemically equivalent chains in :attr:`~target`
70 
71  Same as :attr:`ChainMapper.chem_group`
72 
73  :class:`list` of :class:`list` of :class:`str` (chain names)
74  """
75  return self._chem_groups_chem_groups
76 
77  @property
78  def chem_mapping(self):
79  """ Assigns chains in :attr:`~model` to :attr:`~chem_groups`.
80 
81  :class:`list` of :class:`list` of :class:`str` (chain names)
82  """
83  return self._chem_mapping_chem_mapping
84 
85  @property
86  def mapping(self):
87  """ Mapping of :attr:`~model` chains onto :attr:`~target`
88 
89  Exact same shape as :attr:`~chem_groups` but containing the names of the
90  mapped chains in :attr:`~model`. May contain None for :attr:`~target`
91  chains that are not covered. No guarantee that all chains in
92  :attr:`~model` are mapped.
93 
94  :class:`list` of :class:`list` of :class:`str` (chain names)
95  """
96  return self._mapping_mapping
97 
98  @property
99  def alns(self):
100  """ Alignments of mapped chains in :attr:`~target` and :attr:`~model`
101 
102  Each alignment is accessible with ``alns[(t_chain,m_chain)]``. First
103  sequence is the sequence of :attr:`target` chain, second sequence the
104  one from :attr:`~model`. The respective :class:`ost.mol.EntityView` are
105  attached with :func:`ost.seq.ConstSequenceHandle.AttachView`.
106 
107  :type: :class:`dict` with key: :class:`tuple` of :class:`str`, value:
108  :class:`ost.seq.AlignmentHandle`
109  """
110  return self._alns_alns
111 
112  @property
113  def opt_score(self):
114  """ Placeholder property without any guarantee of being set
115 
116  Different scores get optimized in the various chain mapping algorithms.
117  Some of them may set their final optimal score in that property.
118  Consult the documentation of the respective chain mapping algorithm
119  for more information. Won't be in the return dict of
120  :func:`JSONSummary`.
121  """
122  return self._opt_score_opt_score
123 
124  def GetFlatMapping(self, mdl_as_key=False):
125  """ Returns flat mapping as :class:`dict` for all mapable chains
126 
127  :param mdl_as_key: Default is target chain name as key and model chain
128  name as value. This can be reversed with this flag.
129  :returns: :class:`dict` with :class:`str` as key/value that describe
130  one-to-one mapping
131  """
132  flat_mapping = dict()
133  for trg_chem_group, mdl_chem_group in zip(self.chem_groupschem_groups,
134  self.mappingmapping):
135  for a,b in zip(trg_chem_group, mdl_chem_group):
136  if a is not None and b is not None:
137  if mdl_as_key:
138  flat_mapping[b] = a
139  else:
140  flat_mapping[a] = b
141  return flat_mapping
142 
143  def JSONSummary(self):
144  """ Returns JSON serializable summary of results
145  """
146  json_dict = dict()
147  json_dict["chem_groups"] = self.chem_groupschem_groups
148  json_dict["mapping"] = self.mappingmapping
149  json_dict["flat_mapping"] = self.GetFlatMappingGetFlatMapping()
150  json_dict["alns"] = list()
151  for aln in self.alnsalns.values():
152  trg_seq = aln.GetSequence(0)
153  mdl_seq = aln.GetSequence(1)
154  aln_dict = {"trg_ch": trg_seq.GetName(), "trg_seq": str(trg_seq),
155  "mdl_ch": mdl_seq.GetName(), "mdl_seq": str(mdl_seq)}
156  json_dict["alns"].append(aln_dict)
157  return json_dict
158 
159 
161 
162  """ Result object for :func:`ChainMapper.GetRepr`
163 
164  Constructor is directly called within the function, no need to construct
165  such objects yourself.
166 
167  :param lDDT: lDDT for this mapping. Depends on how you call
168  :func:`ChainMapper.GetRepr` whether this is backbone only or
169  full atom lDDT.
170  :type lDDT: :class:`float`
171  :param substructure: The full substructure for which we searched for a
172  representation
173  :type substructure: :class:`ost.mol.EntityView`
174  :param ref_view: View pointing to the same underlying entity as
175  *substructure* but only contains the stuff that is mapped
176  :type ref_view: :class:`mol.EntityView`
177  :param mdl_view: The matching counterpart in model
178  :type mdl_view: :class:`mol.EntityView`
179  """
180  def __init__(self, lDDT, substructure, ref_view, mdl_view):
181  self._lDDT_lDDT = lDDT
182  self._substructure_substructure = substructure
183  assert(len(ref_view.residues) == len(mdl_view.residues))
184  self._ref_view_ref_view = ref_view
185  self._mdl_view_mdl_view = mdl_view
186 
187  # lazily evaluated attributes
188  self._ref_bb_pos_ref_bb_pos = None
189  self._mdl_bb_pos_mdl_bb_pos = None
190  self._ref_full_bb_pos_ref_full_bb_pos = None
191  self._mdl_full_bb_pos_mdl_full_bb_pos = None
192  self._transform_transform = None
193  self._superposed_mdl_bb_pos_superposed_mdl_bb_pos = None
194  self._bb_rmsd_bb_rmsd = None
195  self._gdt_8_gdt_8 = None
196  self._gdt_4_gdt_4 = None
197  self._gdt_2_gdt_2 = None
198  self._gdt_1_gdt_1 = None
199  self._ost_query_ost_query = None
200  self._flat_mapping_flat_mapping = None
201  self._inconsistent_residues_inconsistent_residues = None
202 
203  @property
204  def lDDT(self):
205  """ lDDT of representation result
206 
207  Depends on how you call :func:`ChainMapper.GetRepr` whether this is
208  backbone only or full atom lDDT.
209 
210  :type: :class:`float`
211  """
212  return self._lDDT_lDDT
213 
214  @property
215  def substructure(self):
216  """ The full substructure for which we searched for a
217  representation
218 
219  :type: :class:`ost.mol.EntityView`
220  """
221  return self._substructure_substructure
222 
223  @property
224  def ref_view(self):
225  """ View which contains the mapped subset of :attr:`substructure`
226 
227  :type: :class:`ost.mol.EntityView`
228  """
229  return self._ref_view_ref_view
230 
231  @property
232  def mdl_view(self):
233  """ The :attr:`ref_view` representation in the model
234 
235  :type: :class:`ost.mol.EntityView`
236  """
237  return self._mdl_view_mdl_view
238 
239  @property
240  def ref_residues(self):
241  """ The reference residues
242 
243  :type: class:`mol.ResidueViewList`
244  """
245  return self.ref_viewref_view.residues
246 
247  @property
248  def mdl_residues(self):
249  """ The model residues
250 
251  :type: :class:`mol.ResidueViewList`
252  """
253  return self.mdl_viewmdl_view.residues
254 
255  @property
257  """ A list of mapped residue whose names do not match (eg. ALA in the
258  reference and LEU in the model).
259 
260  The mismatches are reported as a tuple of :class:`~ost.mol.ResidueView`
261  (reference, model), or as an empty list if all the residue names match.
262 
263  :type: :class:`list`
264  """
265  if self._inconsistent_residues_inconsistent_residues is None:
266  self._inconsistent_residues_inconsistent_residues = self._GetInconsistentResidues_GetInconsistentResidues(
267  self.ref_residuesref_residues, self.mdl_residuesmdl_residues)
268  return self._inconsistent_residues_inconsistent_residues
269 
270  @property
271  def ref_bb_pos(self):
272  """ Representative backbone positions for reference residues.
273 
274  Thats CA positions for peptides and C3' positions for Nucleotides.
275 
276  :type: :class:`geom.Vec3List`
277  """
278  if self._ref_bb_pos_ref_bb_pos is None:
279  self._ref_bb_pos_ref_bb_pos = self._GetBBPos_GetBBPos(self.ref_residuesref_residues)
280  return self._ref_bb_pos_ref_bb_pos
281 
282  @property
283  def mdl_bb_pos(self):
284  """ Representative backbone positions for model residues.
285 
286  Thats CA positions for peptides and C3' positions for Nucleotides.
287 
288  :type: :class:`geom.Vec3List`
289  """
290  if self._mdl_bb_pos_mdl_bb_pos is None:
291  self._mdl_bb_pos_mdl_bb_pos = self._GetBBPos_GetBBPos(self.mdl_residuesmdl_residues)
292  return self._mdl_bb_pos_mdl_bb_pos
293 
294  @property
295  def ref_full_bb_pos(self):
296  """ Representative backbone positions for reference residues.
297 
298  Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
299  positions for Nucleotides.
300 
301  :type: :class:`geom.Vec3List`
302  """
303  if self._ref_full_bb_pos_ref_full_bb_pos is None:
304  self._ref_full_bb_pos_ref_full_bb_pos = self._GetFullBBPos_GetFullBBPos(self.ref_residuesref_residues)
305  return self._ref_full_bb_pos_ref_full_bb_pos
306 
307  @property
308  def mdl_full_bb_pos(self):
309  """ Representative backbone positions for reference residues.
310 
311  Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
312  positions for Nucleotides.
313 
314  :type: :class:`geom.Vec3List`
315  """
316  if self._mdl_full_bb_pos_mdl_full_bb_pos is None:
317  self._mdl_full_bb_pos_mdl_full_bb_pos = self._GetFullBBPos_GetFullBBPos(self.mdl_residuesmdl_residues)
318  return self._mdl_full_bb_pos_mdl_full_bb_pos
319 
320  @property
321  def transform(self):
322  """ Transformation to superpose mdl residues onto ref residues
323 
324  Superposition computed as minimal RMSD superposition on
325  :attr:`ref_bb_pos` and :attr:`mdl_bb_pos`. If number of positions is
326  smaller 3, the full_bb_pos equivalents are used instead.
327 
328  :type: :class:`ost.geom.Mat4`
329  """
330  if self._transform_transform is None:
331  if len(self.mdl_bb_posmdl_bb_pos) < 3:
332  self._transform_transform = _GetTransform(self.mdl_full_bb_posmdl_full_bb_pos,
333  self.ref_full_bb_posref_full_bb_pos, False)
334  else:
335  self._transform_transform = _GetTransform(self.mdl_bb_posmdl_bb_pos,
336  self.ref_bb_posref_bb_pos, False)
337  return self._transform_transform
338 
339  @property
341  """ :attr:`mdl_bb_pos` with :attr:`transform applied`
342 
343  :type: :class:`geom.Vec3List`
344  """
345  if self._superposed_mdl_bb_pos_superposed_mdl_bb_pos is None:
346  self._superposed_mdl_bb_pos_superposed_mdl_bb_pos = geom.Vec3List(self.mdl_bb_posmdl_bb_pos)
347  self._superposed_mdl_bb_pos_superposed_mdl_bb_pos.ApplyTransform(self.transformtransform)
348  return self._superposed_mdl_bb_pos_superposed_mdl_bb_pos
349 
350  @property
351  def bb_rmsd(self):
352  """ RMSD between :attr:`ref_bb_pos` and :attr:`superposed_mdl_bb_pos`
353 
354  :type: :class:`float`
355  """
356  if self._bb_rmsd_bb_rmsd is None:
357  self._bb_rmsd_bb_rmsd = self.ref_bb_posref_bb_pos.GetRMSD(self.superposed_mdl_bb_possuperposed_mdl_bb_pos)
358  return self._bb_rmsd_bb_rmsd
359 
360  @property
361  def gdt_8(self):
362  """ GDT with one single threshold: 8.0
363 
364  :type: :class:`float`
365  """
366  if self._gdt_8_gdt_8 is None:
367  self._gdt_8_gdt_8 = self.ref_bb_posref_bb_pos.GetGDT(self.superposed_mdl_bb_possuperposed_mdl_bb_pos, 8.0)
368  return self._gdt_8_gdt_8
369 
370  @property
371  def gdt_4(self):
372  """ GDT with one single threshold: 4.0
373 
374  :type: :class:`float`
375  """
376  if self._gdt_4_gdt_4 is None:
377  self._gdt_4_gdt_4 = self.ref_bb_posref_bb_pos.GetGDT(self.superposed_mdl_bb_possuperposed_mdl_bb_pos, 4.0)
378  return self._gdt_4_gdt_4
379 
380  @property
381  def gdt_2(self):
382  """ GDT with one single threshold: 2.0
383 
384  :type: :class:`float`
385  """
386  if self._gdt_2_gdt_2 is None:
387  self._gdt_2_gdt_2 = self.ref_bb_posref_bb_pos.GetGDT(self.superposed_mdl_bb_possuperposed_mdl_bb_pos, 2.0)
388  return self._gdt_2_gdt_2
389 
390  @property
391  def gdt_1(self):
392  """ GDT with one single threshold: 1.0
393 
394  :type: :class:`float`
395  """
396  if self._gdt_1_gdt_1 is None:
397  self._gdt_1_gdt_1 = self.ref_bb_posref_bb_pos.GetGDT(self.superposed_mdl_bb_possuperposed_mdl_bb_pos, 1.0)
398  return self._gdt_1_gdt_1
399 
400  @property
401  def ost_query(self):
402  """ query for mdl residues in OpenStructure query language
403 
404  Repr can be selected as ``full_mdl.Select(ost_query)``
405 
406  Returns invalid query if residue numbers have insertion codes.
407 
408  :type: :class:`str`
409  """
410  if self._ost_query_ost_query is None:
411  chain_rnums = dict()
412  for r in self.mdl_residuesmdl_residues:
413  chname = r.GetChain().GetName()
414  rnum = r.GetNumber().GetNum()
415  if chname not in chain_rnums:
416  chain_rnums[chname] = list()
417  chain_rnums[chname].append(str(rnum))
418  chain_queries = list()
419  for k,v in chain_rnums.items():
420  q = f"(cname={mol.QueryQuoteName(k)} and "
421  q += f"rnum={','.join(v)})"
422  chain_queries.append(q)
423  self._ost_query_ost_query = " or ".join(chain_queries)
424  return self._ost_query_ost_query
425 
426  def JSONSummary(self):
427  """ Returns JSON serializable summary of results
428  """
429  json_dict = dict()
430  json_dict["lDDT"] = self.lDDTlDDT
431  json_dict["ref_residues"] = [r.GetQualifiedName() for r in \
432  self.ref_residuesref_residues]
433  json_dict["mdl_residues"] = [r.GetQualifiedName() for r in \
434  self.mdl_residuesmdl_residues]
435  json_dict["transform"] = list(self.transformtransform.data)
436  json_dict["bb_rmsd"] = self.bb_rmsdbb_rmsd
437  json_dict["gdt_8"] = self.gdt_8gdt_8
438  json_dict["gdt_4"] = self.gdt_4gdt_4
439  json_dict["gdt_2"] = self.gdt_2gdt_2
440  json_dict["gdt_1"] = self.gdt_1gdt_1
441  json_dict["ost_query"] = self.ost_queryost_query
442  json_dict["flat_mapping"] = self.GetFlatChainMappingGetFlatChainMapping()
443  return json_dict
444 
445  def GetFlatChainMapping(self, mdl_as_key=False):
446  """ Returns flat mapping of all chains in the representation
447 
448  :param mdl_as_key: Default is target chain name as key and model chain
449  name as value. This can be reversed with this flag.
450  :returns: :class:`dict` with :class:`str` as key/value that describe
451  one-to-one mapping
452  """
453  flat_mapping = dict()
454  for trg_res, mdl_res in zip(self.ref_residuesref_residues, self.mdl_residuesmdl_residues):
455  if mdl_as_key:
456  flat_mapping[mdl_res.chain.name] = trg_res.chain.name
457  else:
458  flat_mapping[trg_res.chain.name] = mdl_res.chain.name
459  return flat_mapping
460 
461  def _GetFullBBPos(self, residues):
462  """ Helper to extract full backbone positions
463  """
464  exp_pep_atoms = ["N", "CA", "C"]
465  exp_nuc_atoms = ["\"O5'\"", "\"C5'\"", "\"C4'\"", "\"C3'\"", "\"O3'\""]
466  bb_pos = geom.Vec3List()
467  for r in residues:
468  if r.GetChemType() == mol.ChemType.NUCLEOTIDES:
469  exp_atoms = exp_nuc_atoms
470  elif r.GetChemType() == mol.ChemType.AMINOACIDS:
471  exp_atoms = exp_pep_atoms
472  else:
473  raise RuntimeError("Something terrible happened... RUN...")
474  for aname in exp_atoms:
475  a = r.FindAtom(aname)
476  if not a.IsValid():
477  raise RuntimeError("Something terrible happened... "
478  "RUN...")
479  bb_pos.append(a.GetPos())
480  return bb_pos
481 
482  def _GetBBPos(self, residues):
483  """ Helper to extract single representative position for each residue
484  """
485  bb_pos = geom.Vec3List()
486  for r in residues:
487  at = r.FindAtom("CA")
488  if not at.IsValid():
489  at = r.FindAtom("C3'")
490  if not at.IsValid():
491  raise RuntimeError("Something terrible happened... RUN...")
492  bb_pos.append(at.GetPos())
493  return bb_pos
494 
495  def _GetInconsistentResidues(self, ref_residues, mdl_residues):
496  """ Helper to extract a list of inconsistent residues.
497  """
498  if len(ref_residues) != len(mdl_residues):
499  raise ValueError("Something terrible happened... Reference and "
500  "model lengths differ... RUN...")
501  inconsistent_residues = list()
502  for ref_residue, mdl_residue in zip(ref_residues, mdl_residues):
503  if ref_residue.name != mdl_residue.name:
504  inconsistent_residues.append((ref_residue, mdl_residue))
505  return inconsistent_residues
506 
507 
509  """ Class to compute chain mappings
510 
511  All algorithms are performed on processed structures which fulfill
512  criteria as given in constructor arguments (*min_pep_length*,
513  "min_nuc_length") and only contain residues which have all required backbone
514  atoms. for peptide residues thats N, CA, C and CB (no CB for GLY), for
515  nucleotide residues thats O5', C5', C4', C3' and O3'.
516 
517  Chain mapping is a three step process:
518 
519  * Group chemically identical chains in *target* using pairwise
520  alignments that are either computed with Needleman-Wunsch (NW) or
521  simply derived from residue numbers (*resnum_alignments* flag).
522  In case of NW, *pep_subst_mat*, *pep_gap_open* and *pep_gap_ext*
523  and their nucleotide equivalents are relevant. Two chains are
524  considered identical if they fulfill the thresholds given by
525  *pep_seqid_thr*, *pep_gap_thr*, their nucleotide equivalents
526  respectively. The grouping information is available as
527  attributes of this class.
528 
529  * Map chains in an input model to these groups. Generating alignments
530  and the similarity criteria are the same as above. You can either
531  get the group mapping with :func:`GetChemMapping` or directly call
532  one of the full fletched one-to-one chain mapping functions which
533  execute that step internally.
534 
535  * Obtain one-to-one mapping for chains in an input model and
536  *target* with one of the available mapping functions. Just to get an
537  idea of complexity. If *target* and *model* are octamers, there are
538  ``8! = 40320`` possible chain mappings.
539 
540  :param target: Target structure onto which models are mapped.
541  Computations happen on a selection only containing
542  polypeptides and polynucleotides.
543  :type target: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
544  :param resnum_alignments: Use residue numbers instead of
545  Needleman-Wunsch to compute pairwise
546  alignments. Relevant for :attr:`~chem_groups`
547  and related attributes.
548  :type resnum_alignments: :class:`bool`
549  :param pep_seqid_thr: Threshold used to decide when two chains are
550  identical. 95 percent tolerates the few mutations
551  crystallographers like to do.
552  :type pep_seqid_thr: :class:`float`
553  :param pep_gap_thr: Additional threshold to avoid gappy alignments with
554  high seqid. By default this is disabled (set to 1.0).
555  This threshold checks for a maximum allowed fraction
556  of gaps in any of the two sequences after stripping
557  terminal gaps. The reason for not just normalizing
558  seqid by the longer sequence is that one sequence
559  might be a perfect subsequence of the other but only
560  cover half of it.
561  :type pep_gap_thr: :class:`float`
562  :param nuc_seqid_thr: Nucleotide equivalent for *pep_seqid_thr*
563  :type nuc_seqid_thr: :class:`float`
564  :param nuc_gap_thr: Nucleotide equivalent for *nuc_gap_thr*
565  :type nuc_gap_thr: :class:`float`
566  :param pep_subst_mat: Substitution matrix to align peptide sequences,
567  irrelevant if *resnum_alignments* is True,
568  defaults to seq.alg.BLOSUM62
569  :type pep_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
570  :param pep_gap_open: Gap open penalty to align peptide sequences,
571  irrelevant if *resnum_alignments* is True
572  :type pep_gap_open: :class:`int`
573  :param pep_gap_ext: Gap extension penalty to align peptide sequences,
574  irrelevant if *resnum_alignments* is True
575  :type pep_gap_ext: :class:`int`
576  :param nuc_subst_mat: Nucleotide equivalent for *pep_subst_mat*,
577  defaults to seq.alg.NUC44
578  :type nuc_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
579  :param nuc_gap_open: Nucleotide equivalent for *pep_gap_open*
580  :type nuc_gap_open: :class:`int`
581  :param nuc_gap_ext: Nucleotide equivalent for *pep_gap_ext*
582  :type nuc_gap_ext: :class:`int`
583  :param min_pep_length: Minimal number of residues for a peptide chain to be
584  considered in target and in models.
585  :type min_pep_length: :class:`int`
586  :param min_nuc_length: Minimal number of residues for a nucleotide chain to be
587  considered in target and in models.
588  :type min_nuc_length: :class:`int`
589  :param n_max_naive: Max possible chain mappings that are enumerated in
590  :func:`~GetNaivelDDTMapping` /
591  :func:`~GetDecomposerlDDTMapping`. A
592  :class:`RuntimeError` is raised in case of bigger
593  complexity.
594  :type n_max_naive: :class:`int`
595  """
596  def __init__(self, target, resnum_alignments=False,
597  pep_seqid_thr = 95., pep_gap_thr = 1.0,
598  nuc_seqid_thr = 95., nuc_gap_thr = 1.0,
599  pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -11,
600  pep_gap_ext = -1, nuc_subst_mat = seq.alg.NUC44,
601  nuc_gap_open = -4, nuc_gap_ext = -4,
602  min_pep_length = 6, min_nuc_length = 4,
603  n_max_naive = 1e8):
604 
605  # attributes
606  self.resnum_alignmentsresnum_alignments = resnum_alignments
607  self.pep_seqid_thrpep_seqid_thr = pep_seqid_thr
608  self.pep_gap_thrpep_gap_thr = pep_gap_thr
609  self.nuc_seqid_thrnuc_seqid_thr = nuc_seqid_thr
610  self.nuc_gap_thrnuc_gap_thr = nuc_gap_thr
611  self.min_pep_lengthmin_pep_length = min_pep_length
612  self.min_nuc_lengthmin_nuc_length = min_nuc_length
613  self.n_max_naiven_max_naive = n_max_naive
614 
615  # lazy computed attributes
616  self._chem_groups_chem_groups = None
617  self._chem_group_alignments_chem_group_alignments = None
618  self._chem_group_ref_seqs_chem_group_ref_seqs = None
619  self._chem_group_types_chem_group_types = None
620 
621  # helper class to generate pairwise alignments
622  self.aligneraligner = _Aligner(resnum_aln = resnum_alignments,
623  pep_subst_mat = pep_subst_mat,
624  pep_gap_open = pep_gap_open,
625  pep_gap_ext = pep_gap_ext,
626  nuc_subst_mat = nuc_subst_mat,
627  nuc_gap_open = nuc_gap_open,
628  nuc_gap_ext = nuc_gap_ext)
629 
630  # target structure preprocessing
631  self._target, self._polypep_seqs, self._polynuc_seqs_polynuc_seqs = \
632  self.ProcessStructureProcessStructure(target)
633 
634  @property
635  def target(self):
636  """Target structure that only contains peptides/nucleotides
637 
638  Contains only residues that have the backbone representatives
639  (CA for peptide and C3' for nucleotides) to avoid ATOMSEQ alignment
640  inconsistencies when switching between all atom and backbone only
641  representations.
642 
643  :type: :class:`ost.mol.EntityView`
644  """
645  return self._target
646 
647  @property
648  def polypep_seqs(self):
649  """Sequences of peptide chains in :attr:`~target`
650 
651  Respective :class:`EntityView` from *target* for each sequence s are
652  available as ``s.GetAttachedView()``
653 
654  :type: :class:`ost.seq.SequenceList`
655  """
656  return self._polypep_seqs
657 
658  @property
659  def polynuc_seqs(self):
660  """Sequences of nucleotide chains in :attr:`~target`
661 
662  Respective :class:`EntityView` from *target* for each sequence s are
663  available as ``s.GetAttachedView()``
664 
665  :type: :class:`ost.seq.SequenceList`
666  """
667  return self._polynuc_seqs_polynuc_seqs
668 
669  @property
670  def chem_groups(self):
671  """Groups of chemically equivalent chains in :attr:`~target`
672 
673  First chain in group is the one with longest sequence.
674 
675  :getter: Computed on first use (cached)
676  :type: :class:`list` of :class:`list` of :class:`str` (chain names)
677  """
678  if self._chem_groups_chem_groups is None:
679  self._chem_groups_chem_groups = list()
680  for a in self.chem_group_alignmentschem_group_alignments:
681  self._chem_groups_chem_groups.append([s.GetName() for s in a.sequences])
682  return self._chem_groups_chem_groups
683 
684  @property
686  """MSA for each group in :attr:`~chem_groups`
687 
688  Sequences in MSAs exhibit same order as in :attr:`~chem_groups` and
689  have the respective :class:`ost.mol.EntityView` from *target* attached.
690 
691  :getter: Computed on first use (cached)
692  :type: :class:`ost.seq.AlignmentList`
693  """
694  if self._chem_group_alignments_chem_group_alignments is None:
695  self._chem_group_alignments_chem_group_alignments, self._chem_group_types_chem_group_types = \
696  _GetChemGroupAlignments(self.polypep_seqspolypep_seqs, self.polynuc_seqspolynuc_seqs,
697  self.aligneraligner,
698  pep_seqid_thr=self.pep_seqid_thrpep_seqid_thr,
699  pep_gap_thr=self.pep_gap_thrpep_gap_thr,
700  nuc_seqid_thr=self.nuc_seqid_thrnuc_seqid_thr,
701  nuc_gap_thr=self.nuc_gap_thrnuc_gap_thr)
702 
703  return self._chem_group_alignments_chem_group_alignments
704 
705  @property
707  """Reference (longest) sequence for each group in :attr:`~chem_groups`
708 
709  Respective :class:`EntityView` from *target* for each sequence s are
710  available as ``s.GetAttachedView()``
711 
712  :getter: Computed on first use (cached)
713  :type: :class:`ost.seq.SequenceList`
714  """
715  if self._chem_group_ref_seqs_chem_group_ref_seqs is None:
716  self._chem_group_ref_seqs_chem_group_ref_seqs = seq.CreateSequenceList()
717  for a in self.chem_group_alignmentschem_group_alignments:
718  s = seq.CreateSequence(a.GetSequence(0).GetName(),
719  a.GetSequence(0).GetGaplessString())
720  s.AttachView(a.GetSequence(0).GetAttachedView())
721  self._chem_group_ref_seqs_chem_group_ref_seqs.AddSequence(s)
722  return self._chem_group_ref_seqs_chem_group_ref_seqs
723 
724  @property
725  def chem_group_types(self):
726  """ChemType of each group in :attr:`~chem_groups`
727 
728  Specifying if groups are poly-peptides/nucleotides, i.e.
729  :class:`ost.mol.ChemType.AMINOACIDS` or
730  :class:`ost.mol.ChemType.NUCLEOTIDES`
731 
732  :getter: Computed on first use (cached)
733  :type: :class:`list` of :class:`ost.mol.ChemType`
734  """
735  if self._chem_group_types_chem_group_types is None:
736  self._chem_group_alignments_chem_group_alignments, self._chem_group_types_chem_group_types = \
737  _GetChemGroupAlignments(self.polypep_seqspolypep_seqs, self.polynuc_seqspolynuc_seqs,
738  self.aligneraligner,
739  pep_seqid_thr=self.pep_seqid_thrpep_seqid_thr,
740  pep_gap_thr=self.pep_gap_thrpep_gap_thr,
741  nuc_seqid_thr=self.nuc_seqid_thrnuc_seqid_thr,
742  nuc_gap_thr=self.nuc_gap_thrnuc_gap_thr)
743 
744  return self._chem_group_types_chem_group_types
745 
746  def GetChemMapping(self, model):
747  """Maps sequences in *model* to chem_groups of target
748 
749  :param model: Model from which to extract sequences, a
750  selection that only includes peptides and nucleotides
751  is performed and returned along other results.
752  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
753  :returns: Tuple with two lists of length `len(self.chem_groups)` and
754  an :class:`ost.mol.EntityView` representing *model*:
755  1) Each element is a :class:`list` with mdl chain names that
756  map to the chem group at that position.
757  2) Each element is a :class:`ost.seq.AlignmentList` aligning
758  these mdl chain sequences to the chem group ref sequences.
759  3) A selection of *model* that only contains polypeptides and
760  polynucleotides whose ATOMSEQ exactly matches the sequence
761  info in the returned alignments.
762  """
763  mdl, mdl_pep_seqs, mdl_nuc_seqs = self.ProcessStructureProcessStructure(model)
764  mapping = [list() for x in self.chem_groupschem_groups]
765  alns = [seq.AlignmentList() for x in self.chem_groupschem_groups]
766 
767  for s in mdl_pep_seqs:
768  idx, aln = _MapSequence(self.chem_group_ref_seqschem_group_ref_seqs,
769  self.chem_group_typeschem_group_types,
770  s, mol.ChemType.AMINOACIDS,
771  self.aligneraligner)
772  if idx is not None:
773  mapping[idx].append(s.GetName())
774  alns[idx].append(aln)
775 
776  for s in mdl_nuc_seqs:
777  idx, aln = _MapSequence(self.chem_group_ref_seqschem_group_ref_seqs,
778  self.chem_group_typeschem_group_types,
779  s, mol.ChemType.NUCLEOTIDES,
780  self.aligneraligner)
781  if idx is not None:
782  mapping[idx].append(s.GetName())
783  alns[idx].append(aln)
784 
785  return (mapping, alns, mdl)
786 
787 
788  def GetlDDTMapping(self, model, inclusion_radius=15.0,
789  thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic",
790  steep_opt_rate = None, block_seed_size = 5,
791  block_blocks_per_chem_group = 5,
792  chem_mapping_result = None,
793  heuristic_n_max_naive = 40320):
794  """ Identify chain mapping by optimizing lDDT score
795 
796  Maps *model* chain sequences to :attr:`~chem_groups` and find mapping
797  based on backbone only lDDT score (CA for amino acids C3' for
798  Nucleotides).
799 
800  Either performs a naive search, i.e. enumerate all possible mappings or
801  executes a greedy strategy that tries to identify a (close to) optimal
802  mapping in an iterative way by starting from a start mapping (seed). In
803  each iteration, the one-to-one mapping that leads to highest increase
804  in number of conserved contacts is added with the additional requirement
805  that this added mapping must have non-zero interface counts towards the
806  already mapped chains. So basically we're "growing" the mapped structure
807  by only adding connected stuff.
808 
809  The available strategies:
810 
811  * **naive**: Enumerates all possible mappings and returns best
812 
813  * **greedy_fast**: perform all vs. all single chain lDDTs within the
814  respective ref/mdl chem groups. The mapping with highest number of
815  conserved contacts is selected as seed for greedy extension
816 
817  * **greedy_full**: try multiple seeds for greedy extension, i.e. try
818  all ref/mdl chain combinations within the respective chem groups and
819  retain the mapping leading to the best lDDT.
820 
821  * **greedy_block**: try multiple seeds for greedy extension, i.e. try
822  all ref/mdl chain combinations within the respective chem groups and
823  extend them to *block_seed_size*. *block_blocks_per_chem_group*
824  for each chem group are selected for exhaustive extension.
825 
826  * **heuristic**: Uses *naive* strategy if number of possible mappings
827  is within *heuristic_n_max_naive*. The default of 40320 corresponds
828  to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
829  6!*2!=1440 etc. If the number of possible mappings is larger,
830  *greedy_full* is used.
831 
832  Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
833  mapping.
834 
835  :param model: Model to map
836  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
837  :param inclusion_radius: Inclusion radius for lDDT
838  :type inclusion_radius: :class:`float`
839  :param thresholds: Thresholds for lDDT
840  :type thresholds: :class:`list` of :class:`float`
841  :param strategy: Strategy to find mapping. Must be in ["naive",
842  "greedy_fast", "greedy_full", "greedy_block"]
843  :type strategy: :class:`str`
844  :param steep_opt_rate: Only relevant for greedy strategies.
845  If set, every *steep_opt_rate* mappings, a simple
846  optimization is executed with the goal of
847  avoiding local minima. The optimization
848  iteratively checks all possible swaps of mappings
849  within their respective chem groups and accepts
850  swaps that improve lDDT score. Iteration stops as
851  soon as no improvement can be achieved anymore.
852  :type steep_opt_rate: :class:`int`
853  :param block_seed_size: Param for *greedy_block* strategy - Initial seeds
854  are extended by that number of chains.
855  :type block_seed_size: :class:`int`
856  :param block_blocks_per_chem_group: Param for *greedy_block* strategy -
857  Number of blocks per chem group that
858  are extended in an initial search
859  for high scoring local solutions.
860  :type block_blocks_per_chem_group: :class:`int`
861  :param chem_mapping_result: Pro param. The result of
862  :func:`~GetChemMapping` where you provided
863  *model*. If set, *model* parameter is not
864  used.
865  :type chem_mapping_result: :class:`tuple`
866  :returns: A :class:`MappingResult`
867  """
868 
869  strategies = ["naive", "greedy_fast", "greedy_full", "greedy_block",
870  "heuristic"]
871  if strategy not in strategies:
872  raise RuntimeError(f"Strategy must be in {strategies}")
873 
874  if chem_mapping_result is None:
875  chem_mapping, chem_group_alns, mdl = self.GetChemMappingGetChemMapping(model)
876  else:
877  chem_mapping, chem_group_alns, mdl = chem_mapping_result
878 
879  ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
880  self.chem_group_alignmentschem_group_alignments,
881  chem_mapping,
882  chem_group_alns)
883 
884  # check for the simplest case
885  one_to_one = _CheckOneToOneMapping(self.chem_groupschem_groups, chem_mapping)
886  if one_to_one is not None:
887  alns = dict()
888  for ref_group, mdl_group in zip(self.chem_groupschem_groups, one_to_one):
889  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
890  if ref_ch is not None and mdl_ch is not None:
891  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
892  aln.AttachView(0, _CSel(self.targettarget, [ref_ch]))
893  aln.AttachView(1, _CSel(mdl, [mdl_ch]))
894  alns[(ref_ch, mdl_ch)] = aln
895  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
896  one_to_one, alns)
897 
898  if strategy == "heuristic":
899  if _NMappingsWithin(self.chem_groupschem_groups, chem_mapping,
900  heuristic_n_max_naive):
901  strategy = "naive"
902  else:
903  strategy = "greedy_full"
904 
905  mapping = None
906  opt_lddt = None
907 
908  if strategy == "naive":
909  mapping, opt_lddt = _lDDTNaive(self.targettarget, mdl, inclusion_radius,
910  thresholds, self.chem_groupschem_groups,
911  chem_mapping, ref_mdl_alns,
912  self.n_max_naiven_max_naive)
913  else:
914  # its one of the greedy strategies - setup greedy searcher
915  the_greed = _lDDTGreedySearcher(self.targettarget, mdl, self.chem_groupschem_groups,
916  chem_mapping, ref_mdl_alns,
917  inclusion_radius=inclusion_radius,
918  thresholds=thresholds,
919  steep_opt_rate=steep_opt_rate)
920  if strategy == "greedy_fast":
921  mapping = _lDDTGreedyFast(the_greed)
922  elif strategy == "greedy_full":
923  mapping = _lDDTGreedyFull(the_greed)
924  elif strategy == "greedy_block":
925  mapping = _lDDTGreedyBlock(the_greed, block_seed_size,
926  block_blocks_per_chem_group)
927  # cached => lDDT computation is fast here
928  opt_lddt = the_greed.lDDT(self.chem_groupschem_groups, mapping)
929 
930  alns = dict()
931  for ref_group, mdl_group in zip(self.chem_groupschem_groups, mapping):
932  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
933  if ref_ch is not None and mdl_ch is not None:
934  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
935  aln.AttachView(0, _CSel(self.targettarget, [ref_ch]))
936  aln.AttachView(1, _CSel(mdl, [mdl_ch]))
937  alns[(ref_ch, mdl_ch)] = aln
938 
939  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
940  mapping, alns, opt_score = opt_lddt)
941 
942 
943  def GetQSScoreMapping(self, model, contact_d = 12.0, strategy = "heuristic",
944  block_seed_size = 5, block_blocks_per_chem_group = 5,
945  heuristic_n_max_naive = 40320, steep_opt_rate = None,
946  chem_mapping_result = None,
947  greedy_prune_contact_map = True):
948  """ Identify chain mapping based on QSScore
949 
950  Scoring is based on CA/C3' positions which are present in all chains of
951  a :attr:`chem_groups` as well as the *model* chains which are mapped to
952  that respective chem group.
953 
954  The following strategies are available:
955 
956  * **naive**: Naively iterate all possible mappings and return best based
957  on QS score.
958 
959  * **greedy_fast**: perform all vs. all single chain lDDTs within the
960  respective ref/mdl chem groups. The mapping with highest number of
961  conserved contacts is selected as seed for greedy extension.
962  Extension is based on QS-score.
963 
964  * **greedy_full**: try multiple seeds for greedy extension, i.e. try
965  all ref/mdl chain combinations within the respective chem groups and
966  retain the mapping leading to the best QS-score.
967 
968  * **greedy_block**: try multiple seeds for greedy extension, i.e. try
969  all ref/mdl chain combinations within the respective chem groups and
970  extend them to *block_seed_size*. *block_blocks_per_chem_group*
971  for each chem group are selected for exhaustive extension.
972 
973  * **heuristic**: Uses *naive* strategy if number of possible mappings
974  is within *heuristic_n_max_naive*. The default of 40320 corresponds
975  to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
976  6!*2!=1440 etc. If the number of possible mappings is larger,
977  *greedy_full* is used.
978 
979  Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
980  mapping.
981 
982  :param model: Model to map
983  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
984  :param contact_d: Max distance between two residues to be considered as
985  contact in qs scoring
986  :type contact_d: :class:`float`
987  :param strategy: Strategy for sampling, must be in ["naive",
988  "greedy_fast", "greedy_full", "greedy_block"]
989  :type strategy: :class:`str`
990  :param chem_mapping_result: Pro param. The result of
991  :func:`~GetChemMapping` where you provided
992  *model*. If set, *model* parameter is not
993  used.
994  :type chem_mapping_result: :class:`tuple`
995  :param greedy_prune_contact_map: Relevant for all strategies that use
996  greedy extensions. If True, only chains
997  with at least 3 contacts (8A CB
998  distance) towards already mapped chains
999  in trg/mdl are considered for
1000  extension. All chains that give a
1001  potential non-zero QS-score increase
1002  are used otherwise (at least one
1003  contact within 12A). The consequence
1004  is reduced runtime and usually no
1005  real reduction in accuracy.
1006  :returns: A :class:`MappingResult`
1007  """
1008 
1009  strategies = ["naive", "greedy_fast", "greedy_full", "greedy_block", "heuristic"]
1010  if strategy not in strategies:
1011  raise RuntimeError(f"strategy must be {strategies}")
1012 
1013  if chem_mapping_result is None:
1014  chem_mapping, chem_group_alns, mdl = self.GetChemMappingGetChemMapping(model)
1015  else:
1016  chem_mapping, chem_group_alns, mdl = chem_mapping_result
1017  ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
1018  self.chem_group_alignmentschem_group_alignments,
1019  chem_mapping,
1020  chem_group_alns)
1021  # check for the simplest case
1022  one_to_one = _CheckOneToOneMapping(self.chem_groupschem_groups, chem_mapping)
1023  if one_to_one is not None:
1024  alns = dict()
1025  for ref_group, mdl_group in zip(self.chem_groupschem_groups, one_to_one):
1026  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1027  if ref_ch is not None and mdl_ch is not None:
1028  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1029  aln.AttachView(0, _CSel(self.targettarget, [ref_ch]))
1030  aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1031  alns[(ref_ch, mdl_ch)] = aln
1032  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1033  one_to_one, alns)
1034 
1035  if strategy == "heuristic":
1036  if _NMappingsWithin(self.chem_groupschem_groups, chem_mapping,
1037  heuristic_n_max_naive):
1038  strategy = "naive"
1039  else:
1040  strategy = "greedy_full"
1041 
1042  mapping = None
1043  opt_qsscore = None
1044 
1045  if strategy == "naive":
1046  mapping, opt_qsscore = _QSScoreNaive(self.targettarget, mdl,
1047  self.chem_groupschem_groups,
1048  chem_mapping, ref_mdl_alns,
1049  contact_d, self.n_max_naiven_max_naive)
1050  else:
1051  # its one of the greedy strategies - setup greedy searcher
1052  the_greed = _QSScoreGreedySearcher(self.targettarget, mdl,
1053  self.chem_groupschem_groups,
1054  chem_mapping, ref_mdl_alns,
1055  contact_d = contact_d,
1056  steep_opt_rate=steep_opt_rate,
1057  greedy_prune_contact_map = greedy_prune_contact_map)
1058  if strategy == "greedy_fast":
1059  mapping = _QSScoreGreedyFast(the_greed)
1060  elif strategy == "greedy_full":
1061  mapping = _QSScoreGreedyFull(the_greed)
1062  elif strategy == "greedy_block":
1063  mapping = _QSScoreGreedyBlock(the_greed, block_seed_size,
1064  block_blocks_per_chem_group)
1065  # cached => QSScore computation is fast here
1066  opt_qsscore = the_greed.Score(mapping, check=False)
1067 
1068  alns = dict()
1069  for ref_group, mdl_group in zip(self.chem_groupschem_groups, mapping):
1070  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1071  if ref_ch is not None and mdl_ch is not None:
1072  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1073  aln.AttachView(0, _CSel(self.targettarget, [ref_ch]))
1074  aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1075  alns[(ref_ch, mdl_ch)] = aln
1076 
1077  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1078  mapping, alns, opt_score = opt_qsscore)
1079 
1080  def GetRMSDMapping(self, model, strategy = "heuristic", subsampling=50,
1081  chem_mapping_result = None, heuristic_n_max_naive = 120):
1082  """Identify chain mapping based on minimal RMSD superposition
1083 
1084  Superposition and scoring is based on CA/C3' positions which are present
1085  in all chains of a :attr:`chem_groups` as well as the *model*
1086  chains which are mapped to that respective chem group.
1087 
1088  The following strategies are available:
1089 
1090  * **naive**: Naively iterate all possible mappings and return the one
1091  with lowes RMSD.
1092 
1093  * **greedy_single**: perform all vs. all single chain superpositions
1094  within the respective ref/mdl chem groups to use as starting points.
1095  For each starting point, iteratively add the model/target chain pair
1096  with lowest RMSD until a full mapping is achieved. The mapping with
1097  lowest RMSD is returned.
1098 
1099  * **greedy_iterative**: Same as greedy_single_rmsd exept that the
1100  transformation gets updated with each added chain pair.
1101 
1102  * **heuristic**: Uses *naive* strategy if number of possible mappings
1103  is within *heuristic_n_max_naive*. The default of 120 corresponds
1104  to a homo-pentamer (5!=120). If the number of possible mappings is
1105  larger, *greedy_iterative* is used.
1106 
1107  :param model: Model to map
1108  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1109  :param strategy: Strategy for sampling. Must be in ["naive",
1110  "greedy_single", "greedy_iterative"]
1111  :type strategy: :class:`str`
1112  :param subsampling: If given, only an equally distributed subset
1113  of CA/C3' positions in each chain are used for
1114  superposition/scoring.
1115  :type subsampling: :class:`int`
1116  :param chem_mapping_result: Pro param. The result of
1117  :func:`~GetChemMapping` where you provided
1118  *model*. If set, *model* parameter is not
1119  used.
1120  :type chem_mapping_result: :class:`tuple`
1121  :returns: A :class:`MappingResult`
1122  """
1123 
1124  strategies = ["naive", "greedy_single", "greedy_iterative", "heuristic"]
1125 
1126  if strategy not in strategies:
1127  raise RuntimeError(f"strategy must be {strategies}")
1128 
1129  if chem_mapping_result is None:
1130  chem_mapping, chem_group_alns, mdl = self.GetChemMappingGetChemMapping(model)
1131  else:
1132  chem_mapping, chem_group_alns, mdl = chem_mapping_result
1133  ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
1134  self.chem_group_alignmentschem_group_alignments,
1135  chem_mapping,
1136  chem_group_alns)
1137 
1138  # check for the simplest case
1139  one_to_one = _CheckOneToOneMapping(self.chem_groupschem_groups, chem_mapping)
1140  if one_to_one is not None:
1141  alns = dict()
1142  for ref_group, mdl_group in zip(self.chem_groupschem_groups, one_to_one):
1143  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1144  if ref_ch is not None and mdl_ch is not None:
1145  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1146  aln.AttachView(0, _CSel(self.targettarget, [ref_ch]))
1147  aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1148  alns[(ref_ch, mdl_ch)] = aln
1149  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1150  one_to_one, alns)
1151 
1152  trg_group_pos, mdl_group_pos = _GetRefPos(self.targettarget, mdl,
1153  self.chem_group_alignmentschem_group_alignments,
1154  chem_group_alns,
1155  max_pos = subsampling)
1156 
1157  if strategy == "heuristic":
1158  if _NMappingsWithin(self.chem_groupschem_groups, chem_mapping,
1159  heuristic_n_max_naive):
1160  strategy = "naive"
1161  else:
1162  strategy = "greedy_iterative"
1163 
1164  mapping = None
1165 
1166  if strategy.startswith("greedy"):
1167  # get transforms of any mdl chain onto any trg chain in same chem
1168  # group that fulfills gdtts threshold
1169  initial_transforms = list()
1170  initial_mappings = list()
1171  for trg_pos, trg_chains, mdl_pos, mdl_chains in zip(trg_group_pos,
1172  self.chem_groupschem_groups,
1173  mdl_group_pos,
1174  chem_mapping):
1175  for t_pos, t in zip(trg_pos, trg_chains):
1176  for m_pos, m in zip(mdl_pos, mdl_chains):
1177  if len(t_pos) >= 3 and len(m_pos) >= 3:
1178  transform = _GetTransform(m_pos, t_pos, False)
1179  initial_transforms.append(transform)
1180  initial_mappings.append((t,m))
1181 
1182  if strategy == "greedy_single":
1183  mapping = _SingleRigidRMSD(initial_transforms,
1184  initial_mappings,
1185  self.chem_groupschem_groups,
1186  chem_mapping,
1187  trg_group_pos,
1188  mdl_group_pos)
1189 
1190 
1191  elif strategy == "greedy_iterative":
1192  mapping = _IterativeRigidRMSD(initial_transforms,
1193  initial_mappings,
1194  self.chem_groupschem_groups, chem_mapping,
1195  trg_group_pos, mdl_group_pos)
1196  elif strategy == "naive":
1197  mapping = _NaiveRMSD(self.chem_groupschem_groups, chem_mapping,
1198  trg_group_pos, mdl_group_pos,
1199  self.n_max_naiven_max_naive)
1200 
1201  # translate mapping format and return
1202  final_mapping = list()
1203  for ref_chains in self.chem_groupschem_groups:
1204  mapped_mdl_chains = list()
1205  for ref_ch in ref_chains:
1206  if ref_ch in mapping:
1207  mapped_mdl_chains.append(mapping[ref_ch])
1208  else:
1209  mapped_mdl_chains.append(None)
1210  final_mapping.append(mapped_mdl_chains)
1211 
1212  alns = dict()
1213  for ref_group, mdl_group in zip(self.chem_groupschem_groups, final_mapping):
1214  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1215  if ref_ch is not None and mdl_ch is not None:
1216  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1217  aln.AttachView(0, _CSel(self.targettarget, [ref_ch]))
1218  aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1219  alns[(ref_ch, mdl_ch)] = aln
1220 
1221  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1222  final_mapping, alns)
1223 
1224  def GetMapping(self, model, n_max_naive = 40320):
1225  """ Convenience function to get mapping with currently preferred method
1226 
1227  If number of possible chain mappings is <= *n_max_naive*, a naive
1228  QS-score mapping is performed and optimal QS-score is guaranteed.
1229  For anything else, a QS-score mapping with the greedy_full strategy is
1230  performed (greedy_prune_contact_map = True). The default for
1231  *n_max_naive* of 40320 corresponds to an octamer (8!=40320). A
1232  structure with stoichiometry A6B2 would be 6!*2!=1440 etc.
1233  """
1234  return self.GetQSScoreMappingGetQSScoreMapping(model, strategy="heuristic",
1235  greedy_prune_contact_map=True,
1236  heuristic_n_max_naive = n_max_naive)
1237 
1238  def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0,
1239  thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False,
1240  only_interchain=False, chem_mapping_result = None,
1241  global_mapping = None):
1242  """ Identify *topn* representations of *substructure* in *model*
1243 
1244  *substructure* defines a subset of :attr:`~target` for which one
1245  wants the *topn* representations in *model*. Representations are scored
1246  and sorted by lDDT.
1247 
1248  :param substructure: A :class:`ost.mol.EntityView` which is a subset of
1249  :attr:`~target`. Should be selected with the
1250  OpenStructure query language. Example: if you're
1251  interested in residues with number 42,43 and 85 in
1252  chain A:
1253  ``substructure=mapper.target.Select("cname=A and rnum=42,43,85")``
1254  A :class:`RuntimeError` is raised if *substructure*
1255  does not refer to the same underlying
1256  :class:`ost.mol.EntityHandle` as :attr:`~target`.
1257  :type substructure: :class:`ost.mol.EntityView`
1258  :param model: Structure in which one wants to find representations for
1259  *substructure*
1260  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1261  :param topn: Max number of representations that are returned
1262  :type topn: :class:`int`
1263  :param inclusion_radius: Inclusion radius for lDDT
1264  :type inclusion_radius: :class:`float`
1265  :param thresholds: Thresholds for lDDT
1266  :type thresholds: :class:`list` of :class:`float`
1267  :param bb_only: Only consider backbone atoms in lDDT computation
1268  :type bb_only: :class:`bool`
1269  :param only_interchain: Only score interchain contacts in lDDT. Useful
1270  if you want to identify interface patches.
1271  :type only_interchain: :class:`bool`
1272  :param chem_mapping_result: Pro param. The result of
1273  :func:`~GetChemMapping` where you provided
1274  *model*. If set, *model* parameter is not
1275  used.
1276  :type chem_mapping_result: :class:`tuple`
1277  :param global_mapping: Pro param. Specify a global mapping result. This
1278  fully defines the desired representation in the
1279  model but extracts it and enriches it with all
1280  the nice attributes of :class:`ReprResult`.
1281  The target attribute in *global_mapping* must be
1282  of the same entity as self.target and the model
1283  attribute of *global_mapping* must be of the same
1284  entity as *model*.
1285  :type global_mapping: :class:`MappingResult`
1286  :returns: :class:`list` of :class:`ReprResult`
1287  """
1288 
1289  if topn < 1:
1290  raise RuntimeError("topn must be >= 1")
1291 
1292  if global_mapping is not None:
1293  # ensure that this mapping is derived from the same structures
1294  if global_mapping.target.handle.GetHashCode() != \
1295  self.targettarget.handle.GetHashCode():
1296  raise RuntimeError("global_mapping.target must be the same "
1297  "entity as self.target")
1298  if global_mapping.model.handle.GetHashCode() != \
1299  model.handle.GetHashCode():
1300  raise RuntimeError("global_mapping.model must be the same "
1301  "entity as model param")
1302 
1303  # check whether substructure really is a subset of self.target
1304  for r in substructure.residues:
1305  ch_name = r.GetChain().GetName()
1306  rnum = r.GetNumber()
1307  target_r = self.targettarget.FindResidue(ch_name, rnum)
1308  if not target_r.IsValid():
1309  raise RuntimeError(f"substructure has residue "
1310  f"{r.GetQualifiedName()} which is not in "
1311  f"self.target")
1312  if target_r.handle.GetHashCode() != r.handle.GetHashCode():
1313  raise RuntimeError(f"substructure has residue "
1314  f"{r.GetQualifiedName()} which has an "
1315  f"equivalent in self.target but it does "
1316  f"not refer to the same underlying "
1317  f"EntityHandle")
1318  for a in r.atoms:
1319  target_a = target_r.FindAtom(a.GetName())
1320  if not target_a.IsValid():
1321  raise RuntimeError(f"substructure has atom "
1322  f"{a.GetQualifiedName()} which is not "
1323  f"in self.target")
1324  if a.handle.GetHashCode() != target_a.handle.GetHashCode():
1325  raise RuntimeError(f"substructure has atom "
1326  f"{a.GetQualifiedName()} which has an "
1327  f"equivalent in self.target but it does "
1328  f"not refer to the same underlying "
1329  f"EntityHandle")
1330 
1331  # check whether it contains either CA or C3'
1332  ca = r.FindAtom("CA")
1333  c3 = r.FindAtom("C3'") # FindAtom with prime in string is tested
1334  # and works
1335  if not ca.IsValid() and not c3.IsValid():
1336  raise RuntimeError("All residues in substructure must contain "
1337  "a backbone atom named CA or C3\'")
1338 
1339  # perform mapping and alignments on full structures
1340  if chem_mapping_result is None:
1341  chem_mapping, chem_group_alns, mdl = self.GetChemMappingGetChemMapping(model)
1342  else:
1343  chem_mapping, chem_group_alns, mdl = chem_mapping_result
1344  ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
1345  self.chem_group_alignmentschem_group_alignments,
1346  chem_mapping,
1347  chem_group_alns)
1348 
1349  # Get residue indices relative to full target chain
1350  substructure_res_indices = dict()
1351  for ch in substructure.chains:
1352  full_ch = self.targettarget.FindChain(ch.GetName())
1353  idx = [full_ch.GetResidueIndex(r.GetNumber()) for r in ch.residues]
1354  substructure_res_indices[ch.GetName()] = idx
1355 
1356  # strip down variables to make them specific to substructure
1357  # keep only chem_groups which are present in substructure
1358  substructure_chem_groups = list()
1359  substructure_chem_mapping = list()
1360 
1361  chnames = set([ch.GetName() for ch in substructure.chains])
1362  for chem_group, mapping in zip(self.chem_groupschem_groups, chem_mapping):
1363  substructure_chem_group = [ch for ch in chem_group if ch in chnames]
1364  if len(substructure_chem_group) > 0:
1365  substructure_chem_groups.append(substructure_chem_group)
1366  substructure_chem_mapping.append(mapping)
1367 
1368  # early stopping if no mdl chain can be mapped to substructure
1369  n_mapped_mdl_chains = sum([len(m) for m in substructure_chem_mapping])
1370  if n_mapped_mdl_chains == 0:
1371  return list()
1372 
1373  # strip the reference sequence in alignments to only contain
1374  # sequence from substructure
1375  substructure_ref_mdl_alns = dict()
1376  mdl_views = dict()
1377  for ch in mdl.chains:
1378  mdl_views[ch.GetName()] = _CSel(mdl, [ch.GetName()])
1379  for chem_group, mapping in zip(substructure_chem_groups,
1380  substructure_chem_mapping):
1381  for ref_ch in chem_group:
1382  for mdl_ch in mapping:
1383  full_aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1384  ref_seq = full_aln.GetSequence(0)
1385  # the ref sequence is tricky... we start with a gap only
1386  # sequence and only add olcs as defined by the residue
1387  # indices that we extracted before...
1388  tmp = ['-'] * len(full_aln)
1389  for idx in substructure_res_indices[ref_ch]:
1390  idx_in_seq = ref_seq.GetPos(idx)
1391  tmp[idx_in_seq] = ref_seq[idx_in_seq]
1392  ref_seq = seq.CreateSequence(ref_ch, ''.join(tmp))
1393  ref_seq.AttachView(_CSel(substructure, [ref_ch]))
1394  mdl_seq = full_aln.GetSequence(1)
1395  mdl_seq = seq.CreateSequence(mdl_seq.GetName(),
1396  mdl_seq.GetString())
1397  mdl_seq.AttachView(mdl_views[mdl_ch])
1398  aln = seq.CreateAlignment()
1399  aln.AddSequence(ref_seq)
1400  aln.AddSequence(mdl_seq)
1401  substructure_ref_mdl_alns[(ref_ch, mdl_ch)] = aln
1402 
1403  lddt_scorer = lddt.lDDTScorer(substructure,
1404  inclusion_radius = inclusion_radius,
1405  bb_only = bb_only)
1406  scored_mappings = list()
1407 
1408  if global_mapping:
1409  # construct mapping of substructure from global mapping
1410  flat_mapping = global_mapping.GetFlatMapping()
1411  mapping = list()
1412  for chem_group, chem_mapping in zip(substructure_chem_groups,
1413  substructure_chem_mapping):
1414  chem_group_mapping = list()
1415  for ch in chem_group:
1416  if ch in flat_mapping:
1417  mdl_ch = flat_mapping[ch]
1418  if mdl_ch in chem_mapping:
1419  chem_group_mapping.append(mdl_ch)
1420  else:
1421  chem_group_mapping.append(None)
1422  else:
1423  chem_group_mapping.append(None)
1424  mapping.append(chem_group_mapping)
1425  mappings = [mapping]
1426  else:
1427  mappings = list(_ChainMappings(substructure_chem_groups,
1428  substructure_chem_mapping,
1429  self.n_max_naiven_max_naive))
1430 
1431  for mapping in mappings:
1432  # chain_mapping and alns as input for lDDT computation
1433  lddt_chain_mapping = dict()
1434  lddt_alns = dict()
1435  n_res_aln = 0
1436  for ref_chem_group, mdl_chem_group in zip(substructure_chem_groups,
1437  mapping):
1438  for ref_ch, mdl_ch in zip(ref_chem_group, mdl_chem_group):
1439  # some mdl chains can be None
1440  if mdl_ch is not None:
1441  lddt_chain_mapping[mdl_ch] = ref_ch
1442  aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1443  lddt_alns[mdl_ch] = aln
1444  tmp = [int(c[0] != '-' and c[1] != '-') for c in aln]
1445  n_res_aln += sum(tmp)
1446  # don't compute lDDT if no single residue in mdl and ref is aligned
1447  if n_res_aln == 0:
1448  continue
1449 
1450  lDDT, _ = lddt_scorer.lDDT(mdl, thresholds=thresholds,
1451  chain_mapping=lddt_chain_mapping,
1452  residue_mapping = lddt_alns,
1453  check_resnames = False,
1454  no_intrachain = only_interchain)
1455 
1456  if lDDT is None:
1457  ost.LogVerbose("No valid contacts in the reference")
1458  lDDT = 0.0 # that means, that we have not a single valid contact
1459  # in lDDT. For the code below to work, we just set it
1460  # to a terrible score => 0.0
1461 
1462  if len(scored_mappings) == 0:
1463  scored_mappings.append((lDDT, mapping))
1464  elif len(scored_mappings) < topn:
1465  scored_mappings.append((lDDT, mapping))
1466  scored_mappings.sort(reverse=True, key=lambda x: x[0])
1467  elif lDDT > scored_mappings[-1][0]:
1468  scored_mappings.append((lDDT, mapping))
1469  scored_mappings.sort(reverse=True, key=lambda x: x[0])
1470  scored_mappings = scored_mappings[:topn]
1471 
1472  # finalize and return
1473  results = list()
1474  for scored_mapping in scored_mappings:
1475  ref_view = substructure.handle.CreateEmptyView()
1476  mdl_view = mdl.handle.CreateEmptyView()
1477  for ref_ch_group, mdl_ch_group in zip(substructure_chem_groups,
1478  scored_mapping[1]):
1479  for ref_ch, mdl_ch in zip(ref_ch_group, mdl_ch_group):
1480  if ref_ch is not None and mdl_ch is not None:
1481  aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1482  for col in aln:
1483  if col[0] != '-' and col[1] != '-':
1484  ref_view.AddResidue(col.GetResidue(0),
1485  mol.ViewAddFlag.INCLUDE_ALL)
1486  mdl_view.AddResidue(col.GetResidue(1),
1487  mol.ViewAddFlag.INCLUDE_ALL)
1488  results.append(ReprResult(scored_mapping[0], substructure,
1489  ref_view, mdl_view))
1490  return results
1491 
1492  def GetNMappings(self, model):
1493  """ Returns number of possible mappings
1494 
1495  :param model: Model with chains that are mapped onto
1496  :attr:`chem_groups`
1497  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1498  """
1499  chem_mapping, chem_group_alns, mdl = self.GetChemMappingGetChemMapping(model)
1500  return _NMappings(self.chem_groupschem_groups, chem_mapping)
1501 
1502  def ProcessStructure(self, ent):
1503  """ Entity processing for chain mapping
1504 
1505  * Selects view containing peptide and nucleotide residues which have
1506  required backbone atoms present - for peptide residues thats
1507  N, CA, C and CB (no CB for GLY), for nucleotide residues thats
1508  O5', C5', C4', C3' and O3'.
1509  * filters view by chain lengths, see *min_pep_length* and
1510  *min_nuc_length* in constructor
1511  * Extracts atom sequences for each chain in that view
1512  * Attaches corresponding :class:`ost.mol.EntityView` to each sequence
1513  * If residue number alignments are used, strictly increasing residue
1514  numbers without insertion codes are ensured in each chain
1515 
1516  :param ent: Entity to process
1517  :type ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1518  :returns: Tuple with 3 elements: 1) :class:`ost.mol.EntityView`
1519  containing peptide and nucleotide residues 2)
1520  :class:`ost.seq.SequenceList` containing ATOMSEQ sequences
1521  for each polypeptide chain in returned view, sequences have
1522  :class:`ost.mol.EntityView` of according chains attached
1523  3) same for polynucleotide chains
1524  """
1525  view = ent.CreateEmptyView()
1526  exp_pep_atoms = ["N", "CA", "C", "CB"]
1527  exp_nuc_atoms = ["\"O5'\"", "\"C5'\"", "\"C4'\"", "\"C3'\"", "\"O3'\""]
1528  pep_query = "peptide=true and aname=" + ','.join(exp_pep_atoms)
1529  nuc_query = "nucleotide=true and aname=" + ','.join(exp_nuc_atoms)
1530 
1531  pep_sel = ent.Select(pep_query)
1532  for r in pep_sel.residues:
1533  if len(r.atoms) == 4:
1534  view.AddResidue(r.handle, mol.INCLUDE_ALL)
1535  elif r.name == "GLY" and len(r.atoms) == 3:
1536  atom_names = [a.GetName() for a in r.atoms]
1537  if sorted(atom_names) == ["C", "CA", "N"]:
1538  view.AddResidue(r.handle, mol.INCLUDE_ALL)
1539 
1540  nuc_sel = ent.Select(nuc_query)
1541  for r in nuc_sel.residues:
1542  if len(r.atoms) == 5:
1543  view.AddResidue(r.handle, mol.INCLUDE_ALL)
1544 
1545  polypep_seqs = seq.CreateSequenceList()
1546  polynuc_seqs = seq.CreateSequenceList()
1547 
1548  if len(view.residues) == 0:
1549  # no residues survived => return
1550  return (view, polypep_seqs, polynuc_seqs)
1551 
1552  for ch in view.chains:
1553  n_res = len(ch.residues)
1554  n_pep = sum([r.IsPeptideLinking() for r in ch.residues])
1555  n_nuc = sum([r.IsNucleotideLinking() for r in ch.residues])
1556 
1557  # guarantee that we have either pep or nuc (no mix of the two)
1558  if n_pep > 0 and n_nuc > 0:
1559  raise RuntimeError(f"Must not mix peptide and nucleotide linking "
1560  f"residues in same chain ({ch.GetName()})")
1561 
1562  if (n_pep + n_nuc) != n_res:
1563  raise RuntimeError("All residues must either be peptide_linking "
1564  "or nucleotide_linking")
1565 
1566  # filter out short chains
1567  if n_pep > 0 and n_pep < self.min_pep_lengthmin_pep_length:
1568  continue
1569 
1570  if n_nuc > 0 and n_nuc < self.min_nuc_lengthmin_nuc_length:
1571  continue
1572 
1573  # the superfast residue number based alignment adds some
1574  # restrictions on the numbers themselves:
1575  # 1) no insertion codes 2) strictly increasing
1576  if self.resnum_alignmentsresnum_alignments:
1577  # check if no insertion codes are present in residue numbers
1578  ins_codes = [r.GetNumber().GetInsCode() for r in ch.residues]
1579  if len(set(ins_codes)) != 1 or ins_codes[0] != '\0':
1580  raise RuntimeError("Residue numbers in input structures must not "
1581  "contain insertion codes")
1582 
1583  # check if residue numbers are strictly increasing
1584  nums = [r.GetNumber().GetNum() for r in ch.residues]
1585  if not all(i < j for i, j in zip(nums, nums[1:])):
1586  raise RuntimeError("Residue numbers in input structures must be "
1587  "strictly increasing for each chain")
1588 
1589  s = ''.join([r.one_letter_code for r in ch.residues])
1590  s = seq.CreateSequence(ch.GetName(), s)
1591  s.AttachView(_CSel(view, [ch.GetName()]))
1592  if n_pep == n_res:
1593  polypep_seqs.AddSequence(s)
1594  elif n_nuc == n_res:
1595  polynuc_seqs.AddSequence(s)
1596  else:
1597  raise RuntimeError("This shouldnt happen")
1598 
1599  if len(polypep_seqs) == 0 and len(polynuc_seqs) == 0:
1600  raise RuntimeError(f"No chain fulfilled minimum length requirement "
1601  f"to be considered in chain mapping "
1602  f"({self.min_pep_length} for peptide chains, "
1603  f"{self.min_nuc_length} for nucleotide chains) "
1604  f"- mapping failed")
1605 
1606  # select for chains for which we actually extracted the sequence
1607  chain_names = [s.GetAttachedView().chains[0].name for s in polypep_seqs]
1608  chain_names += [s.GetAttachedView().chains[0].name for s in polynuc_seqs]
1609  view = _CSel(view, chain_names)
1610 
1611  return (view, polypep_seqs, polynuc_seqs)
1612 
1613  def Align(self, s1, s2, stype):
1614  """ Access to internal sequence alignment functionality
1615 
1616  Alignment parameterization is setup at ChainMapper construction
1617 
1618  :param s1: First sequence to align - must have view attached in case
1619  of resnum_alignments
1620  :type s1: :class:`ost.seq.SequenceHandle`
1621  :param s2: Second sequence to align - must have view attached in case
1622  of resnum_alignments
1623  :type s2: :class:`ost.seq.SequenceHandle`
1624  :param stype: Type of sequences to align, must be in
1625  [:class:`ost.mol.ChemType.AMINOACIDS`,
1626  :class:`ost.mol.ChemType.NUCLEOTIDES`]
1627  :returns: Pairwise alignment of s1 and s2
1628  """
1629  if stype not in [mol.ChemType.AMINOACIDS, mol.ChemType.NUCLEOTIDES]:
1630  raise RuntimeError("stype must be ost.mol.ChemType.AMINOACIDS or "
1631  "ost.mol.ChemType.NUCLEOTIDES")
1632  return self.aligneraligner.Align(s1, s2, chem_type = stype)
1633 
1634 
1635 # INTERNAL HELPERS
1636 
1637 class _Aligner:
1638  def __init__(self, pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -5,
1639  pep_gap_ext = -2, nuc_subst_mat = seq.alg.NUC44,
1640  nuc_gap_open = -4, nuc_gap_ext = -4, resnum_aln = False):
1641  """ Helper class to compute alignments
1642 
1643  Sets default values for substitution matrix, gap open and gap extension
1644  penalties. They are only used in default mode (Needleman-Wunsch aln).
1645  If *resnum_aln* is True, only residue numbers of views that are attached
1646  to input sequences are considered.
1647  """
1648  self.pep_subst_matpep_subst_mat = pep_subst_mat
1649  self.pep_gap_openpep_gap_open = pep_gap_open
1650  self.pep_gap_extpep_gap_ext = pep_gap_ext
1651  self.nuc_subst_matnuc_subst_mat = nuc_subst_mat
1652  self.nuc_gap_opennuc_gap_open = nuc_gap_open
1653  self.nuc_gap_extnuc_gap_ext = nuc_gap_ext
1654  self.resnum_alnresnum_aln = resnum_aln
1655 
1656  def Align(self, s1, s2, chem_type=None):
1657  if self.resnum_alnresnum_aln:
1658  return self.ResNumAlignResNumAlign(s1, s2)
1659  else:
1660  if chem_type is None:
1661  raise RuntimeError("Must specify chem_type for NW alignment")
1662  return self.NWAlignNWAlign(s1, s2, chem_type)
1663 
1664  def NWAlign(self, s1, s2, chem_type):
1665  """ Returns pairwise alignment using Needleman-Wunsch algorithm
1666 
1667  :param s1: First sequence to align
1668  :type s1: :class:`ost.seq.SequenceHandle`
1669  :param s2: Second sequence to align
1670  :type s2: :class:`ost.seq.SequenceHandle`
1671  :param chem_type: Must be in [:class:`ost.mol.ChemType.AMINOACIDS`,
1672  :class:`ost.mol.ChemType.NUCLEOTIDES`], determines
1673  substitution matrix and gap open/extension penalties
1674  :type chem_type: :class:`ost.mol.ChemType`
1675  :returns: Alignment with s1 as first and s2 as second sequence
1676  """
1677  if chem_type == mol.ChemType.AMINOACIDS:
1678  return seq.alg.SemiGlobalAlign(s1, s2, self.pep_subst_matpep_subst_mat,
1679  gap_open=self.pep_gap_openpep_gap_open,
1680  gap_ext=self.pep_gap_extpep_gap_ext)[0]
1681  elif chem_type == mol.ChemType.NUCLEOTIDES:
1682  return seq.alg.SemiGlobalAlign(s1, s2, self.nuc_subst_matnuc_subst_mat,
1683  gap_open=self.nuc_gap_opennuc_gap_open,
1684  gap_ext=self.nuc_gap_extnuc_gap_ext)[0]
1685  else:
1686  raise RuntimeError("Invalid ChemType")
1687  return aln
1688 
1689  def ResNumAlign(self, s1, s2):
1690  """ Returns pairwise alignment using residue numbers of attached views
1691 
1692  Assumes that there are no insertion codes (alignment only on numerical
1693  component) and that resnums are strictly increasing (fast min/max
1694  identification). These requirements are assured if a structure has been
1695  processed by :class:`ChainMapper`.
1696 
1697  :param s1: First sequence to align, must have :class:`ost.mol.EntityView`
1698  attached
1699  :type s1: :class:`ost.seq.SequenceHandle`
1700  :param s2: Second sequence to align, must have :class:`ost.mol.EntityView`
1701  attached
1702  :type s2: :class:`ost.seq.SequenceHandle`
1703  """
1704  assert(s1.HasAttachedView())
1705  assert(s2.HasAttachedView())
1706  v1 = s1.GetAttachedView()
1707  rnums1 = [r.GetNumber().GetNum() for r in v1.residues]
1708  v2 = s2.GetAttachedView()
1709  rnums2 = [r.GetNumber().GetNum() for r in v2.residues]
1710 
1711  min_num = min(rnums1[0], rnums2[0])
1712  max_num = max(rnums1[-1], rnums2[-1])
1713  aln_length = max_num - min_num + 1
1714 
1715  aln_s1 = ['-'] * aln_length
1716  for r, rnum in zip(v1.residues, rnums1):
1717  aln_s1[rnum-min_num] = r.one_letter_code
1718 
1719  aln_s2 = ['-'] * aln_length
1720  for r, rnum in zip(v2.residues, rnums2):
1721  aln_s2[rnum-min_num] = r.one_letter_code
1722 
1723  aln = seq.CreateAlignment()
1724  aln.AddSequence(seq.CreateSequence(s1.GetName(), ''.join(aln_s1)))
1725  aln.AddSequence(seq.CreateSequence(s2.GetName(), ''.join(aln_s2)))
1726  return aln
1727 
1728 def _GetAlnPropsTwo(aln):
1729  """Returns basic properties of *aln* version two...
1730 
1731  :param aln: Alignment to compute properties
1732  :type aln: :class:`seq.AlignmentHandle`
1733  :returns: Tuple with 2 elements. 1) sequence identify in range [0, 100]
1734  considering aligned columns 2) Fraction of non-gap characters
1735  in first sequence that are covered by non-gap characters in
1736  second sequence.
1737  """
1738  assert(aln.GetCount() == 2)
1739  n_tot = sum([1 for col in aln if col[0] != '-'])
1740  n_aligned = sum([1 for col in aln if (col[0] != '-' and col[1] != '-')])
1741  return (seq.alg.SequenceIdentity(aln), float(n_aligned)/n_tot)
1742 
1743 def _GetAlnPropsOne(aln):
1744 
1745  """Returns basic properties of *aln* version one...
1746 
1747  :param aln: Alignment to compute properties
1748  :type aln: :class:`seq.AlignmentHandle`
1749  :returns: Tuple with 3 elements. 1) sequence identify in range [0, 100]
1750  considering aligned columns 2) Fraction of gaps between
1751  first and last aligned column in s1 3) same for s2.
1752  """
1753  assert(aln.GetCount() == 2)
1754  n_gaps_1 = str(aln.GetSequence(0)).strip('-').count('-')
1755  n_gaps_2 = str(aln.GetSequence(1)).strip('-').count('-')
1756  gap_frac_1 = float(n_gaps_1)/len(aln.GetSequence(0).GetGaplessString())
1757  gap_frac_2 = float(n_gaps_2)/len(aln.GetSequence(1).GetGaplessString())
1758  return (seq.alg.SequenceIdentity(aln), gap_frac_1, gap_frac_2)
1759 
1760 def _GetChemGroupAlignments(pep_seqs, nuc_seqs, aligner, pep_seqid_thr=95.,
1761  pep_gap_thr=0.1, nuc_seqid_thr=95.,
1762  nuc_gap_thr=0.1):
1763  """Returns alignments with groups of chemically equivalent chains
1764 
1765  :param pep_seqs: List of polypeptide sequences
1766  :type pep_seqs: :class:`seq.SequenceList`
1767  :param nuc_seqs: List of polynucleotide sequences
1768  :type nuc_seqs: :class:`seq.SequenceList`
1769  :param aligner: Helper class to generate pairwise alignments
1770  :type aligner: :class:`_Aligner`
1771  :param pep_seqid_thr: Threshold used to decide when two peptide chains are
1772  identical. 95 percent tolerates the few mutations
1773  crystallographers like to do.
1774  :type pep_seqid_thr: :class:`float`
1775  :param pep_gap_thr: Additional threshold to avoid gappy alignments with high
1776  seqid. The reason for not just normalizing seqid by the
1777  longer sequence is that one sequence might be a perfect
1778  subsequence of the other but only cover half of it. This
1779  threshold checks for a maximum allowed fraction of gaps
1780  in any of the two sequences after stripping terminal gaps.
1781  :type pep_gap_thr: :class:`float`
1782  :param nuc_seqid_thr: Nucleotide equivalent of *pep_seqid_thr*
1783  :type nuc_seqid_thr: :class:`float`
1784  :param nuc_gap_thr: Nucleotide equivalent of *nuc_gap_thr*
1785  :type nuc_gap_thr: :class:`float`
1786  :returns: Tuple with first element being an AlignmentList. Each alignment
1787  represents a group of chemically equivalent chains and the first
1788  sequence is the longest. Second element is a list of equivalent
1789  length specifying the types of the groups. List elements are in
1790  [:class:`ost.ChemType.AMINOACIDS`,
1791  :class:`ost.ChemType.NUCLEOTIDES`]
1792  """
1793  pep_groups = _GroupSequences(pep_seqs, pep_seqid_thr, pep_gap_thr, aligner,
1794  mol.ChemType.AMINOACIDS)
1795  nuc_groups = _GroupSequences(nuc_seqs, nuc_seqid_thr, nuc_gap_thr, aligner,
1796  mol.ChemType.NUCLEOTIDES)
1797  group_types = [mol.ChemType.AMINOACIDS] * len(pep_groups)
1798  group_types += [mol.ChemType.NUCLEOTIDES] * len(nuc_groups)
1799  groups = pep_groups
1800  groups.extend(nuc_groups)
1801  return (groups, group_types)
1802 
1803 def _GroupSequences(seqs, seqid_thr, gap_thr, aligner, chem_type):
1804  """Get list of alignments representing groups of equivalent sequences
1805 
1806  :param seqid_thr: Threshold used to decide when two chains are identical.
1807  :type seqid_thr: :class:`float`
1808  :param gap_thr: Additional threshold to avoid gappy alignments with high
1809  seqid. The reason for not just normalizing seqid by the
1810  longer sequence is that one sequence might be a perfect
1811  subsequence of the other but only cover half of it. This
1812  threshold checks for a maximum allowed fraction of gaps
1813  in any of the two sequences after stripping terminal gaps.
1814  :type gap_thr: :class:`float`
1815  :param aligner: Helper class to generate pairwise alignments
1816  :type aligner: :class:`_Aligner`
1817  :param chem_type: ChemType of seqs which is passed to *aligner*, must be in
1818  [:class:`ost.mol.ChemType.AMINOACIDS`,
1819  :class:`ost.mol.ChemType.NUCLEOTIDES`]
1820  :type chem_type: :class:`ost.mol.ChemType`
1821  :returns: A list of alignments, one alignment for each group
1822  with longest sequence (reference) as first sequence.
1823  :rtype: :class:`ost.seq.AlignmentList`
1824  """
1825  groups = list()
1826  for s_idx in range(len(seqs)):
1827  matching_group = None
1828  for g_idx in range(len(groups)):
1829  for g_s_idx in range(len(groups[g_idx])):
1830  aln = aligner.Align(seqs[s_idx], seqs[groups[g_idx][g_s_idx]],
1831  chem_type)
1832  sid, frac_i, frac_j = _GetAlnPropsOne(aln)
1833  if sid >= seqid_thr and frac_i < gap_thr and frac_j < gap_thr:
1834  matching_group = g_idx
1835  break
1836  if matching_group is not None:
1837  break
1838 
1839  if matching_group is None:
1840  groups.append([s_idx])
1841  else:
1842  groups[matching_group].append(s_idx)
1843 
1844  # sort based on sequence length
1845  sorted_groups = list()
1846  for g in groups:
1847  if len(g) > 1:
1848  tmp = sorted([[len(seqs[i]), i] for i in g], reverse=True)
1849  sorted_groups.append([x[1] for x in tmp])
1850  else:
1851  sorted_groups.append(g)
1852 
1853  # translate from indices back to sequences and directly generate alignments
1854  # of the groups with the longest (first) sequence as reference
1855  aln_list = seq.AlignmentList()
1856  for g in sorted_groups:
1857  if len(g) == 1:
1858  # aln with one single sequence
1859  aln_list.append(seq.CreateAlignment(seqs[g[0]]))
1860  else:
1861  # obtain pairwise aln of first sequence (reference) to all others
1862  alns = seq.AlignmentList()
1863  i = g[0]
1864  for j in g[1:]:
1865  alns.append(aligner.Align(seqs[i], seqs[j], chem_type))
1866  # and merge
1867  aln_list.append(seq.alg.MergePairwiseAlignments(alns, seqs[i]))
1868 
1869  # transfer attached views
1870  seq_dict = {s.GetName(): s for s in seqs}
1871  for aln_idx in range(len(aln_list)):
1872  for aln_s_idx in range(aln_list[aln_idx].GetCount()):
1873  s_name = aln_list[aln_idx].GetSequence(aln_s_idx).GetName()
1874  s = seq_dict[s_name]
1875  aln_list[aln_idx].AttachView(aln_s_idx, s.GetAttachedView())
1876 
1877  return aln_list
1878 
1879 def _MapSequence(ref_seqs, ref_types, s, s_type, aligner):
1880  """Tries top map *s* onto any of the sequences in *ref_seqs*
1881 
1882  Computes alignments of *s* to each of the reference sequences of equal type
1883  and sorts them by seqid*fraction_covered (seqid: sequence identity of
1884  aligned columns in alignment, fraction_covered: Fraction of non-gap
1885  characters in reference sequence that are covered by non-gap characters in
1886  *s*). Best scoring mapping is returned.
1887 
1888  :param ref_seqs: Reference sequences
1889  :type ref_seqs: :class:`ost.seq.SequenceList`
1890  :param ref_types: Types of reference sequences, e.g.
1891  ost.mol.ChemType.AminoAcids
1892  :type ref_types: :class:`list` of :class:`ost.mol.ChemType`
1893  :param s: Sequence to map
1894  :type s: :class:`ost.seq.SequenceHandle`
1895  :param s_type: Type of *s*, only try mapping to sequences in *ref_seqs*
1896  with equal type as defined in *ref_types*
1897  :param aligner: Helper class to generate pairwise alignments
1898  :type aligner: :class:`_Aligner`
1899  :returns: Tuple with two elements. 1) index of sequence in *ref_seqs* to
1900  which *s* can be mapped 2) Pairwise sequence alignment with
1901  sequence from *ref_seqs* as first sequence. Both elements are
1902  None if no mapping can be found.
1903  :raises: :class:`RuntimeError` if mapping is ambiguous, i.e. *s*
1904  successfully maps to more than one sequence in *ref_seqs*
1905  """
1906  scored_alns = list()
1907  for ref_idx, ref_seq in enumerate(ref_seqs):
1908  if ref_types[ref_idx] == s_type:
1909  aln = aligner.Align(ref_seq, s, s_type)
1910  seqid, fraction_covered = _GetAlnPropsTwo(aln)
1911  score = seqid * fraction_covered
1912  scored_alns.append((score, ref_idx, aln))
1913 
1914  if len(scored_alns) == 0:
1915  return (None, None) # no mapping possible...
1916 
1917  scored_alns = sorted(scored_alns, key=lambda x: x[0], reverse=True)
1918  return (scored_alns[0][1], scored_alns[0][2])
1919 
1920 def _GetRefMdlAlns(ref_chem_groups, ref_chem_group_msas, mdl_chem_groups,
1921  mdl_chem_group_alns, pairs=None):
1922  """ Get all possible ref/mdl chain alignments given chem group mapping
1923 
1924  :param ref_chem_groups: :attr:`ChainMapper.chem_groups`
1925  :type ref_chem_groups: :class:`list` of :class:`list` of :class:`str`
1926  :param ref_chem_group_msas: :attr:`ChainMapper.chem_group_alignments`
1927  :type ref_chem_group_msas: :class:`ost.seq.AlignmentList`
1928  :param mdl_chem_groups: Groups of model chains that are mapped to
1929  *ref_chem_groups*. Return value of
1930  :func:`ChainMapper.GetChemMapping`.
1931  :type mdl_chem_groups: :class:`list` of :class:`list` of :class:`str`
1932  :param mdl_chem_group_alns: A pairwise sequence alignment for every chain
1933  in *mdl_chem_groups* that aligns these sequences
1934  to the respective reference sequence.
1935  Return values of
1936  :func:`ChainMapper.GetChemMapping`.
1937  :type mdl_chem_group_alns: :class:`list` of :class:`ost.seq.AlignmentList`
1938  :param pairs: Pro param - restrict return dict to specified pairs. A set of
1939  tuples in form (<trg_ch>, <mdl_ch>)
1940  :type pairs: :class:`set`
1941  :returns: A dictionary holding all possible ref/mdl chain alignments. Keys
1942  in that dictionary are tuples of the form (ref_ch, mdl_ch) and
1943  values are the respective pairwise alignments with first sequence
1944  being from ref, the second from mdl.
1945  """
1946  # alignment of each model chain to chem_group reference sequence
1947  mdl_alns = dict()
1948  for alns in mdl_chem_group_alns:
1949  for aln in alns:
1950  mdl_chain_name = aln.GetSequence(1).GetName()
1951  mdl_alns[mdl_chain_name] = aln
1952 
1953  # generate all alignments between ref/mdl chain atomseqs that we will
1954  # ever observe
1955  ref_mdl_alns = dict()
1956  for ref_chains, mdl_chains, ref_aln in zip(ref_chem_groups, mdl_chem_groups,
1957  ref_chem_group_msas):
1958  for ref_ch in ref_chains:
1959  for mdl_ch in mdl_chains:
1960  if pairs is not None and (ref_ch, mdl_ch) not in pairs:
1961  continue
1962  # obtain alignments of mdl and ref chains towards chem
1963  # group ref sequence and merge them
1964  aln_list = seq.AlignmentList()
1965  # do ref aln
1966  s1 = ref_aln.GetSequence(0)
1967  s2 = ref_aln.GetSequence(ref_chains.index(ref_ch))
1968  aln_list.append(seq.CreateAlignment(s1, s2))
1969  # do mdl aln
1970  aln_list.append(mdl_alns[mdl_ch])
1971  # merge
1972  ref_seq = seq.CreateSequence(s1.GetName(),
1973  s1.GetGaplessString())
1974  merged_aln = seq.alg.MergePairwiseAlignments(aln_list,
1975  ref_seq)
1976  # merged_aln:
1977  # seq1: ref seq of chem group
1978  # seq2: seq of ref chain
1979  # seq3: seq of mdl chain
1980  # => we need the alignment between seq2 and seq3
1981  s2 = merged_aln.GetSequence(1)
1982  s3 = merged_aln.GetSequence(2)
1983  # cut leading and trailing gap columns
1984  a = 0 # number of leading gap columns
1985  for idx in range(len(s2)):
1986  if s2[idx] != '-' or s3[idx] != '-':
1987  break
1988  a += 1
1989  b = 0 # number of trailing gap columns
1990  for idx in reversed(range(len(s2))):
1991  if s2[idx] != '-' or s3[idx] != '-':
1992  break
1993  b += 1
1994  s2 = seq.CreateSequence(s2.GetName(), s2[a: len(s2)-b])
1995  s3 = seq.CreateSequence(s3.GetName(), s3[a: len(s3)-b])
1996  ref_mdl_alns[(ref_ch, mdl_ch)] = seq.CreateAlignment(s2, s3)
1997 
1998  return ref_mdl_alns
1999 
2000 def _CheckOneToOneMapping(ref_chains, mdl_chains):
2001  """ Checks whether we already have a perfect one to one mapping
2002 
2003  That means each list in *ref_chains* has exactly one element and each
2004  list in *mdl_chains* has either one element (it's mapped) or is empty
2005  (ref chain has no mapped mdl chain). Returns None if no such mapping
2006  can be found.
2007 
2008  :param ref_chains: corresponds to :attr:`ChainMapper.chem_groups`
2009  :type ref_chains: :class:`list` of :class:`list` of :class:`str`
2010  :param mdl_chains: mdl chains mapped to chem groups in *ref_chains*, i.e.
2011  the return value of :func:`ChainMapper.GetChemMapping`
2012  :type mdl_chains: class:`list` of :class:`list` of :class:`str`
2013  :returns: A :class:`list` of :class:`list` if a one to one mapping is found,
2014  None otherwise
2015  """
2016  only_one_to_one = True
2017  one_to_one = list()
2018  for ref, mdl in zip(ref_chains, mdl_chains):
2019  if len(ref) == 1 and len(mdl) == 1:
2020  one_to_one.append(mdl)
2021  elif len(ref) == 1 and len(mdl) == 0:
2022  one_to_one.append([None])
2023  else:
2024  only_one_to_one = False
2025  break
2026  if only_one_to_one:
2027  return one_to_one
2028  else:
2029  return None
2030 
2032 
2033  def __init__(self, ref, mdl, ref_mdl_alns, inclusion_radius = 15.0,
2034  thresholds = [0.5, 1.0, 2.0, 4.0]):
2035  """ Compute backbone only lDDT scores for ref/mdl
2036 
2037  Uses the pairwise decomposable property of backbone only lDDT and
2038  implements a caching mechanism to efficiently enumerate different
2039  chain mappings.
2040  """
2041 
2042  self.refref = ref
2043  self.mdlmdl = mdl
2044  self.ref_mdl_alnsref_mdl_alns = ref_mdl_alns
2045  self.inclusion_radiusinclusion_radius = inclusion_radius
2046  self.thresholdsthresholds = thresholds
2047 
2048  # keep track of single chains and interfaces in ref
2049  self.ref_chainsref_chains = list() # e.g. ['A', 'B', 'C']
2050  self.ref_interfacesref_interfaces = list() # e.g. [('A', 'B'), ('A', 'C')]
2051 
2052  # holds lDDT scorer for each chain in ref
2053  # key: chain name, value: scorer
2054  self.single_chain_scorersingle_chain_scorer = dict()
2055 
2056  # cache for single chain conserved contacts
2057  # key: tuple (ref_ch, mdl_ch) value: number of conserved contacts
2058  self.single_chain_cachesingle_chain_cache = dict()
2059 
2060  # holds lDDT scorer for each pairwise interface in target
2061  # key: tuple (ref_ch1, ref_ch2), value: scorer
2062  self.interface_scorerinterface_scorer = dict()
2063 
2064  # cache for interface conserved contacts
2065  # key: tuple of tuple ((ref_ch1, ref_ch2),((mdl_ch1, mdl_ch2))
2066  # value: number of conserved contacts
2067  self.interface_cacheinterface_cache = dict()
2068 
2069  self.nn = 0
2070 
2071  self._SetupScorer_SetupScorer()
2072 
2073  def _SetupScorer(self):
2074  for ch in self.refref.chains:
2075  # Select everything close to that chain
2076  query = f"{self.inclusion_radius} <> "
2077  query += f"[cname={mol.QueryQuoteName(ch.GetName())}] "
2078  query += f"and cname!={mol.QueryQuoteName(ch.GetName())}"
2079  for close_ch in self.refref.Select(query).chains:
2080  k1 = (ch.GetName(), close_ch.GetName())
2081  k2 = (close_ch.GetName(), ch.GetName())
2082  if k1 not in self.interface_scorerinterface_scorer and \
2083  k2 not in self.interface_scorerinterface_scorer:
2084  dimer_ref = _CSel(self.refref, [k1[0], k1[1]])
2085  s = lddt.lDDTScorer(dimer_ref, bb_only=True)
2086  self.interface_scorerinterface_scorer[k1] = s
2087  self.interface_scorerinterface_scorer[k2] = s
2088  self.nn += sum([len(x) for x in self.interface_scorerinterface_scorer[k1].ref_indices_ic])
2089  self.ref_interfacesref_interfaces.append(k1)
2090  # single chain scorer are actually interface scorers to save
2091  # some distance calculations
2092  if ch.GetName() not in self.single_chain_scorersingle_chain_scorer:
2093  self.single_chain_scorersingle_chain_scorer[ch.GetName()] = s
2094  self.nn += s.GetNChainContacts(ch.GetName(),
2095  no_interchain=True)
2096  self.ref_chainsref_chains.append(ch.GetName())
2097  if close_ch.GetName() not in self.single_chain_scorersingle_chain_scorer:
2098  self.single_chain_scorersingle_chain_scorer[close_ch.GetName()] = s
2099  self.nn += s.GetNChainContacts(close_ch.GetName(),
2100  no_interchain=True)
2101  self.ref_chainsref_chains.append(close_ch.GetName())
2102 
2103  # add any missing single chain scorer
2104  for ch in self.refref.chains:
2105  if ch.GetName() not in self.single_chain_scorersingle_chain_scorer:
2106  single_chain_ref = _CSel(self.refref, [ch.GetName()])
2107  self.single_chain_scorersingle_chain_scorer[ch.GetName()] = \
2108  lddt.lDDTScorer(single_chain_ref, bb_only = True)
2109  self.nn += self.single_chain_scorersingle_chain_scorer[ch.GetName()].n_distances
2110  self.ref_chainsref_chains.append(ch.GetName())
2111 
2112  def lDDT(self, ref_chain_groups, mdl_chain_groups):
2113 
2114  flat_map = dict()
2115  for ref_chains, mdl_chains in zip(ref_chain_groups, mdl_chain_groups):
2116  for ref_ch, mdl_ch in zip(ref_chains, mdl_chains):
2117  flat_map[ref_ch] = mdl_ch
2118 
2119  return self.lDDTFromFlatMaplDDTFromFlatMap(flat_map)
2120 
2121 
2122  def lDDTFromFlatMap(self, flat_map):
2123  conserved = 0
2124 
2125  # do single chain scores
2126  for ref_ch in self.ref_chainsref_chains:
2127  if ref_ch in flat_map and flat_map[ref_ch] is not None:
2128  conserved += self.SCCountsSCCounts(ref_ch, flat_map[ref_ch])
2129 
2130  # do interfaces
2131  for ref_ch1, ref_ch2 in self.ref_interfacesref_interfaces:
2132  if ref_ch1 in flat_map and ref_ch2 in flat_map:
2133  mdl_ch1 = flat_map[ref_ch1]
2134  mdl_ch2 = flat_map[ref_ch2]
2135  if mdl_ch1 is not None and mdl_ch2 is not None:
2136  conserved += self.IntCountsIntCounts(ref_ch1, ref_ch2, mdl_ch1,
2137  mdl_ch2)
2138 
2139  return conserved / (len(self.thresholdsthresholds) * self.nn)
2140 
2141  def SCCounts(self, ref_ch, mdl_ch):
2142  if not (ref_ch, mdl_ch) in self.single_chain_cachesingle_chain_cache:
2143  alns = dict()
2144  alns[mdl_ch] = self.ref_mdl_alnsref_mdl_alns[(ref_ch, mdl_ch)]
2145  mdl_sel = _CSel(self.mdlmdl, [mdl_ch])
2146  s = self.single_chain_scorersingle_chain_scorer[ref_ch]
2147  _,_,_,conserved,_,_,_ = s.lDDT(mdl_sel,
2148  residue_mapping=alns,
2149  return_dist_test=True,
2150  no_interchain=True,
2151  chain_mapping={mdl_ch: ref_ch},
2152  check_resnames=False)
2153  self.single_chain_cachesingle_chain_cache[(ref_ch, mdl_ch)] = conserved
2154  return self.single_chain_cachesingle_chain_cache[(ref_ch, mdl_ch)]
2155 
2156  def IntCounts(self, ref_ch1, ref_ch2, mdl_ch1, mdl_ch2):
2157  k1 = ((ref_ch1, ref_ch2),(mdl_ch1, mdl_ch2))
2158  k2 = ((ref_ch2, ref_ch1),(mdl_ch2, mdl_ch1))
2159  if k1 not in self.interface_cacheinterface_cache and k2 not in self.interface_cacheinterface_cache:
2160  alns = dict()
2161  alns[mdl_ch1] = self.ref_mdl_alnsref_mdl_alns[(ref_ch1, mdl_ch1)]
2162  alns[mdl_ch2] = self.ref_mdl_alnsref_mdl_alns[(ref_ch2, mdl_ch2)]
2163  mdl_sel = _CSel(self.mdlmdl, [mdl_ch1, mdl_ch2])
2164  s = self.interface_scorerinterface_scorer[(ref_ch1, ref_ch2)]
2165  _,_,_,conserved,_,_,_ = s.lDDT(mdl_sel,
2166  residue_mapping=alns,
2167  return_dist_test=True,
2168  no_intrachain=True,
2169  chain_mapping={mdl_ch1: ref_ch1,
2170  mdl_ch2: ref_ch2},
2171  check_resnames=False)
2172  self.interface_cacheinterface_cache[k1] = conserved
2173  self.interface_cacheinterface_cache[k2] = conserved
2174  return self.interface_cacheinterface_cache[k1]
2175 
2177  def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2178  ref_mdl_alns, inclusion_radius = 15.0,
2179  thresholds = [0.5, 1.0, 2.0, 4.0],
2180  steep_opt_rate = None):
2181  """ Greedy extension of already existing but incomplete chain mappings
2182  """
2183  super().__init__(ref, mdl, ref_mdl_alns,
2184  inclusion_radius = inclusion_radius,
2185  thresholds = thresholds)
2186  self.steep_opt_ratesteep_opt_rate = steep_opt_rate
2187  self.neighborsneighbors = {k: set() for k in self.ref_chainsref_chains}
2188  for k in self.interface_scorerinterface_scorer.keys():
2189  self.neighborsneighbors[k[0]].add(k[1])
2190  self.neighborsneighbors[k[1]].add(k[0])
2191 
2192  assert(len(ref_chem_groups) == len(mdl_chem_groups))
2193  self.ref_chem_groupsref_chem_groups = ref_chem_groups
2194  self.mdl_chem_groupsmdl_chem_groups = mdl_chem_groups
2195  self.ref_ch_group_mapperref_ch_group_mapper = dict()
2196  self.mdl_ch_group_mappermdl_ch_group_mapper = dict()
2197  for g_idx, (ref_g, mdl_g) in enumerate(zip(ref_chem_groups,
2198  mdl_chem_groups)):
2199  for ch in ref_g:
2200  self.ref_ch_group_mapperref_ch_group_mapper[ch] = g_idx
2201  for ch in mdl_g:
2202  self.mdl_ch_group_mappermdl_ch_group_mapper[ch] = g_idx
2203 
2204  # keep track of mdl chains that potentially give lDDT contributions,
2205  # i.e. they have locations within inclusion_radius + max(thresholds)
2206  self.mdl_neighborsmdl_neighbors = dict()
2207  d = self.inclusion_radiusinclusion_radius + max(self.thresholdsthresholds)
2208  for ch in self.mdlmdl.chains:
2209  ch_name = ch.GetName()
2210  self.mdl_neighborsmdl_neighbors[ch_name] = set()
2211  query = f"{d} <> [cname={mol.QueryQuoteName(ch_name)}]"
2212  query += f" and cname !={mol.QueryQuoteName(ch_name)}"
2213  for close_ch in self.mdlmdl.Select(query).chains:
2214  self.mdl_neighborsmdl_neighbors[ch_name].add(close_ch.GetName())
2215 
2216 
2217  def ExtendMapping(self, mapping, max_ext = None):
2218 
2219  if len(mapping) == 0:
2220  raise RuntimError("Mapping must contain a starting point")
2221 
2222  # Ref chains onto which we can map. The algorithm starts with a mapping
2223  # on ref_ch. From there we can start to expand to connected neighbors.
2224  # All neighbors that we can reach from the already mapped chains are
2225  # stored in this set which will be updated during runtime
2226  map_targets = set()
2227  for ref_ch in mapping.keys():
2228  map_targets.update(self.neighborsneighbors[ref_ch])
2229 
2230  # remove the already mapped chains
2231  for ref_ch in mapping.keys():
2232  map_targets.discard(ref_ch)
2233 
2234  if len(map_targets) == 0:
2235  return mapping # nothing to extend
2236 
2237  # keep track of what model chains are not yet mapped for each chem group
2238  free_mdl_chains = list()
2239  for chem_group in self.mdl_chem_groupsmdl_chem_groups:
2240  tmp = [x for x in chem_group if x not in mapping.values()]
2241  free_mdl_chains.append(set(tmp))
2242 
2243  # keep track of what ref chains got a mapping
2244  newly_mapped_ref_chains = list()
2245 
2246  something_happened = True
2247  while something_happened:
2248  something_happened=False
2249 
2250  if self.steep_opt_ratesteep_opt_rate is not None:
2251  n_chains = len(newly_mapped_ref_chains)
2252  if n_chains > 0 and n_chains % self.steep_opt_ratesteep_opt_rate == 0:
2253  mapping = self._SteepOpt_SteepOpt(mapping, newly_mapped_ref_chains)
2254 
2255  if max_ext is not None and len(newly_mapped_ref_chains) >= max_ext:
2256  break
2257 
2258  max_n = 0
2259  max_mapping = None
2260  for ref_ch in map_targets:
2261  chem_group_idx = self.ref_ch_group_mapperref_ch_group_mapper[ref_ch]
2262  for mdl_ch in free_mdl_chains[chem_group_idx]:
2263  # single chain score
2264  n_single = self.SCCountsSCCounts(ref_ch, mdl_ch)
2265  # scores towards neighbors that are already mapped
2266  n_inter = 0
2267  for neighbor in self.neighborsneighbors[ref_ch]:
2268  if neighbor in mapping and mapping[neighbor] in \
2269  self.mdl_neighborsmdl_neighbors[mdl_ch]:
2270  n_inter += self.IntCountsIntCounts(ref_ch, neighbor, mdl_ch,
2271  mapping[neighbor])
2272  n = n_single + n_inter
2273 
2274  if n_inter > 0 and n > max_n:
2275  # Only accept a new solution if its actually connected
2276  # i.e. n_inter > 0. Otherwise we could just map a big
2277  # fat mdl chain sitting somewhere in Nirvana
2278  max_n = n
2279  max_mapping = (ref_ch, mdl_ch)
2280 
2281  if max_n > 0:
2282  something_happened = True
2283  # assign new found mapping
2284  mapping[max_mapping[0]] = max_mapping[1]
2285 
2286  # add all neighboring chains to map targets as they are now
2287  # reachable
2288  for neighbor in self.neighborsneighbors[max_mapping[0]]:
2289  if neighbor not in mapping:
2290  map_targets.add(neighbor)
2291 
2292  # remove the ref chain from map targets
2293  map_targets.remove(max_mapping[0])
2294 
2295  # remove the mdl chain from free_mdl_chains - its taken...
2296  chem_group_idx = self.ref_ch_group_mapperref_ch_group_mapper[max_mapping[0]]
2297  free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2298 
2299  # keep track of what ref chains got a mapping
2300  newly_mapped_ref_chains.append(max_mapping[0])
2301 
2302  return mapping
2303 
2304  def _SteepOpt(self, mapping, chains_to_optimize=None):
2305 
2306  # just optimize ALL ref chains if nothing specified
2307  if chains_to_optimize is None:
2308  chains_to_optimize = mapping.keys()
2309 
2310  # make sure that we only have ref chains which are actually mapped
2311  ref_chains = [x for x in chains_to_optimize if mapping[x] is not None]
2312 
2313  # group ref chains to be optimized into chem groups
2314  tmp = dict()
2315  for ch in ref_chains:
2316  chem_group_idx = self.ref_ch_group_mapperref_ch_group_mapper[ch]
2317  if chem_group_idx in tmp:
2318  tmp[chem_group_idx].append(ch)
2319  else:
2320  tmp[chem_group_idx] = [ch]
2321  chem_groups = list(tmp.values())
2322 
2323  # try all possible mapping swaps. Swaps that improve the score are
2324  # immediately accepted and we start all over again
2325  current_lddt = self.lDDTFromFlatMaplDDTFromFlatMap(mapping)
2326  something_happened = True
2327  while something_happened:
2328  something_happened = False
2329  for chem_group in chem_groups:
2330  if something_happened:
2331  break
2332  for ch1, ch2 in itertools.combinations(chem_group, 2):
2333  swapped_mapping = dict(mapping)
2334  swapped_mapping[ch1] = mapping[ch2]
2335  swapped_mapping[ch2] = mapping[ch1]
2336  score = self.lDDTFromFlatMaplDDTFromFlatMap(swapped_mapping)
2337  if score > current_lddt:
2338  something_happened = True
2339  mapping = swapped_mapping
2340  current_lddt = score
2341  break
2342 
2343  return mapping
2344 
2345 
2346 def _lDDTNaive(trg, mdl, inclusion_radius, thresholds, chem_groups,
2347  chem_mapping, ref_mdl_alns, n_max_naive):
2348  """ Naively iterates all possible chain mappings and returns the best
2349  """
2350  best_mapping = None
2351  best_lddt = -1.0
2352 
2353  # Benchmarks on homo-oligomers indicate that full blown lDDT
2354  # computation is faster up to tetramers => 4!=24 possible mappings.
2355  # For stuff bigger than that, the decomposer approach should be used
2356  if _NMappingsWithin(chem_groups, chem_mapping, 24):
2357  # Setup scoring
2358  lddt_scorer = lddt.lDDTScorer(trg, bb_only = True)
2359  for mapping in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2360  # chain_mapping and alns as input for lDDT computation
2361  lddt_chain_mapping = dict()
2362  lddt_alns = dict()
2363  for ref_chem_group, mdl_chem_group in zip(chem_groups, mapping):
2364  for ref_ch, mdl_ch in zip(ref_chem_group, mdl_chem_group):
2365  # some mdl chains can be None
2366  if mdl_ch is not None:
2367  lddt_chain_mapping[mdl_ch] = ref_ch
2368  lddt_alns[mdl_ch] = ref_mdl_alns[(ref_ch, mdl_ch)]
2369  lDDT, _ = lddt_scorer.lDDT(mdl, thresholds=thresholds,
2370  chain_mapping=lddt_chain_mapping,
2371  residue_mapping = lddt_alns,
2372  check_resnames = False)
2373  if lDDT > best_lddt:
2374  best_mapping = mapping
2375  best_lddt = lDDT
2376 
2377  else:
2378  # Setup scoring
2379  lddt_scorer = _lDDTDecomposer(trg, mdl, ref_mdl_alns,
2380  inclusion_radius=inclusion_radius,
2381  thresholds = thresholds)
2382  for mapping in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2383  lDDT = lddt_scorer.lDDT(chem_groups, mapping)
2384  if lDDT > best_lddt:
2385  best_mapping = mapping
2386  best_lddt = lDDT
2387 
2388  return (best_mapping, best_lddt)
2389 
2390 
2391 def _GetSeeds(ref_chem_groups, mdl_chem_groups, mapped_ref_chains = set(),
2392  mapped_mdl_chains = set()):
2393  seeds = list()
2394  for ref_chains, mdl_chains in zip(ref_chem_groups,
2395  mdl_chem_groups):
2396  for ref_ch in ref_chains:
2397  if ref_ch not in mapped_ref_chains:
2398  for mdl_ch in mdl_chains:
2399  if mdl_ch not in mapped_mdl_chains:
2400  seeds.append((ref_ch, mdl_ch))
2401  return seeds
2402 
2403 
2404 def _lDDTGreedyFast(the_greed):
2405 
2406  something_happened = True
2407  mapping = dict()
2408 
2409  while something_happened:
2410  something_happened = False
2411  seeds = _GetSeeds(the_greed.ref_chem_groups,
2412  the_greed.mdl_chem_groups,
2413  mapped_ref_chains = set(mapping.keys()),
2414  mapped_mdl_chains = set(mapping.values()))
2415  # search for best scoring starting point
2416  n_best = 0
2417  best_seed = None
2418  for seed in seeds:
2419  n = the_greed.SCCounts(seed[0], seed[1])
2420  if n > n_best:
2421  n_best = n
2422  best_seed = seed
2423  if n_best == 0:
2424  break # no proper seed found anymore...
2425  # add seed to mapping and start the greed
2426  mapping[best_seed[0]] = best_seed[1]
2427  mapping = the_greed.ExtendMapping(mapping)
2428  something_happened = True
2429 
2430  # translate mapping format and return
2431  final_mapping = list()
2432  for ref_chains in the_greed.ref_chem_groups:
2433  mapped_mdl_chains = list()
2434  for ref_ch in ref_chains:
2435  if ref_ch in mapping:
2436  mapped_mdl_chains.append(mapping[ref_ch])
2437  else:
2438  mapped_mdl_chains.append(None)
2439  final_mapping.append(mapped_mdl_chains)
2440 
2441  return final_mapping
2442 
2443 
2444 def _lDDTGreedyFull(the_greed):
2445  """ Uses each reference chain as starting point for expansion
2446  """
2447 
2448  seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2449  best_overall_score = -1.0
2450  best_overall_mapping = dict()
2451 
2452  for seed in seeds:
2453 
2454  # do initial extension
2455  mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2456 
2457  # repeat the process until we have a full mapping
2458  something_happened = True
2459  while something_happened:
2460  something_happened = False
2461  remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
2462  the_greed.mdl_chem_groups,
2463  mapped_ref_chains = set(mapping.keys()),
2464  mapped_mdl_chains = set(mapping.values()))
2465  if len(remnant_seeds) > 0:
2466  # still more mapping to be done
2467  best_score = -1.0
2468  best_mapping = None
2469  for remnant_seed in remnant_seeds:
2470  tmp_mapping = dict(mapping)
2471  tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2472  tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2473  score = the_greed.lDDTFromFlatMap(tmp_mapping)
2474  if score > best_score:
2475  best_score = score
2476  best_mapping = tmp_mapping
2477  if best_mapping is not None:
2478  something_happened = True
2479  mapping = best_mapping
2480 
2481  score = the_greed.lDDTFromFlatMap(mapping)
2482  if score > best_overall_score:
2483  best_overall_score = score
2484  best_overall_mapping = mapping
2485 
2486  mapping = best_overall_mapping
2487 
2488  # translate mapping format and return
2489  final_mapping = list()
2490  for ref_chains in the_greed.ref_chem_groups:
2491  mapped_mdl_chains = list()
2492  for ref_ch in ref_chains:
2493  if ref_ch in mapping:
2494  mapped_mdl_chains.append(mapping[ref_ch])
2495  else:
2496  mapped_mdl_chains.append(None)
2497  final_mapping.append(mapped_mdl_chains)
2498 
2499  return final_mapping
2500 
2501 
2502 def _lDDTGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
2503  """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2504  respective chem groups and compute single chain lDDTs. The
2505  *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2506  and the best scoring one is exhaustively extended.
2507  """
2508 
2509  if seed_size is None or seed_size < 1:
2510  raise RuntimeError(f"seed_size must be an int >= 1 (got {seed_size})")
2511 
2512  if blocks_per_chem_group is None or blocks_per_chem_group < 1:
2513  raise RuntimeError(f"blocks_per_chem_group must be an int >= 1 "
2514  f"(got {blocks_per_chem_group})")
2515 
2516  max_ext = seed_size - 1 # -1 => start seed already has size 1
2517 
2518  ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2519  mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2520 
2521  mapping = dict()
2522  something_happened = True
2523  while something_happened:
2524  something_happened = False
2525  starting_blocks = list()
2526  for ref_chains, mdl_chains in zip(ref_chem_groups, mdl_chem_groups):
2527  if len(mdl_chains) == 0:
2528  continue # nothing to map
2529  ref_chains_copy = list(ref_chains)
2530  for i in range(blocks_per_chem_group):
2531  if len(ref_chains_copy) == 0:
2532  break
2533  seeds = list()
2534  for ref_ch in ref_chains_copy:
2535  seeds += [(ref_ch, mdl_ch) for mdl_ch in mdl_chains]
2536  # extend starting seeds to *seed_size* and retain best scoring
2537  # block for further extension
2538  best_score = -1.0
2539  best_mapping = None
2540  best_seed = None
2541  for s in seeds:
2542  seed = dict(mapping)
2543  seed.update({s[0]: s[1]})
2544  seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2545  seed_lddt = the_greed.lDDTFromFlatMap(seed)
2546  if seed_lddt > best_score:
2547  best_score = seed_lddt
2548  best_mapping = seed
2549  best_seed = s
2550  if best_mapping != None:
2551  starting_blocks.append(best_mapping)
2552  if best_seed[0] in ref_chains_copy:
2553  # remove that ref chain to enforce diversity
2554  ref_chains_copy.remove(best_seed[0])
2555 
2556  # fully expand initial starting blocks
2557  best_lddt = 0.0
2558  best_mapping = None
2559  for seed in starting_blocks:
2560  seed = the_greed.ExtendMapping(seed)
2561  seed_lddt = the_greed.lDDTFromFlatMap(seed)
2562  if seed_lddt > best_lddt:
2563  best_lddt = seed_lddt
2564  best_mapping = seed
2565 
2566  if best_lddt == 0.0:
2567  break # no proper mapping found anymore
2568 
2569  something_happened = True
2570  mapping.update(best_mapping)
2571  for ref_ch, mdl_ch in best_mapping.items():
2572  for group_idx in range(len(ref_chem_groups)):
2573  if ref_ch in ref_chem_groups[group_idx]:
2574  ref_chem_groups[group_idx].remove(ref_ch)
2575  if mdl_ch in mdl_chem_groups[group_idx]:
2576  mdl_chem_groups[group_idx].remove(mdl_ch)
2577 
2578  # translate mapping format and return
2579  final_mapping = list()
2580  for ref_chains in the_greed.ref_chem_groups:
2581  mapped_mdl_chains = list()
2582  for ref_ch in ref_chains:
2583  if ref_ch in mapping:
2584  mapped_mdl_chains.append(mapping[ref_ch])
2585  else:
2586  mapped_mdl_chains.append(None)
2587  final_mapping.append(mapped_mdl_chains)
2588 
2589  return final_mapping
2590 
2591 
2593  def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2594  ref_mdl_alns, contact_d = 12.0,
2595  steep_opt_rate = None, greedy_prune_contact_map=False):
2596  """ Greedy extension of already existing but incomplete chain mappings
2597  """
2598  super().__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2599  contact_d = contact_d)
2600  self.refref = ref
2601  self.mdlmdl = mdl
2602  self.ref_mdl_alnsref_mdl_alns = ref_mdl_alns
2603  self.steep_opt_ratesteep_opt_rate = steep_opt_rate
2604 
2605  if greedy_prune_contact_map:
2606  self.neighborsneighbors = {k: set() for k in self.qsent1qsent1.chain_names}
2607  for p in self.qsent1qsent1.interacting_chains:
2608  if np.count_nonzero(self.qsent1qsent1.PairDist(p[0], p[1])<=8) >= 3:
2609  self.neighborsneighbors[p[0]].add(p[1])
2610  self.neighborsneighbors[p[1]].add(p[0])
2611 
2612  self.mdl_neighborsmdl_neighbors = {k: set() for k in self.qsent2qsent2.chain_names}
2613  for p in self.qsent2qsent2.interacting_chains:
2614  if np.count_nonzero(self.qsent2qsent2.PairDist(p[0], p[1])<=8) >= 3:
2615  self.mdl_neighborsmdl_neighbors[p[0]].add(p[1])
2616  self.mdl_neighborsmdl_neighbors[p[1]].add(p[0])
2617 
2618 
2619  else:
2620  self.neighborsneighbors = {k: set() for k in self.qsent1qsent1.chain_names}
2621  for p in self.qsent1qsent1.interacting_chains:
2622  self.neighborsneighbors[p[0]].add(p[1])
2623  self.neighborsneighbors[p[1]].add(p[0])
2624 
2625  self.mdl_neighborsmdl_neighbors = {k: set() for k in self.qsent2qsent2.chain_names}
2626  for p in self.qsent2qsent2.interacting_chains:
2627  self.mdl_neighborsmdl_neighbors[p[0]].add(p[1])
2628  self.mdl_neighborsmdl_neighbors[p[1]].add(p[0])
2629 
2630  assert(len(ref_chem_groups) == len(mdl_chem_groups))
2631  self.ref_chem_groupsref_chem_groups = ref_chem_groups
2632  self.mdl_chem_groupsmdl_chem_groups = mdl_chem_groups
2633  self.ref_ch_group_mapperref_ch_group_mapper = dict()
2634  self.mdl_ch_group_mappermdl_ch_group_mapper = dict()
2635  for g_idx, (ref_g, mdl_g) in enumerate(zip(ref_chem_groups,
2636  mdl_chem_groups)):
2637  for ch in ref_g:
2638  self.ref_ch_group_mapperref_ch_group_mapper[ch] = g_idx
2639  for ch in mdl_g:
2640  self.mdl_ch_group_mappermdl_ch_group_mapper[ch] = g_idx
2641 
2642  # cache for lDDT based single chain conserved contacts
2643  # used to identify starting points for further extension by QS score
2644  # key: tuple (ref_ch, mdl_ch) value: number of conserved contacts
2645  self.single_chain_scorersingle_chain_scorer = dict()
2646  self.single_chain_cachesingle_chain_cache = dict()
2647  for ch in self.refref.chains:
2648  single_chain_ref = _CSel(self.refref, [ch.GetName()])
2649  self.single_chain_scorersingle_chain_scorer[ch.GetName()] = \
2650  lddt.lDDTScorer(single_chain_ref, bb_only = True)
2651 
2652  def SCCounts(self, ref_ch, mdl_ch):
2653  if not (ref_ch, mdl_ch) in self.single_chain_cachesingle_chain_cache:
2654  alns = dict()
2655  alns[mdl_ch] = self.ref_mdl_alnsref_mdl_alns[(ref_ch, mdl_ch)]
2656  mdl_sel = _CSel(self.mdlmdl, [mdl_ch])
2657  s = self.single_chain_scorersingle_chain_scorer[ref_ch]
2658  _,_,_,conserved,_,_,_ = s.lDDT(mdl_sel,
2659  residue_mapping=alns,
2660  return_dist_test=True,
2661  no_interchain=True,
2662  chain_mapping={mdl_ch: ref_ch},
2663  check_resnames=False)
2664  self.single_chain_cachesingle_chain_cache[(ref_ch, mdl_ch)] = conserved
2665  return self.single_chain_cachesingle_chain_cache[(ref_ch, mdl_ch)]
2666 
2667  def ExtendMapping(self, mapping, max_ext = None):
2668 
2669  if len(mapping) == 0:
2670  raise RuntimError("Mapping must contain a starting point")
2671 
2672  # Ref chains onto which we can map. The algorithm starts with a mapping
2673  # on ref_ch. From there we can start to expand to connected neighbors.
2674  # All neighbors that we can reach from the already mapped chains are
2675  # stored in this set which will be updated during runtime
2676  map_targets = set()
2677  for ref_ch in mapping.keys():
2678  map_targets.update(self.neighborsneighbors[ref_ch])
2679 
2680  # remove the already mapped chains
2681  for ref_ch in mapping.keys():
2682  map_targets.discard(ref_ch)
2683 
2684  if len(map_targets) == 0:
2685  return mapping # nothing to extend
2686 
2687  # keep track of what model chains are not yet mapped for each chem group
2688  free_mdl_chains = list()
2689  for chem_group in self.mdl_chem_groupsmdl_chem_groups:
2690  tmp = [x for x in chem_group if x not in mapping.values()]
2691  free_mdl_chains.append(set(tmp))
2692 
2693  # keep track of what ref chains got a mapping
2694  newly_mapped_ref_chains = list()
2695 
2696  something_happened = True
2697  while something_happened:
2698  something_happened=False
2699 
2700  if self.steep_opt_ratesteep_opt_rate is not None:
2701  n_chains = len(newly_mapped_ref_chains)
2702  if n_chains > 0 and n_chains % self.steep_opt_ratesteep_opt_rate == 0:
2703  mapping = self._SteepOpt_SteepOpt(mapping, newly_mapped_ref_chains)
2704 
2705  if max_ext is not None and len(newly_mapped_ref_chains) >= max_ext:
2706  break
2707 
2708  score_result = self.FromFlatMappingFromFlatMapping(mapping)
2709  old_score = score_result.QS_global
2710  nominator = score_result.weighted_scores
2711  denominator = score_result.weight_sum + score_result.weight_extra_all
2712 
2713  max_diff = 0.0
2714  max_mapping = None
2715  for ref_ch in map_targets:
2716  chem_group_idx = self.ref_ch_group_mapperref_ch_group_mapper[ref_ch]
2717  for mdl_ch in free_mdl_chains[chem_group_idx]:
2718  # we're not computing full QS-score here, we directly hack
2719  # into the QS-score formula to compute a diff
2720  nominator_diff = 0.0
2721  denominator_diff = 0.0
2722  for neighbor in self.neighborsneighbors[ref_ch]:
2723  if neighbor in mapping and mapping[neighbor] in \
2724  self.mdl_neighborsmdl_neighbors[mdl_ch]:
2725  # it's a newly added interface if (ref_ch, mdl_ch)
2726  # are added to mapping
2727  int1 = (ref_ch, neighbor)
2728  int2 = (mdl_ch, mapping[neighbor])
2729  a, b, c, d = self._MappedInterfaceScores_MappedInterfaceScores(int1, int2)
2730  nominator_diff += a # weighted_scores
2731  denominator_diff += b # weight_sum
2732  denominator_diff += d # weight_extra_all
2733  # the respective interface penalties are subtracted
2734  # from denominator
2735  denominator_diff -= self._InterfacePenalty1_InterfacePenalty1(int1)
2736  denominator_diff -= self._InterfacePenalty2_InterfacePenalty2(int2)
2737 
2738  if nominator_diff > 0:
2739  # Only accept a new solution if its actually connected
2740  # i.e. nominator_diff > 0.
2741  new_nominator = nominator + nominator_diff
2742  new_denominator = denominator + denominator_diff
2743  new_score = 0.0
2744  if new_denominator != 0.0:
2745  new_score = new_nominator/new_denominator
2746  diff = new_score - old_score
2747  if diff > max_diff:
2748  max_diff = diff
2749  max_mapping = (ref_ch, mdl_ch)
2750 
2751  if max_mapping is not None:
2752  something_happened = True
2753  # assign new found mapping
2754  mapping[max_mapping[0]] = max_mapping[1]
2755 
2756  # add all neighboring chains to map targets as they are now
2757  # reachable
2758  for neighbor in self.neighborsneighbors[max_mapping[0]]:
2759  if neighbor not in mapping:
2760  map_targets.add(neighbor)
2761 
2762  # remove the ref chain from map targets
2763  map_targets.remove(max_mapping[0])
2764 
2765  # remove the mdl chain from free_mdl_chains - its taken...
2766  chem_group_idx = self.ref_ch_group_mapperref_ch_group_mapper[max_mapping[0]]
2767  free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2768 
2769  # keep track of what ref chains got a mapping
2770  newly_mapped_ref_chains.append(max_mapping[0])
2771 
2772  return mapping
2773 
2774  def _SteepOpt(self, mapping, chains_to_optimize=None):
2775 
2776  # just optimize ALL ref chains if nothing specified
2777  if chains_to_optimize is None:
2778  chains_to_optimize = mapping.keys()
2779 
2780  # make sure that we only have ref chains which are actually mapped
2781  ref_chains = [x for x in chains_to_optimize if mapping[x] is not None]
2782 
2783  # group ref chains to be optimized into chem groups
2784  tmp = dict()
2785  for ch in ref_chains:
2786  chem_group_idx = self.ref_ch_group_mapperref_ch_group_mapper[ch]
2787  if chem_group_idx in tmp:
2788  tmp[chem_group_idx].append(ch)
2789  else:
2790  tmp[chem_group_idx] = [ch]
2791  chem_groups = list(tmp.values())
2792 
2793  # try all possible mapping swaps. Swaps that improve the score are
2794  # immediately accepted and we start all over again
2795  score_result = self.FromFlatMappingFromFlatMapping(mapping)
2796  current_score = score_result.QS_global
2797  something_happened = True
2798  while something_happened:
2799  something_happened = False
2800  for chem_group in chem_groups:
2801  if something_happened:
2802  break
2803  for ch1, ch2 in itertools.combinations(chem_group, 2):
2804  swapped_mapping = dict(mapping)
2805  swapped_mapping[ch1] = mapping[ch2]
2806  swapped_mapping[ch2] = mapping[ch1]
2807  score_result = self.FromFlatMappingFromFlatMapping(swapped_mapping)
2808  if score_result.QS_global > current_score:
2809  something_happened = True
2810  mapping = swapped_mapping
2811  current_score = score_result.QS_global
2812  break
2813  return mapping
2814 
2815 
2816 def _QSScoreNaive(trg, mdl, chem_groups, chem_mapping, ref_mdl_alns, contact_d,
2817  n_max_naive):
2818  best_mapping = None
2819  best_score = -1.0
2820  # qs_scorer implements caching, score calculation is thus as fast as it gets
2821  # you'll just hit a wall when the number of possible mappings becomes large
2822  qs_scorer = qsscore.QSScorer(trg, chem_groups, mdl, ref_mdl_alns)
2823  for mapping in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2824  score_result = qs_scorer.Score(mapping, check=False)
2825  if score_result.QS_global > best_score:
2826  best_mapping = mapping
2827  best_score = score_result.QS_global
2828  return (best_mapping, best_score)
2829 
2830 
2831 def _QSScoreGreedyFast(the_greed):
2832 
2833  something_happened = True
2834  mapping = dict()
2835  while something_happened:
2836  something_happened = False
2837  # search for best scoring starting point, we're using lDDT here
2838  n_best = 0
2839  best_seed = None
2840  seeds = _GetSeeds(the_greed.ref_chem_groups,
2841  the_greed.mdl_chem_groups,
2842  mapped_ref_chains = set(mapping.keys()),
2843  mapped_mdl_chains = set(mapping.values()))
2844  for seed in seeds:
2845  n = the_greed.SCCounts(seed[0], seed[1])
2846  if n > n_best:
2847  n_best = n
2848  best_seed = seed
2849  if n_best == 0:
2850  break # no proper seed found anymore...
2851  # add seed to mapping and start the greed
2852  mapping[best_seed[0]] = best_seed[1]
2853  mapping = the_greed.ExtendMapping(mapping)
2854  something_happened = True
2855 
2856  # translate mapping format and return
2857  final_mapping = list()
2858  for ref_chains in the_greed.ref_chem_groups:
2859  mapped_mdl_chains = list()
2860  for ref_ch in ref_chains:
2861  if ref_ch in mapping:
2862  mapped_mdl_chains.append(mapping[ref_ch])
2863  else:
2864  mapped_mdl_chains.append(None)
2865  final_mapping.append(mapped_mdl_chains)
2866 
2867  return final_mapping
2868 
2869 
2870 def _QSScoreGreedyFull(the_greed):
2871  """ Uses each reference chain as starting point for expansion
2872  """
2873 
2874  seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2875  best_overall_score = -1.0
2876  best_overall_mapping = dict()
2877 
2878  for seed in seeds:
2879 
2880  # do initial extension
2881  mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2882 
2883  # repeat the process until we have a full mapping
2884  something_happened = True
2885  while something_happened:
2886  something_happened = False
2887  remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
2888  the_greed.mdl_chem_groups,
2889  mapped_ref_chains = set(mapping.keys()),
2890  mapped_mdl_chains = set(mapping.values()))
2891  if len(remnant_seeds) > 0:
2892  # still more mapping to be done
2893  best_score = -1.0
2894  best_mapping = None
2895  for remnant_seed in remnant_seeds:
2896  tmp_mapping = dict(mapping)
2897  tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2898  tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2899  score_result = the_greed.FromFlatMapping(tmp_mapping)
2900  if score_result.QS_global > best_score:
2901  best_score = score_result.QS_global
2902  best_mapping = tmp_mapping
2903  if best_mapping is not None:
2904  something_happened = True
2905  mapping = best_mapping
2906 
2907  score_result = the_greed.FromFlatMapping(mapping)
2908  if score_result.QS_global > best_overall_score:
2909  best_overall_score = score_result.QS_global
2910  best_overall_mapping = mapping
2911 
2912  mapping = best_overall_mapping
2913 
2914  # translate mapping format and return
2915  final_mapping = list()
2916  for ref_chains in the_greed.ref_chem_groups:
2917  mapped_mdl_chains = list()
2918  for ref_ch in ref_chains:
2919  if ref_ch in mapping:
2920  mapped_mdl_chains.append(mapping[ref_ch])
2921  else:
2922  mapped_mdl_chains.append(None)
2923  final_mapping.append(mapped_mdl_chains)
2924 
2925  return final_mapping
2926 
2927 
2928 def _QSScoreGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
2929  """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2930  respective chem groups and compute single chain lDDTs. The
2931  *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2932  and the best scoring one with respect to QS score is exhaustively extended.
2933  """
2934 
2935  if seed_size is None or seed_size < 1:
2936  raise RuntimeError(f"seed_size must be an int >= 1 (got {seed_size})")
2937 
2938  if blocks_per_chem_group is None or blocks_per_chem_group < 1:
2939  raise RuntimeError(f"blocks_per_chem_group must be an int >= 1 "
2940  f"(got {blocks_per_chem_group})")
2941 
2942  max_ext = seed_size - 1 # -1 => start seed already has size 1
2943 
2944  ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2945  mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2946 
2947  mapping = dict()
2948 
2949  something_happened = True
2950  while something_happened:
2951  something_happened = False
2952  starting_blocks = list()
2953  for ref_chains, mdl_chains in zip(ref_chem_groups, mdl_chem_groups):
2954  if len(mdl_chains) == 0:
2955  continue # nothing to map
2956  ref_chains_copy = list(ref_chains)
2957  for i in range(blocks_per_chem_group):
2958  if len(ref_chains_copy) == 0:
2959  break
2960  seeds = list()
2961  for ref_ch in ref_chains_copy:
2962  seeds += [(ref_ch, mdl_ch) for mdl_ch in mdl_chains]
2963  # extend starting seeds to *seed_size* and retain best scoring block
2964  # for further extension
2965  best_score = -1.0
2966  best_mapping = None
2967  best_seed = None
2968  for s in seeds:
2969  seed = dict(mapping)
2970  seed.update({s[0]: s[1]})
2971  seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2972  score_result = the_greed.FromFlatMapping(seed)
2973  if score_result.QS_global > best_score:
2974  best_score = score_result.QS_global
2975  best_mapping = seed
2976  best_seed = s
2977  if best_mapping != None:
2978  starting_blocks.append(best_mapping)
2979  if best_seed[0] in ref_chains_copy:
2980  # remove selected ref chain to enforce diversity
2981  ref_chains_copy.remove(best_seed[0])
2982 
2983  # fully expand initial starting blocks
2984  best_score = -1.0
2985  best_mapping = None
2986  for seed in starting_blocks:
2987  seed = the_greed.ExtendMapping(seed)
2988  score_result = the_greed.FromFlatMapping(seed)
2989  if score_result.QS_global > best_score:
2990  best_score = score_result.QS_global
2991  best_mapping = seed
2992 
2993  if best_mapping is not None and len(best_mapping) > len(mapping):
2994  # this even accepts extensions that lead to no increase in QS-score
2995  # at least they make sense from an lDDT perspective
2996  something_happened = True
2997  mapping.update(best_mapping)
2998  for ref_ch, mdl_ch in best_mapping.items():
2999  for group_idx in range(len(ref_chem_groups)):
3000  if ref_ch in ref_chem_groups[group_idx]:
3001  ref_chem_groups[group_idx].remove(ref_ch)
3002  if mdl_ch in mdl_chem_groups[group_idx]:
3003  mdl_chem_groups[group_idx].remove(mdl_ch)
3004 
3005  # translate mapping format and return
3006  final_mapping = list()
3007  for ref_chains in the_greed.ref_chem_groups:
3008  mapped_mdl_chains = list()
3009  for ref_ch in ref_chains:
3010  if ref_ch in mapping:
3011  mapped_mdl_chains.append(mapping[ref_ch])
3012  else:
3013  mapped_mdl_chains.append(None)
3014  final_mapping.append(mapped_mdl_chains)
3015 
3016  return final_mapping
3017 
3018 def _SingleRigidRMSD(initial_transforms, initial_mappings, chem_groups,
3019  chem_mapping, trg_group_pos, mdl_group_pos):
3020  """
3021  Takes initial transforms and sequentially adds chain pairs with lowest RMSD.
3022  The mapping from the transform that leads to lowest overall RMSD is
3023  returned.
3024  """
3025  best_mapping = dict()
3026  best_ssd = float("inf") # we're actually going for summed squared distances
3027  # Since all positions have same lengths and we do a
3028  # full mapping, lowest SSD has a guarantee of also
3029  # being lowest RMSD
3030  for transform in initial_transforms:
3031  mapping = dict()
3032  mapped_mdl_chains = set()
3033  ssd = 0.0
3034  for trg_chains, mdl_chains, trg_pos, mdl_pos, in zip(chem_groups,
3035  chem_mapping,
3036  trg_group_pos,
3037  mdl_group_pos):
3038  if len(trg_pos) == 0 or len(mdl_pos) == 0:
3039  continue # cannot compute valid rmsd
3040  ssds = list()
3041  t_mdl_pos = list()
3042  for m_pos in mdl_pos:
3043  t_m_pos = geom.Vec3List(m_pos)
3044  t_m_pos.ApplyTransform(transform)
3045  t_mdl_pos.append(t_m_pos)
3046  for t_pos, t in zip(trg_pos, trg_chains):
3047  for t_m_pos, m in zip(t_mdl_pos, mdl_chains):
3048  ssd = t_pos.GetSummedSquaredDistances(t_m_pos)
3049  ssds.append((ssd, (t,m)))
3050  ssds.sort()
3051  for item in ssds:
3052  p = item[1]
3053  if p[0] not in mapping and p[1] not in mapped_mdl_chains:
3054  mapping[p[0]] = p[1]
3055  mapped_mdl_chains.add(p[1])
3056  ssd += item[0]
3057 
3058  if ssd < best_ssd:
3059  best_ssd = ssd
3060  best_mapping = mapping
3061 
3062  return best_mapping
3063 
3064 def _IterativeRigidRMSD(initial_transforms, initial_mappings, chem_groups,
3065  chem_mapping, trg_group_pos, mdl_group_pos):
3066  """ Takes initial transforms and sequentially adds chain pairs with
3067  lowest RMSD. With each added chain pair, the transform gets updated.
3068  Thus the naming iterative. The mapping from the initial transform that
3069  leads to best overall RMSD score is returned.
3070  """
3071 
3072  # to directly retrieve positions using chain names
3073  trg_pos_dict = dict()
3074  for trg_pos, trg_chains in zip(trg_group_pos, chem_groups):
3075  for t_pos, t in zip(trg_pos, trg_chains):
3076  trg_pos_dict[t] = t_pos
3077  mdl_pos_dict = dict()
3078  for mdl_pos, mdl_chains in zip(mdl_group_pos, chem_mapping):
3079  for m_pos, m in zip(mdl_pos, mdl_chains):
3080  mdl_pos_dict[m] = m_pos
3081 
3082  best_mapping = dict()
3083  best_rmsd = float("inf")
3084  for initial_transform, initial_mapping in zip(initial_transforms,
3085  initial_mappings):
3086  mapping = {initial_mapping[0]: initial_mapping[1]}
3087  transform = geom.Mat4(initial_transform)
3088  mapped_trg_pos = geom.Vec3List(trg_pos_dict[initial_mapping[0]])
3089  mapped_mdl_pos = geom.Vec3List(mdl_pos_dict[initial_mapping[1]])
3090 
3091  # the following variables contain the chains which are
3092  # available for mapping
3093  trg_chain_groups = [set(group) for group in chem_groups]
3094  mdl_chain_groups = [set(group) for group in chem_mapping]
3095 
3096  # search and kick out inital mapping
3097  for group in trg_chain_groups:
3098  if initial_mapping[0] in group:
3099  group.remove(initial_mapping[0])
3100  break
3101  for group in mdl_chain_groups:
3102  if initial_mapping[1] in group:
3103  group.remove(initial_mapping[1])
3104  break
3105 
3106  something_happened = True
3107  while something_happened:
3108  # search for best mapping given current transform
3109  something_happened=False
3110  best_sc_mapping = None
3111  best_sc_group_idx = None
3112  best_sc_rmsd = float("inf")
3113  group_idx = 0
3114  for trg_chains, mdl_chains in zip(trg_chain_groups, mdl_chain_groups):
3115  for t in trg_chains:
3116  t_pos = trg_pos_dict[t]
3117  for m in mdl_chains:
3118  m_pos = mdl_pos_dict[m]
3119  t_m_pos = geom.Vec3List(m_pos)
3120  t_m_pos.ApplyTransform(transform)
3121  rmsd = t_pos.GetRMSD(t_m_pos)
3122  if rmsd < best_sc_rmsd:
3123  best_sc_rmsd = rmsd
3124  best_sc_mapping = (t,m)
3125  best_sc_group_idx = group_idx
3126  group_idx += 1
3127 
3128  if best_sc_mapping is not None:
3129  something_happened = True
3130  mapping[best_sc_mapping[0]] = best_sc_mapping[1]
3131  mapped_trg_pos.extend(trg_pos_dict[best_sc_mapping[0]])
3132  mapped_mdl_pos.extend(mdl_pos_dict[best_sc_mapping[1]])
3133  trg_chain_groups[best_sc_group_idx].remove(best_sc_mapping[0])
3134  mdl_chain_groups[best_sc_group_idx].remove(best_sc_mapping[1])
3135 
3136  transform = _GetTransform(mapped_mdl_pos, mapped_trg_pos,
3137  False)
3138 
3139  # compute overall RMSD for current transform
3140  mapped_mdl_pos.ApplyTransform(transform)
3141  rmsd = mapped_trg_pos.GetRMSD(mapped_mdl_pos)
3142 
3143  if rmsd < best_rmsd:
3144  best_rmsd = rmsd
3145  best_mapping = mapping
3146 
3147  return best_mapping
3148 
3149 def _NaiveRMSD(chem_groups, chem_mapping, trg_group_pos, mdl_group_pos,
3150  n_max_naive):
3151 
3152  # to directly retrieve positions using chain names
3153  trg_pos_dict = dict()
3154  for trg_pos, trg_chains in zip(trg_group_pos, chem_groups):
3155  for t_pos, t in zip(trg_pos, trg_chains):
3156  trg_pos_dict[t] = t_pos
3157  mdl_pos_dict = dict()
3158  for mdl_pos, mdl_chains in zip(mdl_group_pos, chem_mapping):
3159  for m_pos, m in zip(mdl_pos, mdl_chains):
3160  mdl_pos_dict[m] = m_pos
3161 
3162  best_mapping = dict()
3163  best_rmsd = float("inf")
3164 
3165  for mapping in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
3166  trg_pos = geom.Vec3List()
3167  mdl_pos = geom.Vec3List()
3168  for trg_group, mdl_group in zip(chem_groups, mapping):
3169  for trg_ch, mdl_ch in zip(trg_group, mdl_group):
3170  if trg_ch is not None and mdl_ch is not None:
3171  trg_pos.extend(trg_pos_dict[trg_ch])
3172  mdl_pos.extend(mdl_pos_dict[mdl_ch])
3173  superpose_res = mol.alg.SuperposeSVD(mdl_pos, trg_pos)
3174  if superpose_res.rmsd < best_rmsd:
3175  best_rmsd = superpose_res.rmsd
3176  best_mapping = mapping
3177 
3178  # this is stupid...
3179  tmp = dict()
3180  for chem_group, mapping in zip(chem_groups, best_mapping):
3181  for trg_ch, mdl_ch in zip(chem_group, mapping):
3182  tmp[trg_ch] = mdl_ch
3183 
3184  return tmp
3185 
3186 
3187 def _GetRefPos(trg, mdl, trg_msas, mdl_alns, max_pos = None):
3188  """ Extracts reference positions which are present in trg and mdl
3189  """
3190 
3191  # select only backbone atoms, makes processing simpler later on
3192  # (just select res.atoms[0].GetPos() as ref pos)
3193  bb_trg = trg.Select("aname=\"CA\",\"C3'\"")
3194  bb_mdl = mdl.Select("aname=\"CA\",\"C3'\"")
3195 
3196  # mdl_alns are pairwise, let's construct MSAs
3197  mdl_msas = list()
3198  for aln_list in mdl_alns:
3199  if len(aln_list) > 0:
3200  tmp = aln_list[0].GetSequence(0)
3201  ref_seq = seq.CreateSequence(tmp.GetName(), tmp.GetGaplessString())
3202  mdl_msas.append(seq.alg.MergePairwiseAlignments(aln_list, ref_seq))
3203  else:
3204  mdl_msas.append(seq.CreateAlignment())
3205 
3206  trg_pos = list()
3207  mdl_pos = list()
3208 
3209  for trg_msa, mdl_msa in zip(trg_msas, mdl_msas):
3210 
3211  if mdl_msa.GetCount() > 0:
3212  # make sure they have the same ref sequence (should be a given...)
3213  assert(trg_msa.GetSequence(0).GetGaplessString() == \
3214  mdl_msa.GetSequence(0).GetGaplessString())
3215  else:
3216  # if mdl_msa is empty, i.e. no model chain maps to the chem group
3217  # represented by trg_msa, we just continue. The result will be
3218  # empty position lists added to trg_pos and mdl_pos.
3219  pass
3220 
3221  # check which columns in MSAs are fully covered (indices relative to
3222  # first sequence)
3223  trg_indices = _GetFullyCoveredIndices(trg_msa)
3224  mdl_indices = _GetFullyCoveredIndices(mdl_msa)
3225 
3226  # get indices where both, mdl and trg, are fully covered
3227  indices = sorted(list(trg_indices.intersection(mdl_indices)))
3228 
3229  # subsample if necessary
3230  if max_pos is not None and len(indices) > max_pos:
3231  step = int(len(indices)/max_pos)
3232  indices = [indices[i] for i in range(0, len(indices), step)]
3233 
3234  # translate to column indices in the respective MSAs
3235  trg_indices = _RefIndicesToColumnIndices(trg_msa, indices)
3236  mdl_indices = _RefIndicesToColumnIndices(mdl_msa, indices)
3237 
3238  # extract positions
3239  trg_pos.append(list())
3240  mdl_pos.append(list())
3241  for s_idx in range(trg_msa.GetCount()):
3242  trg_pos[-1].append(_ExtractMSAPos(trg_msa, s_idx, trg_indices,
3243  bb_trg))
3244  # first seq in mdl_msa is ref sequence in trg and does not belong to mdl
3245  for s_idx in range(1, mdl_msa.GetCount()):
3246  mdl_pos[-1].append(_ExtractMSAPos(mdl_msa, s_idx, mdl_indices,
3247  bb_mdl))
3248 
3249  return (trg_pos, mdl_pos)
3250 
3251 def _GetFullyCoveredIndices(msa):
3252  """ Helper for _GetRefPos
3253 
3254  Returns a set containing the indices relative to first sequence in msa which
3255  are fully covered in all other sequences
3256 
3257  --AA-A-A
3258  -BBBB-BB
3259  CCCC-C-C
3260 
3261  => (0,1,3)
3262  """
3263  indices = set()
3264  ref_idx = 0
3265  for col in msa:
3266  if sum([1 for olc in col if olc != '-']) == col.GetRowCount():
3267  indices.add(ref_idx)
3268  if col[0] != '-':
3269  ref_idx += 1
3270  return indices
3271 
3272 def _RefIndicesToColumnIndices(msa, indices):
3273  """ Helper for _GetRefPos
3274 
3275  Returns a list of mapped indices. indices refer to non-gap one letter
3276  codes in the first msa sequence. The returnes mapped indices are translated
3277  to the according msa column indices
3278  """
3279  ref_idx = 0
3280  mapping = dict()
3281  for col_idx, col in enumerate(msa):
3282  if col[0] != '-':
3283  mapping[ref_idx] = col_idx
3284  ref_idx += 1
3285  return [mapping[i] for i in indices]
3286 
3287 def _ExtractMSAPos(msa, s_idx, indices, view):
3288  """ Helper for _GetRefPos
3289 
3290  Returns a geom.Vec3List containing positions refering to given msa sequence.
3291  => Chain with corresponding name is mapped onto sequence and the position of
3292  the first atom of each residue specified in indices is extracted.
3293  Indices refers to column indices in msa!
3294  """
3295  s = msa.GetSequence(s_idx)
3296  s_v = _CSel(view, [s.GetName()])
3297 
3298  # sanity check
3299  assert(len(s.GetGaplessString()) == len(s_v.residues))
3300 
3301  residue_idx = [s.GetResidueIndex(i) for i in indices]
3302  return geom.Vec3List([s_v.residues[i].atoms[0].pos for i in residue_idx])
3303 
3304 def _NChemGroupMappings(ref_chains, mdl_chains):
3305  """ Number of mappings within one chem group
3306 
3307  :param ref_chains: Reference chains
3308  :type ref_chains: :class:`list` of :class:`str`
3309  :param mdl_chains: Model chains that are mapped onto *ref_chains*
3310  :type mdl_chains: :class:`list` of :class:`str`
3311  :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3312  """
3313  n_ref = len(ref_chains)
3314  n_mdl = len(mdl_chains)
3315  if n_ref == n_mdl:
3316  return factorial(n_ref)
3317  elif n_ref > n_mdl:
3318  n_choose_k = binom(n_ref, n_mdl)
3319  return n_choose_k * factorial(n_mdl)
3320  else:
3321  n_choose_k = binom(n_mdl, n_ref)
3322  return n_choose_k * factorial(n_ref)
3323 
3324 def _NMappings(ref_chains, mdl_chains):
3325  """ Number of mappings for a full chem mapping
3326 
3327  :param ref_chains: Chem groups of reference
3328  :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3329  :param mdl_chains: Model chains that map onto those chem groups
3330  :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3331  :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3332  """
3333  assert(len(ref_chains) == len(mdl_chains))
3334  n = 1
3335  for a,b in zip(ref_chains, mdl_chains):
3336  n *= _NChemGroupMappings(a,b)
3337  return n
3338 
3339 def _NMappingsWithin(ref_chains, mdl_chains, max_mappings):
3340  """ Check whether total number of mappings is smaller than given maximum
3341 
3342  In principle the same as :func:`_NMappings` but it stops as soon as the
3343  maximum is hit.
3344 
3345  :param ref_chains: Chem groups of reference
3346  :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3347  :param mdl_chains: Model chains that map onto those chem groups
3348  :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3349  :param max_mappings: Number of max allowed mappings
3350  :returns: Whether number of possible mappings of *mdl_chains* onto
3351  *ref_chains* is below or equal *max_mappings*.
3352  """
3353  assert(len(ref_chains) == len(mdl_chains))
3354  n = 1
3355  for a,b in zip(ref_chains, mdl_chains):
3356  n *= _NChemGroupMappings(a,b)
3357  if n > max_mappings:
3358  return False
3359  return True
3360 
3361 def _RefSmallerGenerator(ref_chains, mdl_chains):
3362  """ Returns all possible ways to map mdl_chains onto ref_chains
3363 
3364  Specific for the case where len(ref_chains) < len(mdl_chains)
3365  """
3366  for c in itertools.combinations(mdl_chains, len(ref_chains)):
3367  for p in itertools.permutations(c):
3368  yield list(p)
3369 
3370 def _RefLargerGenerator(ref_chains, mdl_chains):
3371  """ Returns all possible ways to map mdl_chains onto ref_chains
3372 
3373  Specific for the case where len(ref_chains) > len(mdl_chains)
3374  Ref chains without mapped mdl chain are assigned None
3375  """
3376  n_ref = len(ref_chains)
3377  n_mdl = len(mdl_chains)
3378  for c in itertools.combinations(range(n_ref), n_mdl):
3379  for p in itertools.permutations(mdl_chains):
3380  ret_list = [None] * n_ref
3381  for idx, ch in zip(c, p):
3382  ret_list[idx] = ch
3383  yield ret_list
3384 
3385 def _RefEqualGenerator(ref_chains, mdl_chains):
3386  """ Returns all possible ways to map mdl_chains onto ref_chains
3387 
3388  Specific for the case where len(ref_chains) == len(mdl_chains)
3389  """
3390  for p in itertools.permutations(mdl_chains):
3391  yield list(p)
3392 
3393 def _ConcatIterators(iterators):
3394  for item in itertools.product(*iterators):
3395  yield list(item)
3396 
3397 def _ChainMappings(ref_chains, mdl_chains, n_max=None):
3398  """Returns all possible ways to map *mdl_chains* onto fixed *ref_chains*
3399 
3400  :param ref_chains: List of list of chemically equivalent chains in reference
3401  :type ref_chains: :class:`list` of :class:`list`
3402  :param mdl_chains: Equally long list of list of chemically equivalent chains
3403  in model that map on those ref chains.
3404  :type mdl_chains: :class:`list` of :class:`list`
3405  :param n_max: Aborts and raises :class:`RuntimeError` if max number of
3406  mappings is above this threshold.
3407  :type n_max: :class:`int`
3408  :returns: Iterator over all possible mappings of *mdl_chains* onto fixed
3409  *ref_chains*. Potentially contains None as padding when number of
3410  model chains for a certain mapping is smaller than the according
3411  reference chains.
3412  Example: _ChainMappings([['A', 'B', 'C'], ['D', 'E']],
3413  [['x', 'y'], ['i', 'j']])
3414  gives an iterator over: [[['x', 'y', None], ['i', 'j']],
3415  [['x', 'y', None], ['j', 'i']],
3416  [['y', 'x', None], ['i', 'j']],
3417  [['y', 'x', None], ['j', 'i']],
3418  [['x', None, 'y'], ['i', 'j']],
3419  [['x', None, 'y'], ['j', 'i']],
3420  [['y', None, 'x'], ['i', 'j']],
3421  [['y', None, 'x'], ['j', 'i']],
3422  [[None, 'x', 'y'], ['i', 'j']],
3423  [[None, 'x', 'y'], ['j', 'i']],
3424  [[None, 'y', 'x'], ['i', 'j']],
3425  [[None, 'y', 'x'], ['j', 'i']]]
3426  """
3427  assert(len(ref_chains) == len(mdl_chains))
3428 
3429  if n_max is not None:
3430  if not _NMappingsWithin(ref_chains, mdl_chains, n_max):
3431  raise RuntimeError(f"Too many mappings. Max allowed: {n_max}")
3432 
3433  # one iterator per mapping representing all mdl combinations relative to
3434  # reference
3435  iterators = list()
3436  for ref, mdl in zip(ref_chains, mdl_chains):
3437  if len(ref) == 0:
3438  raise RuntimeError("Expext at least one chain in ref chem group")
3439  if len(ref) == len(mdl):
3440  iterators.append(_RefEqualGenerator(ref, mdl))
3441  elif len(ref) < len(mdl):
3442  iterators.append(_RefSmallerGenerator(ref, mdl))
3443  else:
3444  iterators.append(_RefLargerGenerator(ref, mdl))
3445 
3446  return _ConcatIterators(iterators)
3447 
3448 
3449 def _GetTransform(pos_one, pos_two, iterative):
3450  """ Computes minimal RMSD superposition for pos_one onto pos_two
3451 
3452  :param pos_one: Positions that should be superposed onto *pos_two*
3453  :type pos_one: :class:`geom.Vec3List`
3454  :param pos_two: Reference positions
3455  :type pos_two: :class:`geom.Vec3List`
3456  :iterative: Whether iterative superposition should be used. Iterative
3457  potentially raises, uses standard superposition as fallback.
3458  :type iterative: :class:`bool`
3459  :returns: Transformation matrix to superpose *pos_one* onto *pos_two*
3460  :rtype: :class:`geom.Mat4`
3461  """
3462  res = None
3463  if iterative:
3464  try:
3465  res = mol.alg.IterativeSuperposeSVD(pos_one, pos_two)
3466  except:
3467  pass # triggers fallback below
3468  if res is None:
3469  res = mol.alg.SuperposeSVD(pos_one, pos_two)
3470  return res.transformation
3471 
3472 # specify public interface
3473 __all__ = ('ChainMapper', 'ReprResult', 'MappingResult')
def __init__(self, pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-5, pep_gap_ext=-2, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, resnum_aln=False)
def Align(self, s1, s2, chem_type=None)
def NWAlign(self, s1, s2, chem_type)
def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, contact_d=12.0, steep_opt_rate=None, greedy_prune_contact_map=False)
def ExtendMapping(self, mapping, max_ext=None)
def _SteepOpt(self, mapping, chains_to_optimize=None)
def __init__(self, ref, mdl, ref_mdl_alns, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0])
def lDDT(self, ref_chain_groups, mdl_chain_groups)
def IntCounts(self, ref_ch1, ref_ch2, mdl_ch1, mdl_ch2)
def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], steep_opt_rate=None)
def ExtendMapping(self, mapping, max_ext=None)
def _SteepOpt(self, mapping, chains_to_optimize=None)
def GetlDDTMapping(self, model, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic", steep_opt_rate=None, block_seed_size=5, block_blocks_per_chem_group=5, chem_mapping_result=None, heuristic_n_max_naive=40320)
def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False, only_interchain=False, chem_mapping_result=None, global_mapping=None)
def GetMapping(self, model, n_max_naive=40320)
def __init__(self, target, resnum_alignments=False, pep_seqid_thr=95., pep_gap_thr=1.0, nuc_seqid_thr=95., nuc_gap_thr=1.0, pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-11, pep_gap_ext=-1, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, min_pep_length=6, min_nuc_length=4, n_max_naive=1e8)
def GetQSScoreMapping(self, model, contact_d=12.0, strategy="heuristic", block_seed_size=5, block_blocks_per_chem_group=5, heuristic_n_max_naive=40320, steep_opt_rate=None, chem_mapping_result=None, greedy_prune_contact_map=True)
def GetRMSDMapping(self, model, strategy="heuristic", subsampling=50, chem_mapping_result=None, heuristic_n_max_naive=120)
def GetFlatMapping(self, mdl_as_key=False)
def __init__(self, target, model, chem_groups, chem_mapping, mapping, alns, opt_score=None)
def GetFlatChainMapping(self, mdl_as_key=False)
def __init__(self, lDDT, substructure, ref_view, mdl_view)
def _GetInconsistentResidues(self, ref_residues, mdl_residues)
def _InterfacePenalty1(self, interface)
Definition: qsscore.py:681
def _InterfacePenalty2(self, interface)
Definition: qsscore.py:687
def _MappedInterfaceScores(self, int1, int2)
Definition: qsscore.py:532
def FromFlatMapping(self, flat_mapping)
Definition: qsscore.py:485