2 Chain mapping aims to identify a one-to-one relationship between chains in a
3 reference structure and a model.
11 from scipy.special
import factorial
12 from scipy.special
import binom
23 def _CSel(ent, cnames):
24 """ Returns view with specified chains
26 Ensures that quotation marks are around chain names to not confuse
27 OST query language with weird special characters.
29 query =
"cname=" +
','.join([mol.QueryQuoteName(cname)
for cname
in cnames])
30 return ent.Select(query)
33 """ Result object for the chain mapping functions in :class:`ChainMapper`
35 Constructor is directly called within the functions, no need to construct
36 such objects yourself.
38 def __init__(self, target, model, chem_groups, chem_mapping, mapping, alns,
45 self.
_alns_alns = alns
50 """ Target/reference structure, i.e. :attr:`ChainMapper.target`
52 :type: :class:`ost.mol.EntityView`
58 """ Model structure that gets mapped onto :attr:`~target`
60 Underwent same processing as :attr:`ChainMapper.target`, i.e.
61 only contains peptide/nucleotide chains of sufficient size.
63 :type: :class:`ost.mol.EntityView`
69 """ Groups of chemically equivalent chains in :attr:`~target`
71 Same as :attr:`ChainMapper.chem_group`
73 :class:`list` of :class:`list` of :class:`str` (chain names)
79 """ Assigns chains in :attr:`~model` to :attr:`~chem_groups`.
81 :class:`list` of :class:`list` of :class:`str` (chain names)
87 """ Mapping of :attr:`~model` chains onto :attr:`~target`
89 Exact same shape as :attr:`~chem_groups` but containing the names of the
90 mapped chains in :attr:`~model`. May contain None for :attr:`~target`
91 chains that are not covered. No guarantee that all chains in
92 :attr:`~model` are mapped.
94 :class:`list` of :class:`list` of :class:`str` (chain names)
100 """ Alignments of mapped chains in :attr:`~target` and :attr:`~model`
102 Each alignment is accessible with ``alns[(t_chain,m_chain)]``. First
103 sequence is the sequence of :attr:`target` chain, second sequence the
104 one from :attr:`~model`. The respective :class:`ost.mol.EntityView` are
105 attached with :func:`ost.seq.ConstSequenceHandle.AttachView`.
107 :type: :class:`dict` with key: :class:`tuple` of :class:`str`, value:
108 :class:`ost.seq.AlignmentHandle`
110 return self.
_alns_alns
114 """ Placeholder property without any guarantee of being set
116 Different scores get optimized in the various chain mapping algorithms.
117 Some of them may set their final optimal score in that property.
118 Consult the documentation of the respective chain mapping algorithm
119 for more information. Won't be in the return dict of
125 """ Returns flat mapping as :class:`dict` for all mapable chains
127 :param mdl_as_key: Default is target chain name as key and model chain
128 name as value. This can be reversed with this flag.
129 :returns: :class:`dict` with :class:`str` as key/value that describe
132 flat_mapping = dict()
133 for trg_chem_group, mdl_chem_group
in zip(self.
chem_groupschem_groups,
135 for a,b
in zip(trg_chem_group, mdl_chem_group):
136 if a
is not None and b
is not None:
144 """ Returns JSON serializable summary of results
147 json_dict[
"chem_groups"] = self.
chem_groupschem_groups
148 json_dict[
"mapping"] = self.
mappingmapping
150 json_dict[
"alns"] = list()
151 for aln
in self.
alnsalns.values():
152 trg_seq = aln.GetSequence(0)
153 mdl_seq = aln.GetSequence(1)
154 aln_dict = {
"trg_ch": trg_seq.GetName(),
"trg_seq": str(trg_seq),
155 "mdl_ch": mdl_seq.GetName(),
"mdl_seq": str(mdl_seq)}
156 json_dict[
"alns"].append(aln_dict)
162 """ Result object for :func:`ChainMapper.GetRepr`
164 Constructor is directly called within the function, no need to construct
165 such objects yourself.
167 :param lDDT: lDDT for this mapping. Depends on how you call
168 :func:`ChainMapper.GetRepr` whether this is backbone only or
170 :type lDDT: :class:`float`
171 :param substructure: The full substructure for which we searched for a
173 :type substructure: :class:`ost.mol.EntityView`
174 :param ref_view: View pointing to the same underlying entity as
175 *substructure* but only contains the stuff that is mapped
176 :type ref_view: :class:`mol.EntityView`
177 :param mdl_view: The matching counterpart in model
178 :type mdl_view: :class:`mol.EntityView`
180 def __init__(self, lDDT, substructure, ref_view, mdl_view):
181 self.
_lDDT_lDDT = lDDT
183 assert(len(ref_view.residues) == len(mdl_view.residues))
205 """ lDDT of representation result
207 Depends on how you call :func:`ChainMapper.GetRepr` whether this is
208 backbone only or full atom lDDT.
210 :type: :class:`float`
212 return self.
_lDDT_lDDT
216 """ The full substructure for which we searched for a
219 :type: :class:`ost.mol.EntityView`
225 """ View which contains the mapped subset of :attr:`substructure`
227 :type: :class:`ost.mol.EntityView`
233 """ The :attr:`ref_view` representation in the model
235 :type: :class:`ost.mol.EntityView`
241 """ The reference residues
243 :type: class:`mol.ResidueViewList`
245 return self.
ref_viewref_view.residues
249 """ The model residues
251 :type: :class:`mol.ResidueViewList`
253 return self.
mdl_viewmdl_view.residues
257 """ A list of mapped residue whose names do not match (eg. ALA in the
258 reference and LEU in the model).
260 The mismatches are reported as a tuple of :class:`~ost.mol.ResidueView`
261 (reference, model), or as an empty list if all the residue names match.
272 """ Representative backbone positions for reference residues.
274 Thats CA positions for peptides and C3' positions for Nucleotides.
276 :type: :class:`geom.Vec3List`
284 """ Representative backbone positions for model residues.
286 Thats CA positions for peptides and C3' positions for Nucleotides.
288 :type: :class:`geom.Vec3List`
296 """ Representative backbone positions for reference residues.
298 Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
299 positions for Nucleotides.
301 :type: :class:`geom.Vec3List`
309 """ Representative backbone positions for reference residues.
311 Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
312 positions for Nucleotides.
314 :type: :class:`geom.Vec3List`
322 """ Transformation to superpose mdl residues onto ref residues
324 Superposition computed as minimal RMSD superposition on
325 :attr:`ref_bb_pos` and :attr:`mdl_bb_pos`. If number of positions is
326 smaller 3, the full_bb_pos equivalents are used instead.
328 :type: :class:`ost.geom.Mat4`
341 """ :attr:`mdl_bb_pos` with :attr:`transform applied`
343 :type: :class:`geom.Vec3List`
352 """ RMSD between :attr:`ref_bb_pos` and :attr:`superposed_mdl_bb_pos`
354 :type: :class:`float`
362 """ GDT with one single threshold: 8.0
364 :type: :class:`float`
366 if self.
_gdt_8_gdt_8
is None:
372 """ GDT with one single threshold: 4.0
374 :type: :class:`float`
376 if self.
_gdt_4_gdt_4
is None:
382 """ GDT with one single threshold: 2.0
384 :type: :class:`float`
386 if self.
_gdt_2_gdt_2
is None:
392 """ GDT with one single threshold: 1.0
394 :type: :class:`float`
396 if self.
_gdt_1_gdt_1
is None:
402 """ query for mdl residues in OpenStructure query language
404 Repr can be selected as ``full_mdl.Select(ost_query)``
406 Returns invalid query if residue numbers have insertion codes.
413 chname = r.GetChain().GetName()
414 rnum = r.GetNumber().GetNum()
415 if chname
not in chain_rnums:
416 chain_rnums[chname] = list()
417 chain_rnums[chname].append(str(rnum))
418 chain_queries = list()
419 for k,v
in chain_rnums.items():
420 q = f
"(cname={mol.QueryQuoteName(k)} and "
421 q += f
"rnum={','.join(v)})"
422 chain_queries.append(q)
423 self.
_ost_query_ost_query =
" or ".join(chain_queries)
427 """ Returns JSON serializable summary of results
430 json_dict[
"lDDT"] = self.
lDDTlDDT
431 json_dict[
"ref_residues"] = [r.GetQualifiedName()
for r
in \
433 json_dict[
"mdl_residues"] = [r.GetQualifiedName()
for r
in \
435 json_dict[
"transform"] = list(self.
transformtransform.data)
436 json_dict[
"bb_rmsd"] = self.
bb_rmsdbb_rmsd
437 json_dict[
"gdt_8"] = self.
gdt_8gdt_8
438 json_dict[
"gdt_4"] = self.
gdt_4gdt_4
439 json_dict[
"gdt_2"] = self.
gdt_2gdt_2
440 json_dict[
"gdt_1"] = self.
gdt_1gdt_1
441 json_dict[
"ost_query"] = self.
ost_queryost_query
446 """ Returns flat mapping of all chains in the representation
448 :param mdl_as_key: Default is target chain name as key and model chain
449 name as value. This can be reversed with this flag.
450 :returns: :class:`dict` with :class:`str` as key/value that describe
453 flat_mapping = dict()
456 flat_mapping[mdl_res.chain.name] = trg_res.chain.name
458 flat_mapping[trg_res.chain.name] = mdl_res.chain.name
461 def _GetFullBBPos(self, residues):
462 """ Helper to extract full backbone positions
464 exp_pep_atoms = [
"N",
"CA",
"C"]
465 exp_nuc_atoms = [
"\"O5'\"",
"\"C5'\"",
"\"C4'\"",
"\"C3'\"",
"\"O3'\""]
468 if r.GetChemType() == mol.ChemType.NUCLEOTIDES:
469 exp_atoms = exp_nuc_atoms
470 elif r.GetChemType() == mol.ChemType.AMINOACIDS:
471 exp_atoms = exp_pep_atoms
473 raise RuntimeError(
"Something terrible happened... RUN...")
474 for aname
in exp_atoms:
475 a = r.FindAtom(aname)
477 raise RuntimeError(
"Something terrible happened... "
479 bb_pos.append(a.GetPos())
482 def _GetBBPos(self, residues):
483 """ Helper to extract single representative position for each residue
487 at = r.FindAtom(
"CA")
489 at = r.FindAtom(
"C3'")
491 raise RuntimeError(
"Something terrible happened... RUN...")
492 bb_pos.append(at.GetPos())
495 def _GetInconsistentResidues(self, ref_residues, mdl_residues):
496 """ Helper to extract a list of inconsistent residues.
498 if len(ref_residues) != len(mdl_residues):
499 raise ValueError(
"Something terrible happened... Reference and "
500 "model lengths differ... RUN...")
501 inconsistent_residues = list()
502 for ref_residue, mdl_residue
in zip(ref_residues, mdl_residues):
503 if ref_residue.name != mdl_residue.name:
504 inconsistent_residues.append((ref_residue, mdl_residue))
505 return inconsistent_residues
509 """ Class to compute chain mappings
511 All algorithms are performed on processed structures which fulfill
512 criteria as given in constructor arguments (*min_pep_length*,
513 "min_nuc_length") and only contain residues which have all required backbone
514 atoms. for peptide residues thats N, CA, C and CB (no CB for GLY), for
515 nucleotide residues thats O5', C5', C4', C3' and O3'.
517 Chain mapping is a three step process:
519 * Group chemically identical chains in *target* using pairwise
520 alignments that are either computed with Needleman-Wunsch (NW) or
521 simply derived from residue numbers (*resnum_alignments* flag).
522 In case of NW, *pep_subst_mat*, *pep_gap_open* and *pep_gap_ext*
523 and their nucleotide equivalents are relevant. Two chains are
524 considered identical if they fulfill the thresholds given by
525 *pep_seqid_thr*, *pep_gap_thr*, their nucleotide equivalents
526 respectively. The grouping information is available as
527 attributes of this class.
529 * Map chains in an input model to these groups. Generating alignments
530 and the similarity criteria are the same as above. You can either
531 get the group mapping with :func:`GetChemMapping` or directly call
532 one of the full fletched one-to-one chain mapping functions which
533 execute that step internally.
535 * Obtain one-to-one mapping for chains in an input model and
536 *target* with one of the available mapping functions. Just to get an
537 idea of complexity. If *target* and *model* are octamers, there are
538 ``8! = 40320`` possible chain mappings.
540 :param target: Target structure onto which models are mapped.
541 Computations happen on a selection only containing
542 polypeptides and polynucleotides.
543 :type target: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
544 :param resnum_alignments: Use residue numbers instead of
545 Needleman-Wunsch to compute pairwise
546 alignments. Relevant for :attr:`~chem_groups`
547 and related attributes.
548 :type resnum_alignments: :class:`bool`
549 :param pep_seqid_thr: Threshold used to decide when two chains are
550 identical. 95 percent tolerates the few mutations
551 crystallographers like to do.
552 :type pep_seqid_thr: :class:`float`
553 :param pep_gap_thr: Additional threshold to avoid gappy alignments with
554 high seqid. By default this is disabled (set to 1.0).
555 This threshold checks for a maximum allowed fraction
556 of gaps in any of the two sequences after stripping
557 terminal gaps. The reason for not just normalizing
558 seqid by the longer sequence is that one sequence
559 might be a perfect subsequence of the other but only
561 :type pep_gap_thr: :class:`float`
562 :param nuc_seqid_thr: Nucleotide equivalent for *pep_seqid_thr*
563 :type nuc_seqid_thr: :class:`float`
564 :param nuc_gap_thr: Nucleotide equivalent for *nuc_gap_thr*
565 :type nuc_gap_thr: :class:`float`
566 :param pep_subst_mat: Substitution matrix to align peptide sequences,
567 irrelevant if *resnum_alignments* is True,
568 defaults to seq.alg.BLOSUM62
569 :type pep_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
570 :param pep_gap_open: Gap open penalty to align peptide sequences,
571 irrelevant if *resnum_alignments* is True
572 :type pep_gap_open: :class:`int`
573 :param pep_gap_ext: Gap extension penalty to align peptide sequences,
574 irrelevant if *resnum_alignments* is True
575 :type pep_gap_ext: :class:`int`
576 :param nuc_subst_mat: Nucleotide equivalent for *pep_subst_mat*,
577 defaults to seq.alg.NUC44
578 :type nuc_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
579 :param nuc_gap_open: Nucleotide equivalent for *pep_gap_open*
580 :type nuc_gap_open: :class:`int`
581 :param nuc_gap_ext: Nucleotide equivalent for *pep_gap_ext*
582 :type nuc_gap_ext: :class:`int`
583 :param min_pep_length: Minimal number of residues for a peptide chain to be
584 considered in target and in models.
585 :type min_pep_length: :class:`int`
586 :param min_nuc_length: Minimal number of residues for a nucleotide chain to be
587 considered in target and in models.
588 :type min_nuc_length: :class:`int`
589 :param n_max_naive: Max possible chain mappings that are enumerated in
590 :func:`~GetNaivelDDTMapping` /
591 :func:`~GetDecomposerlDDTMapping`. A
592 :class:`RuntimeError` is raised in case of bigger
594 :type n_max_naive: :class:`int`
596 def __init__(self, target, resnum_alignments=False,
597 pep_seqid_thr = 95., pep_gap_thr = 1.0,
598 nuc_seqid_thr = 95., nuc_gap_thr = 1.0,
599 pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -11,
600 pep_gap_ext = -1, nuc_subst_mat = seq.alg.NUC44,
601 nuc_gap_open = -4, nuc_gap_ext = -4,
602 min_pep_length = 6, min_nuc_length = 4,
623 pep_subst_mat = pep_subst_mat,
624 pep_gap_open = pep_gap_open,
625 pep_gap_ext = pep_gap_ext,
626 nuc_subst_mat = nuc_subst_mat,
627 nuc_gap_open = nuc_gap_open,
628 nuc_gap_ext = nuc_gap_ext)
631 self._target, self._polypep_seqs, self.
_polynuc_seqs_polynuc_seqs = \
636 """Target structure that only contains peptides/nucleotides
638 Contains only residues that have the backbone representatives
639 (CA for peptide and C3' for nucleotides) to avoid ATOMSEQ alignment
640 inconsistencies when switching between all atom and backbone only
643 :type: :class:`ost.mol.EntityView`
649 """Sequences of peptide chains in :attr:`~target`
651 Respective :class:`EntityView` from *target* for each sequence s are
652 available as ``s.GetAttachedView()``
654 :type: :class:`ost.seq.SequenceList`
656 return self._polypep_seqs
660 """Sequences of nucleotide chains in :attr:`~target`
662 Respective :class:`EntityView` from *target* for each sequence s are
663 available as ``s.GetAttachedView()``
665 :type: :class:`ost.seq.SequenceList`
671 """Groups of chemically equivalent chains in :attr:`~target`
673 First chain in group is the one with longest sequence.
675 :getter: Computed on first use (cached)
676 :type: :class:`list` of :class:`list` of :class:`str` (chain names)
681 self.
_chem_groups_chem_groups.append([s.GetName()
for s
in a.sequences])
686 """MSA for each group in :attr:`~chem_groups`
688 Sequences in MSAs exhibit same order as in :attr:`~chem_groups` and
689 have the respective :class:`ost.mol.EntityView` from *target* attached.
691 :getter: Computed on first use (cached)
692 :type: :class:`ost.seq.AlignmentList`
707 """Reference (longest) sequence for each group in :attr:`~chem_groups`
709 Respective :class:`EntityView` from *target* for each sequence s are
710 available as ``s.GetAttachedView()``
712 :getter: Computed on first use (cached)
713 :type: :class:`ost.seq.SequenceList`
718 s = seq.CreateSequence(a.GetSequence(0).GetName(),
719 a.GetSequence(0).GetGaplessString())
720 s.AttachView(a.GetSequence(0).GetAttachedView())
726 """ChemType of each group in :attr:`~chem_groups`
728 Specifying if groups are poly-peptides/nucleotides, i.e.
729 :class:`ost.mol.ChemType.AMINOACIDS` or
730 :class:`ost.mol.ChemType.NUCLEOTIDES`
732 :getter: Computed on first use (cached)
733 :type: :class:`list` of :class:`ost.mol.ChemType`
747 """Maps sequences in *model* to chem_groups of target
749 :param model: Model from which to extract sequences, a
750 selection that only includes peptides and nucleotides
751 is performed and returned along other results.
752 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
753 :returns: Tuple with two lists of length `len(self.chem_groups)` and
754 an :class:`ost.mol.EntityView` representing *model*:
755 1) Each element is a :class:`list` with mdl chain names that
756 map to the chem group at that position.
757 2) Each element is a :class:`ost.seq.AlignmentList` aligning
758 these mdl chain sequences to the chem group ref sequences.
759 3) A selection of *model* that only contains polypeptides and
760 polynucleotides whose ATOMSEQ exactly matches the sequence
761 info in the returned alignments.
763 mdl, mdl_pep_seqs, mdl_nuc_seqs = self.
ProcessStructureProcessStructure(model)
764 mapping = [list()
for x
in self.
chem_groupschem_groups]
765 alns = [seq.AlignmentList()
for x
in self.
chem_groupschem_groups]
767 for s
in mdl_pep_seqs:
770 s, mol.ChemType.AMINOACIDS,
773 mapping[idx].append(s.GetName())
774 alns[idx].append(aln)
776 for s
in mdl_nuc_seqs:
779 s, mol.ChemType.NUCLEOTIDES,
782 mapping[idx].append(s.GetName())
783 alns[idx].append(aln)
785 return (mapping, alns, mdl)
789 thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic",
790 steep_opt_rate = None, block_seed_size = 5,
791 block_blocks_per_chem_group = 5,
792 chem_mapping_result = None,
793 heuristic_n_max_naive = 40320):
794 """ Identify chain mapping by optimizing lDDT score
796 Maps *model* chain sequences to :attr:`~chem_groups` and find mapping
797 based on backbone only lDDT score (CA for amino acids C3' for
800 Either performs a naive search, i.e. enumerate all possible mappings or
801 executes a greedy strategy that tries to identify a (close to) optimal
802 mapping in an iterative way by starting from a start mapping (seed). In
803 each iteration, the one-to-one mapping that leads to highest increase
804 in number of conserved contacts is added with the additional requirement
805 that this added mapping must have non-zero interface counts towards the
806 already mapped chains. So basically we're "growing" the mapped structure
807 by only adding connected stuff.
809 The available strategies:
811 * **naive**: Enumerates all possible mappings and returns best
813 * **greedy_fast**: perform all vs. all single chain lDDTs within the
814 respective ref/mdl chem groups. The mapping with highest number of
815 conserved contacts is selected as seed for greedy extension
817 * **greedy_full**: try multiple seeds for greedy extension, i.e. try
818 all ref/mdl chain combinations within the respective chem groups and
819 retain the mapping leading to the best lDDT.
821 * **greedy_block**: try multiple seeds for greedy extension, i.e. try
822 all ref/mdl chain combinations within the respective chem groups and
823 extend them to *block_seed_size*. *block_blocks_per_chem_group*
824 for each chem group are selected for exhaustive extension.
826 * **heuristic**: Uses *naive* strategy if number of possible mappings
827 is within *heuristic_n_max_naive*. The default of 40320 corresponds
828 to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
829 6!*2!=1440 etc. If the number of possible mappings is larger,
830 *greedy_full* is used.
832 Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
835 :param model: Model to map
836 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
837 :param inclusion_radius: Inclusion radius for lDDT
838 :type inclusion_radius: :class:`float`
839 :param thresholds: Thresholds for lDDT
840 :type thresholds: :class:`list` of :class:`float`
841 :param strategy: Strategy to find mapping. Must be in ["naive",
842 "greedy_fast", "greedy_full", "greedy_block"]
843 :type strategy: :class:`str`
844 :param steep_opt_rate: Only relevant for greedy strategies.
845 If set, every *steep_opt_rate* mappings, a simple
846 optimization is executed with the goal of
847 avoiding local minima. The optimization
848 iteratively checks all possible swaps of mappings
849 within their respective chem groups and accepts
850 swaps that improve lDDT score. Iteration stops as
851 soon as no improvement can be achieved anymore.
852 :type steep_opt_rate: :class:`int`
853 :param block_seed_size: Param for *greedy_block* strategy - Initial seeds
854 are extended by that number of chains.
855 :type block_seed_size: :class:`int`
856 :param block_blocks_per_chem_group: Param for *greedy_block* strategy -
857 Number of blocks per chem group that
858 are extended in an initial search
859 for high scoring local solutions.
860 :type block_blocks_per_chem_group: :class:`int`
861 :param chem_mapping_result: Pro param. The result of
862 :func:`~GetChemMapping` where you provided
863 *model*. If set, *model* parameter is not
865 :type chem_mapping_result: :class:`tuple`
866 :returns: A :class:`MappingResult`
869 strategies = [
"naive",
"greedy_fast",
"greedy_full",
"greedy_block",
871 if strategy
not in strategies:
872 raise RuntimeError(f
"Strategy must be in {strategies}")
874 if chem_mapping_result
is None:
875 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
877 chem_mapping, chem_group_alns, mdl = chem_mapping_result
879 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
885 one_to_one = _CheckOneToOneMapping(self.
chem_groupschem_groups, chem_mapping)
886 if one_to_one
is not None:
888 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, one_to_one):
889 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
890 if ref_ch
is not None and mdl_ch
is not None:
891 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
892 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
893 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
894 alns[(ref_ch, mdl_ch)] = aln
898 if strategy ==
"heuristic":
899 if _NMappingsWithin(self.
chem_groupschem_groups, chem_mapping,
900 heuristic_n_max_naive):
903 strategy =
"greedy_full"
908 if strategy ==
"naive":
909 mapping, opt_lddt = _lDDTNaive(self.
targettarget, mdl, inclusion_radius,
911 chem_mapping, ref_mdl_alns,
916 chem_mapping, ref_mdl_alns,
917 inclusion_radius=inclusion_radius,
918 thresholds=thresholds,
919 steep_opt_rate=steep_opt_rate)
920 if strategy ==
"greedy_fast":
921 mapping = _lDDTGreedyFast(the_greed)
922 elif strategy ==
"greedy_full":
923 mapping = _lDDTGreedyFull(the_greed)
924 elif strategy ==
"greedy_block":
925 mapping = _lDDTGreedyBlock(the_greed, block_seed_size,
926 block_blocks_per_chem_group)
928 opt_lddt = the_greed.lDDT(self.
chem_groupschem_groups, mapping)
931 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, mapping):
932 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
933 if ref_ch
is not None and mdl_ch
is not None:
934 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
935 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
936 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
937 alns[(ref_ch, mdl_ch)] = aln
940 mapping, alns, opt_score = opt_lddt)
944 block_seed_size = 5, block_blocks_per_chem_group = 5,
945 heuristic_n_max_naive = 40320, steep_opt_rate = None,
946 chem_mapping_result = None,
947 greedy_prune_contact_map = True):
948 """ Identify chain mapping based on QSScore
950 Scoring is based on CA/C3' positions which are present in all chains of
951 a :attr:`chem_groups` as well as the *model* chains which are mapped to
952 that respective chem group.
954 The following strategies are available:
956 * **naive**: Naively iterate all possible mappings and return best based
959 * **greedy_fast**: perform all vs. all single chain lDDTs within the
960 respective ref/mdl chem groups. The mapping with highest number of
961 conserved contacts is selected as seed for greedy extension.
962 Extension is based on QS-score.
964 * **greedy_full**: try multiple seeds for greedy extension, i.e. try
965 all ref/mdl chain combinations within the respective chem groups and
966 retain the mapping leading to the best QS-score.
968 * **greedy_block**: try multiple seeds for greedy extension, i.e. try
969 all ref/mdl chain combinations within the respective chem groups and
970 extend them to *block_seed_size*. *block_blocks_per_chem_group*
971 for each chem group are selected for exhaustive extension.
973 * **heuristic**: Uses *naive* strategy if number of possible mappings
974 is within *heuristic_n_max_naive*. The default of 40320 corresponds
975 to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
976 6!*2!=1440 etc. If the number of possible mappings is larger,
977 *greedy_full* is used.
979 Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
982 :param model: Model to map
983 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
984 :param contact_d: Max distance between two residues to be considered as
985 contact in qs scoring
986 :type contact_d: :class:`float`
987 :param strategy: Strategy for sampling, must be in ["naive",
988 "greedy_fast", "greedy_full", "greedy_block"]
989 :type strategy: :class:`str`
990 :param chem_mapping_result: Pro param. The result of
991 :func:`~GetChemMapping` where you provided
992 *model*. If set, *model* parameter is not
994 :type chem_mapping_result: :class:`tuple`
995 :param greedy_prune_contact_map: Relevant for all strategies that use
996 greedy extensions. If True, only chains
997 with at least 3 contacts (8A CB
998 distance) towards already mapped chains
999 in trg/mdl are considered for
1000 extension. All chains that give a
1001 potential non-zero QS-score increase
1002 are used otherwise (at least one
1003 contact within 12A). The consequence
1004 is reduced runtime and usually no
1005 real reduction in accuracy.
1006 :returns: A :class:`MappingResult`
1009 strategies = [
"naive",
"greedy_fast",
"greedy_full",
"greedy_block",
"heuristic"]
1010 if strategy
not in strategies:
1011 raise RuntimeError(f
"strategy must be {strategies}")
1013 if chem_mapping_result
is None:
1014 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
1016 chem_mapping, chem_group_alns, mdl = chem_mapping_result
1017 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
1022 one_to_one = _CheckOneToOneMapping(self.
chem_groupschem_groups, chem_mapping)
1023 if one_to_one
is not None:
1025 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, one_to_one):
1026 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1027 if ref_ch
is not None and mdl_ch
is not None:
1028 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1029 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1030 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1031 alns[(ref_ch, mdl_ch)] = aln
1035 if strategy ==
"heuristic":
1036 if _NMappingsWithin(self.
chem_groupschem_groups, chem_mapping,
1037 heuristic_n_max_naive):
1040 strategy =
"greedy_full"
1045 if strategy ==
"naive":
1046 mapping, opt_qsscore = _QSScoreNaive(self.
targettarget, mdl,
1048 chem_mapping, ref_mdl_alns,
1054 chem_mapping, ref_mdl_alns,
1055 contact_d = contact_d,
1056 steep_opt_rate=steep_opt_rate,
1057 greedy_prune_contact_map = greedy_prune_contact_map)
1058 if strategy ==
"greedy_fast":
1059 mapping = _QSScoreGreedyFast(the_greed)
1060 elif strategy ==
"greedy_full":
1061 mapping = _QSScoreGreedyFull(the_greed)
1062 elif strategy ==
"greedy_block":
1063 mapping = _QSScoreGreedyBlock(the_greed, block_seed_size,
1064 block_blocks_per_chem_group)
1066 opt_qsscore = the_greed.Score(mapping, check=
False)
1069 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, mapping):
1070 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1071 if ref_ch
is not None and mdl_ch
is not None:
1072 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1073 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1074 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1075 alns[(ref_ch, mdl_ch)] = aln
1078 mapping, alns, opt_score = opt_qsscore)
1081 chem_mapping_result = None, heuristic_n_max_naive = 120):
1082 """Identify chain mapping based on minimal RMSD superposition
1084 Superposition and scoring is based on CA/C3' positions which are present
1085 in all chains of a :attr:`chem_groups` as well as the *model*
1086 chains which are mapped to that respective chem group.
1088 The following strategies are available:
1090 * **naive**: Naively iterate all possible mappings and return the one
1093 * **greedy_single**: perform all vs. all single chain superpositions
1094 within the respective ref/mdl chem groups to use as starting points.
1095 For each starting point, iteratively add the model/target chain pair
1096 with lowest RMSD until a full mapping is achieved. The mapping with
1097 lowest RMSD is returned.
1099 * **greedy_iterative**: Same as greedy_single_rmsd exept that the
1100 transformation gets updated with each added chain pair.
1102 * **heuristic**: Uses *naive* strategy if number of possible mappings
1103 is within *heuristic_n_max_naive*. The default of 120 corresponds
1104 to a homo-pentamer (5!=120). If the number of possible mappings is
1105 larger, *greedy_iterative* is used.
1107 :param model: Model to map
1108 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1109 :param strategy: Strategy for sampling. Must be in ["naive",
1110 "greedy_single", "greedy_iterative"]
1111 :type strategy: :class:`str`
1112 :param subsampling: If given, only an equally distributed subset
1113 of CA/C3' positions in each chain are used for
1114 superposition/scoring.
1115 :type subsampling: :class:`int`
1116 :param chem_mapping_result: Pro param. The result of
1117 :func:`~GetChemMapping` where you provided
1118 *model*. If set, *model* parameter is not
1120 :type chem_mapping_result: :class:`tuple`
1121 :returns: A :class:`MappingResult`
1124 strategies = [
"naive",
"greedy_single",
"greedy_iterative",
"heuristic"]
1126 if strategy
not in strategies:
1127 raise RuntimeError(f
"strategy must be {strategies}")
1129 if chem_mapping_result
is None:
1130 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
1132 chem_mapping, chem_group_alns, mdl = chem_mapping_result
1133 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
1139 one_to_one = _CheckOneToOneMapping(self.
chem_groupschem_groups, chem_mapping)
1140 if one_to_one
is not None:
1142 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, one_to_one):
1143 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1144 if ref_ch
is not None and mdl_ch
is not None:
1145 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1146 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1147 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1148 alns[(ref_ch, mdl_ch)] = aln
1152 trg_group_pos, mdl_group_pos = _GetRefPos(self.
targettarget, mdl,
1155 max_pos = subsampling)
1157 if strategy ==
"heuristic":
1158 if _NMappingsWithin(self.
chem_groupschem_groups, chem_mapping,
1159 heuristic_n_max_naive):
1162 strategy =
"greedy_iterative"
1166 if strategy.startswith(
"greedy"):
1169 initial_transforms = list()
1170 initial_mappings = list()
1171 for trg_pos, trg_chains, mdl_pos, mdl_chains
in zip(trg_group_pos,
1175 for t_pos, t
in zip(trg_pos, trg_chains):
1176 for m_pos, m
in zip(mdl_pos, mdl_chains):
1177 if len(t_pos) >= 3
and len(m_pos) >= 3:
1178 transform = _GetTransform(m_pos, t_pos,
False)
1179 initial_transforms.append(transform)
1180 initial_mappings.append((t,m))
1182 if strategy ==
"greedy_single":
1183 mapping = _SingleRigidRMSD(initial_transforms,
1191 elif strategy ==
"greedy_iterative":
1192 mapping = _IterativeRigidRMSD(initial_transforms,
1195 trg_group_pos, mdl_group_pos)
1196 elif strategy ==
"naive":
1197 mapping = _NaiveRMSD(self.
chem_groupschem_groups, chem_mapping,
1198 trg_group_pos, mdl_group_pos,
1202 final_mapping = list()
1204 mapped_mdl_chains = list()
1205 for ref_ch
in ref_chains:
1206 if ref_ch
in mapping:
1207 mapped_mdl_chains.append(mapping[ref_ch])
1209 mapped_mdl_chains.append(
None)
1210 final_mapping.append(mapped_mdl_chains)
1213 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, final_mapping):
1214 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1215 if ref_ch
is not None and mdl_ch
is not None:
1216 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1217 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1218 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1219 alns[(ref_ch, mdl_ch)] = aln
1222 final_mapping, alns)
1225 """ Convenience function to get mapping with currently preferred method
1227 If number of possible chain mappings is <= *n_max_naive*, a naive
1228 QS-score mapping is performed and optimal QS-score is guaranteed.
1229 For anything else, a QS-score mapping with the greedy_full strategy is
1230 performed (greedy_prune_contact_map = True). The default for
1231 *n_max_naive* of 40320 corresponds to an octamer (8!=40320). A
1232 structure with stoichiometry A6B2 would be 6!*2!=1440 etc.
1235 greedy_prune_contact_map=
True,
1236 heuristic_n_max_naive = n_max_naive)
1238 def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0,
1239 thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False,
1240 only_interchain=False, chem_mapping_result = None,
1241 global_mapping = None):
1242 """ Identify *topn* representations of *substructure* in *model*
1244 *substructure* defines a subset of :attr:`~target` for which one
1245 wants the *topn* representations in *model*. Representations are scored
1248 :param substructure: A :class:`ost.mol.EntityView` which is a subset of
1249 :attr:`~target`. Should be selected with the
1250 OpenStructure query language. Example: if you're
1251 interested in residues with number 42,43 and 85 in
1253 ``substructure=mapper.target.Select("cname=A and rnum=42,43,85")``
1254 A :class:`RuntimeError` is raised if *substructure*
1255 does not refer to the same underlying
1256 :class:`ost.mol.EntityHandle` as :attr:`~target`.
1257 :type substructure: :class:`ost.mol.EntityView`
1258 :param model: Structure in which one wants to find representations for
1260 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1261 :param topn: Max number of representations that are returned
1262 :type topn: :class:`int`
1263 :param inclusion_radius: Inclusion radius for lDDT
1264 :type inclusion_radius: :class:`float`
1265 :param thresholds: Thresholds for lDDT
1266 :type thresholds: :class:`list` of :class:`float`
1267 :param bb_only: Only consider backbone atoms in lDDT computation
1268 :type bb_only: :class:`bool`
1269 :param only_interchain: Only score interchain contacts in lDDT. Useful
1270 if you want to identify interface patches.
1271 :type only_interchain: :class:`bool`
1272 :param chem_mapping_result: Pro param. The result of
1273 :func:`~GetChemMapping` where you provided
1274 *model*. If set, *model* parameter is not
1276 :type chem_mapping_result: :class:`tuple`
1277 :param global_mapping: Pro param. Specify a global mapping result. This
1278 fully defines the desired representation in the
1279 model but extracts it and enriches it with all
1280 the nice attributes of :class:`ReprResult`.
1281 The target attribute in *global_mapping* must be
1282 of the same entity as self.target and the model
1283 attribute of *global_mapping* must be of the same
1285 :type global_mapping: :class:`MappingResult`
1286 :returns: :class:`list` of :class:`ReprResult`
1290 raise RuntimeError(
"topn must be >= 1")
1292 if global_mapping
is not None:
1294 if global_mapping.target.handle.GetHashCode() != \
1295 self.
targettarget.handle.GetHashCode():
1296 raise RuntimeError(
"global_mapping.target must be the same "
1297 "entity as self.target")
1298 if global_mapping.model.handle.GetHashCode() != \
1299 model.handle.GetHashCode():
1300 raise RuntimeError(
"global_mapping.model must be the same "
1301 "entity as model param")
1304 for r
in substructure.residues:
1305 ch_name = r.GetChain().GetName()
1306 rnum = r.GetNumber()
1307 target_r = self.
targettarget.FindResidue(ch_name, rnum)
1308 if not target_r.IsValid():
1309 raise RuntimeError(f
"substructure has residue "
1310 f
"{r.GetQualifiedName()} which is not in "
1312 if target_r.handle.GetHashCode() != r.handle.GetHashCode():
1313 raise RuntimeError(f
"substructure has residue "
1314 f
"{r.GetQualifiedName()} which has an "
1315 f
"equivalent in self.target but it does "
1316 f
"not refer to the same underlying "
1319 target_a = target_r.FindAtom(a.GetName())
1320 if not target_a.IsValid():
1321 raise RuntimeError(f
"substructure has atom "
1322 f
"{a.GetQualifiedName()} which is not "
1324 if a.handle.GetHashCode() != target_a.handle.GetHashCode():
1325 raise RuntimeError(f
"substructure has atom "
1326 f
"{a.GetQualifiedName()} which has an "
1327 f
"equivalent in self.target but it does "
1328 f
"not refer to the same underlying "
1332 ca = r.FindAtom(
"CA")
1333 c3 = r.FindAtom(
"C3'")
1335 if not ca.IsValid()
and not c3.IsValid():
1336 raise RuntimeError(
"All residues in substructure must contain "
1337 "a backbone atom named CA or C3\'")
1340 if chem_mapping_result
is None:
1341 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
1343 chem_mapping, chem_group_alns, mdl = chem_mapping_result
1344 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
1350 substructure_res_indices = dict()
1351 for ch
in substructure.chains:
1352 full_ch = self.
targettarget.FindChain(ch.GetName())
1353 idx = [full_ch.GetResidueIndex(r.GetNumber())
for r
in ch.residues]
1354 substructure_res_indices[ch.GetName()] = idx
1358 substructure_chem_groups = list()
1359 substructure_chem_mapping = list()
1361 chnames = set([ch.GetName()
for ch
in substructure.chains])
1362 for chem_group, mapping
in zip(self.
chem_groupschem_groups, chem_mapping):
1363 substructure_chem_group = [ch
for ch
in chem_group
if ch
in chnames]
1364 if len(substructure_chem_group) > 0:
1365 substructure_chem_groups.append(substructure_chem_group)
1366 substructure_chem_mapping.append(mapping)
1369 n_mapped_mdl_chains = sum([len(m)
for m
in substructure_chem_mapping])
1370 if n_mapped_mdl_chains == 0:
1375 substructure_ref_mdl_alns = dict()
1377 for ch
in mdl.chains:
1378 mdl_views[ch.GetName()] = _CSel(mdl, [ch.GetName()])
1379 for chem_group, mapping
in zip(substructure_chem_groups,
1380 substructure_chem_mapping):
1381 for ref_ch
in chem_group:
1382 for mdl_ch
in mapping:
1383 full_aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1384 ref_seq = full_aln.GetSequence(0)
1388 tmp = [
'-'] * len(full_aln)
1389 for idx
in substructure_res_indices[ref_ch]:
1390 idx_in_seq = ref_seq.GetPos(idx)
1391 tmp[idx_in_seq] = ref_seq[idx_in_seq]
1392 ref_seq = seq.CreateSequence(ref_ch,
''.join(tmp))
1393 ref_seq.AttachView(_CSel(substructure, [ref_ch]))
1394 mdl_seq = full_aln.GetSequence(1)
1395 mdl_seq = seq.CreateSequence(mdl_seq.GetName(),
1396 mdl_seq.GetString())
1397 mdl_seq.AttachView(mdl_views[mdl_ch])
1398 aln = seq.CreateAlignment()
1399 aln.AddSequence(ref_seq)
1400 aln.AddSequence(mdl_seq)
1401 substructure_ref_mdl_alns[(ref_ch, mdl_ch)] = aln
1404 inclusion_radius = inclusion_radius,
1406 scored_mappings = list()
1410 flat_mapping = global_mapping.GetFlatMapping()
1412 for chem_group, chem_mapping
in zip(substructure_chem_groups,
1413 substructure_chem_mapping):
1414 chem_group_mapping = list()
1415 for ch
in chem_group:
1416 if ch
in flat_mapping:
1417 mdl_ch = flat_mapping[ch]
1418 if mdl_ch
in chem_mapping:
1419 chem_group_mapping.append(mdl_ch)
1421 chem_group_mapping.append(
None)
1423 chem_group_mapping.append(
None)
1424 mapping.append(chem_group_mapping)
1425 mappings = [mapping]
1427 mappings = list(_ChainMappings(substructure_chem_groups,
1428 substructure_chem_mapping,
1431 for mapping
in mappings:
1433 lddt_chain_mapping = dict()
1436 for ref_chem_group, mdl_chem_group
in zip(substructure_chem_groups,
1438 for ref_ch, mdl_ch
in zip(ref_chem_group, mdl_chem_group):
1440 if mdl_ch
is not None:
1441 lddt_chain_mapping[mdl_ch] = ref_ch
1442 aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1443 lddt_alns[mdl_ch] = aln
1444 tmp = [int(c[0] !=
'-' and c[1] !=
'-')
for c
in aln]
1445 n_res_aln += sum(tmp)
1450 lDDT, _ = lddt_scorer.lDDT(mdl, thresholds=thresholds,
1451 chain_mapping=lddt_chain_mapping,
1452 residue_mapping = lddt_alns,
1453 check_resnames =
False,
1454 no_intrachain = only_interchain)
1457 ost.LogVerbose(
"No valid contacts in the reference")
1462 if len(scored_mappings) == 0:
1463 scored_mappings.append((lDDT, mapping))
1464 elif len(scored_mappings) < topn:
1465 scored_mappings.append((lDDT, mapping))
1466 scored_mappings.sort(reverse=
True, key=
lambda x: x[0])
1467 elif lDDT > scored_mappings[-1][0]:
1468 scored_mappings.append((lDDT, mapping))
1469 scored_mappings.sort(reverse=
True, key=
lambda x: x[0])
1470 scored_mappings = scored_mappings[:topn]
1474 for scored_mapping
in scored_mappings:
1475 ref_view = substructure.handle.CreateEmptyView()
1476 mdl_view = mdl.handle.CreateEmptyView()
1477 for ref_ch_group, mdl_ch_group
in zip(substructure_chem_groups,
1479 for ref_ch, mdl_ch
in zip(ref_ch_group, mdl_ch_group):
1480 if ref_ch
is not None and mdl_ch
is not None:
1481 aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1483 if col[0] !=
'-' and col[1] !=
'-':
1484 ref_view.AddResidue(col.GetResidue(0),
1485 mol.ViewAddFlag.INCLUDE_ALL)
1486 mdl_view.AddResidue(col.GetResidue(1),
1487 mol.ViewAddFlag.INCLUDE_ALL)
1488 results.append(
ReprResult(scored_mapping[0], substructure,
1489 ref_view, mdl_view))
1493 """ Returns number of possible mappings
1495 :param model: Model with chains that are mapped onto
1497 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1499 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
1500 return _NMappings(self.
chem_groupschem_groups, chem_mapping)
1503 """ Entity processing for chain mapping
1505 * Selects view containing peptide and nucleotide residues which have
1506 required backbone atoms present - for peptide residues thats
1507 N, CA, C and CB (no CB for GLY), for nucleotide residues thats
1508 O5', C5', C4', C3' and O3'.
1509 * filters view by chain lengths, see *min_pep_length* and
1510 *min_nuc_length* in constructor
1511 * Extracts atom sequences for each chain in that view
1512 * Attaches corresponding :class:`ost.mol.EntityView` to each sequence
1513 * If residue number alignments are used, strictly increasing residue
1514 numbers without insertion codes are ensured in each chain
1516 :param ent: Entity to process
1517 :type ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1518 :returns: Tuple with 3 elements: 1) :class:`ost.mol.EntityView`
1519 containing peptide and nucleotide residues 2)
1520 :class:`ost.seq.SequenceList` containing ATOMSEQ sequences
1521 for each polypeptide chain in returned view, sequences have
1522 :class:`ost.mol.EntityView` of according chains attached
1523 3) same for polynucleotide chains
1525 view = ent.CreateEmptyView()
1526 exp_pep_atoms = [
"N",
"CA",
"C",
"CB"]
1527 exp_nuc_atoms = [
"\"O5'\"",
"\"C5'\"",
"\"C4'\"",
"\"C3'\"",
"\"O3'\""]
1528 pep_query =
"peptide=true and aname=" +
','.join(exp_pep_atoms)
1529 nuc_query =
"nucleotide=true and aname=" +
','.join(exp_nuc_atoms)
1531 pep_sel = ent.Select(pep_query)
1532 for r
in pep_sel.residues:
1533 if len(r.atoms) == 4:
1534 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1535 elif r.name ==
"GLY" and len(r.atoms) == 3:
1536 atom_names = [a.GetName()
for a
in r.atoms]
1537 if sorted(atom_names) == [
"C",
"CA",
"N"]:
1538 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1540 nuc_sel = ent.Select(nuc_query)
1541 for r
in nuc_sel.residues:
1542 if len(r.atoms) == 5:
1543 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1545 polypep_seqs = seq.CreateSequenceList()
1546 polynuc_seqs = seq.CreateSequenceList()
1548 if len(view.residues) == 0:
1550 return (view, polypep_seqs, polynuc_seqs)
1552 for ch
in view.chains:
1553 n_res = len(ch.residues)
1554 n_pep = sum([r.IsPeptideLinking()
for r
in ch.residues])
1555 n_nuc = sum([r.IsNucleotideLinking()
for r
in ch.residues])
1558 if n_pep > 0
and n_nuc > 0:
1559 raise RuntimeError(f
"Must not mix peptide and nucleotide linking "
1560 f
"residues in same chain ({ch.GetName()})")
1562 if (n_pep + n_nuc) != n_res:
1563 raise RuntimeError(
"All residues must either be peptide_linking "
1564 "or nucleotide_linking")
1578 ins_codes = [r.GetNumber().GetInsCode()
for r
in ch.residues]
1579 if len(set(ins_codes)) != 1
or ins_codes[0] !=
'\0':
1580 raise RuntimeError(
"Residue numbers in input structures must not "
1581 "contain insertion codes")
1584 nums = [r.GetNumber().GetNum()
for r
in ch.residues]
1585 if not all(i < j
for i, j
in zip(nums, nums[1:])):
1586 raise RuntimeError(
"Residue numbers in input structures must be "
1587 "strictly increasing for each chain")
1589 s =
''.join([r.one_letter_code
for r
in ch.residues])
1590 s = seq.CreateSequence(ch.GetName(), s)
1591 s.AttachView(_CSel(view, [ch.GetName()]))
1593 polypep_seqs.AddSequence(s)
1594 elif n_nuc == n_res:
1595 polynuc_seqs.AddSequence(s)
1597 raise RuntimeError(
"This shouldnt happen")
1599 if len(polypep_seqs) == 0
and len(polynuc_seqs) == 0:
1600 raise RuntimeError(f
"No chain fulfilled minimum length requirement "
1601 f
"to be considered in chain mapping "
1602 f
"({self.min_pep_length} for peptide chains, "
1603 f
"{self.min_nuc_length} for nucleotide chains) "
1604 f
"- mapping failed")
1607 chain_names = [s.GetAttachedView().chains[0].name
for s
in polypep_seqs]
1608 chain_names += [s.GetAttachedView().chains[0].name
for s
in polynuc_seqs]
1609 view = _CSel(view, chain_names)
1611 return (view, polypep_seqs, polynuc_seqs)
1614 """ Access to internal sequence alignment functionality
1616 Alignment parameterization is setup at ChainMapper construction
1618 :param s1: First sequence to align - must have view attached in case
1619 of resnum_alignments
1620 :type s1: :class:`ost.seq.SequenceHandle`
1621 :param s2: Second sequence to align - must have view attached in case
1622 of resnum_alignments
1623 :type s2: :class:`ost.seq.SequenceHandle`
1624 :param stype: Type of sequences to align, must be in
1625 [:class:`ost.mol.ChemType.AMINOACIDS`,
1626 :class:`ost.mol.ChemType.NUCLEOTIDES`]
1627 :returns: Pairwise alignment of s1 and s2
1629 if stype
not in [mol.ChemType.AMINOACIDS, mol.ChemType.NUCLEOTIDES]:
1630 raise RuntimeError(
"stype must be ost.mol.ChemType.AMINOACIDS or "
1631 "ost.mol.ChemType.NUCLEOTIDES")
1632 return self.
aligneraligner.
Align(s1, s2, chem_type = stype)
1638 def __init__(self, pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -5,
1639 pep_gap_ext = -2, nuc_subst_mat = seq.alg.NUC44,
1640 nuc_gap_open = -4, nuc_gap_ext = -4, resnum_aln = False):
1641 """ Helper class to compute alignments
1643 Sets default values for substitution matrix, gap open and gap extension
1644 penalties. They are only used in default mode (Needleman-Wunsch aln).
1645 If *resnum_aln* is True, only residue numbers of views that are attached
1646 to input sequences are considered.
1656 def Align(self, s1, s2, chem_type=None):
1660 if chem_type
is None:
1661 raise RuntimeError(
"Must specify chem_type for NW alignment")
1662 return self.
NWAlignNWAlign(s1, s2, chem_type)
1665 """ Returns pairwise alignment using Needleman-Wunsch algorithm
1667 :param s1: First sequence to align
1668 :type s1: :class:`ost.seq.SequenceHandle`
1669 :param s2: Second sequence to align
1670 :type s2: :class:`ost.seq.SequenceHandle`
1671 :param chem_type: Must be in [:class:`ost.mol.ChemType.AMINOACIDS`,
1672 :class:`ost.mol.ChemType.NUCLEOTIDES`], determines
1673 substitution matrix and gap open/extension penalties
1674 :type chem_type: :class:`ost.mol.ChemType`
1675 :returns: Alignment with s1 as first and s2 as second sequence
1677 if chem_type == mol.ChemType.AMINOACIDS:
1678 return seq.alg.SemiGlobalAlign(s1, s2, self.
pep_subst_matpep_subst_mat,
1681 elif chem_type == mol.ChemType.NUCLEOTIDES:
1682 return seq.alg.SemiGlobalAlign(s1, s2, self.
nuc_subst_matnuc_subst_mat,
1686 raise RuntimeError(
"Invalid ChemType")
1690 """ Returns pairwise alignment using residue numbers of attached views
1692 Assumes that there are no insertion codes (alignment only on numerical
1693 component) and that resnums are strictly increasing (fast min/max
1694 identification). These requirements are assured if a structure has been
1695 processed by :class:`ChainMapper`.
1697 :param s1: First sequence to align, must have :class:`ost.mol.EntityView`
1699 :type s1: :class:`ost.seq.SequenceHandle`
1700 :param s2: Second sequence to align, must have :class:`ost.mol.EntityView`
1702 :type s2: :class:`ost.seq.SequenceHandle`
1704 assert(s1.HasAttachedView())
1705 assert(s2.HasAttachedView())
1706 v1 = s1.GetAttachedView()
1707 rnums1 = [r.GetNumber().GetNum()
for r
in v1.residues]
1708 v2 = s2.GetAttachedView()
1709 rnums2 = [r.GetNumber().GetNum()
for r
in v2.residues]
1711 min_num = min(rnums1[0], rnums2[0])
1712 max_num = max(rnums1[-1], rnums2[-1])
1713 aln_length = max_num - min_num + 1
1715 aln_s1 = [
'-'] * aln_length
1716 for r, rnum
in zip(v1.residues, rnums1):
1717 aln_s1[rnum-min_num] = r.one_letter_code
1719 aln_s2 = [
'-'] * aln_length
1720 for r, rnum
in zip(v2.residues, rnums2):
1721 aln_s2[rnum-min_num] = r.one_letter_code
1723 aln = seq.CreateAlignment()
1724 aln.AddSequence(seq.CreateSequence(s1.GetName(),
''.join(aln_s1)))
1725 aln.AddSequence(seq.CreateSequence(s2.GetName(),
''.join(aln_s2)))
1728 def _GetAlnPropsTwo(aln):
1729 """Returns basic properties of *aln* version two...
1731 :param aln: Alignment to compute properties
1732 :type aln: :class:`seq.AlignmentHandle`
1733 :returns: Tuple with 2 elements. 1) sequence identify in range [0, 100]
1734 considering aligned columns 2) Fraction of non-gap characters
1735 in first sequence that are covered by non-gap characters in
1738 assert(aln.GetCount() == 2)
1739 n_tot = sum([1
for col
in aln
if col[0] !=
'-'])
1740 n_aligned = sum([1
for col
in aln
if (col[0] !=
'-' and col[1] !=
'-')])
1741 return (seq.alg.SequenceIdentity(aln), float(n_aligned)/n_tot)
1743 def _GetAlnPropsOne(aln):
1745 """Returns basic properties of *aln* version one...
1747 :param aln: Alignment to compute properties
1748 :type aln: :class:`seq.AlignmentHandle`
1749 :returns: Tuple with 3 elements. 1) sequence identify in range [0, 100]
1750 considering aligned columns 2) Fraction of gaps between
1751 first and last aligned column in s1 3) same for s2.
1753 assert(aln.GetCount() == 2)
1754 n_gaps_1 = str(aln.GetSequence(0)).strip(
'-').count(
'-')
1755 n_gaps_2 = str(aln.GetSequence(1)).strip(
'-').count(
'-')
1756 gap_frac_1 = float(n_gaps_1)/len(aln.GetSequence(0).GetGaplessString())
1757 gap_frac_2 = float(n_gaps_2)/len(aln.GetSequence(1).GetGaplessString())
1758 return (seq.alg.SequenceIdentity(aln), gap_frac_1, gap_frac_2)
1760 def _GetChemGroupAlignments(pep_seqs, nuc_seqs, aligner, pep_seqid_thr=95.,
1761 pep_gap_thr=0.1, nuc_seqid_thr=95.,
1763 """Returns alignments with groups of chemically equivalent chains
1765 :param pep_seqs: List of polypeptide sequences
1766 :type pep_seqs: :class:`seq.SequenceList`
1767 :param nuc_seqs: List of polynucleotide sequences
1768 :type nuc_seqs: :class:`seq.SequenceList`
1769 :param aligner: Helper class to generate pairwise alignments
1770 :type aligner: :class:`_Aligner`
1771 :param pep_seqid_thr: Threshold used to decide when two peptide chains are
1772 identical. 95 percent tolerates the few mutations
1773 crystallographers like to do.
1774 :type pep_seqid_thr: :class:`float`
1775 :param pep_gap_thr: Additional threshold to avoid gappy alignments with high
1776 seqid. The reason for not just normalizing seqid by the
1777 longer sequence is that one sequence might be a perfect
1778 subsequence of the other but only cover half of it. This
1779 threshold checks for a maximum allowed fraction of gaps
1780 in any of the two sequences after stripping terminal gaps.
1781 :type pep_gap_thr: :class:`float`
1782 :param nuc_seqid_thr: Nucleotide equivalent of *pep_seqid_thr*
1783 :type nuc_seqid_thr: :class:`float`
1784 :param nuc_gap_thr: Nucleotide equivalent of *nuc_gap_thr*
1785 :type nuc_gap_thr: :class:`float`
1786 :returns: Tuple with first element being an AlignmentList. Each alignment
1787 represents a group of chemically equivalent chains and the first
1788 sequence is the longest. Second element is a list of equivalent
1789 length specifying the types of the groups. List elements are in
1790 [:class:`ost.ChemType.AMINOACIDS`,
1791 :class:`ost.ChemType.NUCLEOTIDES`]
1793 pep_groups = _GroupSequences(pep_seqs, pep_seqid_thr, pep_gap_thr, aligner,
1794 mol.ChemType.AMINOACIDS)
1795 nuc_groups = _GroupSequences(nuc_seqs, nuc_seqid_thr, nuc_gap_thr, aligner,
1796 mol.ChemType.NUCLEOTIDES)
1797 group_types = [mol.ChemType.AMINOACIDS] * len(pep_groups)
1798 group_types += [mol.ChemType.NUCLEOTIDES] * len(nuc_groups)
1800 groups.extend(nuc_groups)
1801 return (groups, group_types)
1803 def _GroupSequences(seqs, seqid_thr, gap_thr, aligner, chem_type):
1804 """Get list of alignments representing groups of equivalent sequences
1806 :param seqid_thr: Threshold used to decide when two chains are identical.
1807 :type seqid_thr: :class:`float`
1808 :param gap_thr: Additional threshold to avoid gappy alignments with high
1809 seqid. The reason for not just normalizing seqid by the
1810 longer sequence is that one sequence might be a perfect
1811 subsequence of the other but only cover half of it. This
1812 threshold checks for a maximum allowed fraction of gaps
1813 in any of the two sequences after stripping terminal gaps.
1814 :type gap_thr: :class:`float`
1815 :param aligner: Helper class to generate pairwise alignments
1816 :type aligner: :class:`_Aligner`
1817 :param chem_type: ChemType of seqs which is passed to *aligner*, must be in
1818 [:class:`ost.mol.ChemType.AMINOACIDS`,
1819 :class:`ost.mol.ChemType.NUCLEOTIDES`]
1820 :type chem_type: :class:`ost.mol.ChemType`
1821 :returns: A list of alignments, one alignment for each group
1822 with longest sequence (reference) as first sequence.
1823 :rtype: :class:`ost.seq.AlignmentList`
1826 for s_idx
in range(len(seqs)):
1827 matching_group =
None
1828 for g_idx
in range(len(groups)):
1829 for g_s_idx
in range(len(groups[g_idx])):
1830 aln = aligner.Align(seqs[s_idx], seqs[groups[g_idx][g_s_idx]],
1832 sid, frac_i, frac_j = _GetAlnPropsOne(aln)
1833 if sid >= seqid_thr
and frac_i < gap_thr
and frac_j < gap_thr:
1834 matching_group = g_idx
1836 if matching_group
is not None:
1839 if matching_group
is None:
1840 groups.append([s_idx])
1842 groups[matching_group].append(s_idx)
1845 sorted_groups = list()
1848 tmp = sorted([[len(seqs[i]), i]
for i
in g], reverse=
True)
1849 sorted_groups.append([x[1]
for x
in tmp])
1851 sorted_groups.append(g)
1855 aln_list = seq.AlignmentList()
1856 for g
in sorted_groups:
1859 aln_list.append(seq.CreateAlignment(seqs[g[0]]))
1862 alns = seq.AlignmentList()
1865 alns.append(aligner.Align(seqs[i], seqs[j], chem_type))
1867 aln_list.append(seq.alg.MergePairwiseAlignments(alns, seqs[i]))
1870 seq_dict = {s.GetName(): s
for s
in seqs}
1871 for aln_idx
in range(len(aln_list)):
1872 for aln_s_idx
in range(aln_list[aln_idx].GetCount()):
1873 s_name = aln_list[aln_idx].GetSequence(aln_s_idx).GetName()
1874 s = seq_dict[s_name]
1875 aln_list[aln_idx].AttachView(aln_s_idx, s.GetAttachedView())
1879 def _MapSequence(ref_seqs, ref_types, s, s_type, aligner):
1880 """Tries top map *s* onto any of the sequences in *ref_seqs*
1882 Computes alignments of *s* to each of the reference sequences of equal type
1883 and sorts them by seqid*fraction_covered (seqid: sequence identity of
1884 aligned columns in alignment, fraction_covered: Fraction of non-gap
1885 characters in reference sequence that are covered by non-gap characters in
1886 *s*). Best scoring mapping is returned.
1888 :param ref_seqs: Reference sequences
1889 :type ref_seqs: :class:`ost.seq.SequenceList`
1890 :param ref_types: Types of reference sequences, e.g.
1891 ost.mol.ChemType.AminoAcids
1892 :type ref_types: :class:`list` of :class:`ost.mol.ChemType`
1893 :param s: Sequence to map
1894 :type s: :class:`ost.seq.SequenceHandle`
1895 :param s_type: Type of *s*, only try mapping to sequences in *ref_seqs*
1896 with equal type as defined in *ref_types*
1897 :param aligner: Helper class to generate pairwise alignments
1898 :type aligner: :class:`_Aligner`
1899 :returns: Tuple with two elements. 1) index of sequence in *ref_seqs* to
1900 which *s* can be mapped 2) Pairwise sequence alignment with
1901 sequence from *ref_seqs* as first sequence. Both elements are
1902 None if no mapping can be found.
1903 :raises: :class:`RuntimeError` if mapping is ambiguous, i.e. *s*
1904 successfully maps to more than one sequence in *ref_seqs*
1906 scored_alns = list()
1907 for ref_idx, ref_seq
in enumerate(ref_seqs):
1908 if ref_types[ref_idx] == s_type:
1909 aln = aligner.Align(ref_seq, s, s_type)
1910 seqid, fraction_covered = _GetAlnPropsTwo(aln)
1911 score = seqid * fraction_covered
1912 scored_alns.append((score, ref_idx, aln))
1914 if len(scored_alns) == 0:
1917 scored_alns = sorted(scored_alns, key=
lambda x: x[0], reverse=
True)
1918 return (scored_alns[0][1], scored_alns[0][2])
1920 def _GetRefMdlAlns(ref_chem_groups, ref_chem_group_msas, mdl_chem_groups,
1921 mdl_chem_group_alns, pairs=None):
1922 """ Get all possible ref/mdl chain alignments given chem group mapping
1924 :param ref_chem_groups: :attr:`ChainMapper.chem_groups`
1925 :type ref_chem_groups: :class:`list` of :class:`list` of :class:`str`
1926 :param ref_chem_group_msas: :attr:`ChainMapper.chem_group_alignments`
1927 :type ref_chem_group_msas: :class:`ost.seq.AlignmentList`
1928 :param mdl_chem_groups: Groups of model chains that are mapped to
1929 *ref_chem_groups*. Return value of
1930 :func:`ChainMapper.GetChemMapping`.
1931 :type mdl_chem_groups: :class:`list` of :class:`list` of :class:`str`
1932 :param mdl_chem_group_alns: A pairwise sequence alignment for every chain
1933 in *mdl_chem_groups* that aligns these sequences
1934 to the respective reference sequence.
1936 :func:`ChainMapper.GetChemMapping`.
1937 :type mdl_chem_group_alns: :class:`list` of :class:`ost.seq.AlignmentList`
1938 :param pairs: Pro param - restrict return dict to specified pairs. A set of
1939 tuples in form (<trg_ch>, <mdl_ch>)
1940 :type pairs: :class:`set`
1941 :returns: A dictionary holding all possible ref/mdl chain alignments. Keys
1942 in that dictionary are tuples of the form (ref_ch, mdl_ch) and
1943 values are the respective pairwise alignments with first sequence
1944 being from ref, the second from mdl.
1948 for alns
in mdl_chem_group_alns:
1950 mdl_chain_name = aln.GetSequence(1).GetName()
1951 mdl_alns[mdl_chain_name] = aln
1955 ref_mdl_alns = dict()
1956 for ref_chains, mdl_chains, ref_aln
in zip(ref_chem_groups, mdl_chem_groups,
1957 ref_chem_group_msas):
1958 for ref_ch
in ref_chains:
1959 for mdl_ch
in mdl_chains:
1960 if pairs
is not None and (ref_ch, mdl_ch)
not in pairs:
1964 aln_list = seq.AlignmentList()
1966 s1 = ref_aln.GetSequence(0)
1967 s2 = ref_aln.GetSequence(ref_chains.index(ref_ch))
1968 aln_list.append(seq.CreateAlignment(s1, s2))
1970 aln_list.append(mdl_alns[mdl_ch])
1972 ref_seq = seq.CreateSequence(s1.GetName(),
1973 s1.GetGaplessString())
1974 merged_aln = seq.alg.MergePairwiseAlignments(aln_list,
1981 s2 = merged_aln.GetSequence(1)
1982 s3 = merged_aln.GetSequence(2)
1985 for idx
in range(len(s2)):
1986 if s2[idx] !=
'-' or s3[idx] !=
'-':
1990 for idx
in reversed(range(len(s2))):
1991 if s2[idx] !=
'-' or s3[idx] !=
'-':
1994 s2 = seq.CreateSequence(s2.GetName(), s2[a: len(s2)-b])
1995 s3 = seq.CreateSequence(s3.GetName(), s3[a: len(s3)-b])
1996 ref_mdl_alns[(ref_ch, mdl_ch)] = seq.CreateAlignment(s2, s3)
2000 def _CheckOneToOneMapping(ref_chains, mdl_chains):
2001 """ Checks whether we already have a perfect one to one mapping
2003 That means each list in *ref_chains* has exactly one element and each
2004 list in *mdl_chains* has either one element (it's mapped) or is empty
2005 (ref chain has no mapped mdl chain). Returns None if no such mapping
2008 :param ref_chains: corresponds to :attr:`ChainMapper.chem_groups`
2009 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
2010 :param mdl_chains: mdl chains mapped to chem groups in *ref_chains*, i.e.
2011 the return value of :func:`ChainMapper.GetChemMapping`
2012 :type mdl_chains: class:`list` of :class:`list` of :class:`str`
2013 :returns: A :class:`list` of :class:`list` if a one to one mapping is found,
2016 only_one_to_one =
True
2018 for ref, mdl
in zip(ref_chains, mdl_chains):
2019 if len(ref) == 1
and len(mdl) == 1:
2020 one_to_one.append(mdl)
2021 elif len(ref) == 1
and len(mdl) == 0:
2022 one_to_one.append([
None])
2024 only_one_to_one =
False
2033 def __init__(self, ref, mdl, ref_mdl_alns, inclusion_radius = 15.0,
2034 thresholds = [0.5, 1.0, 2.0, 4.0]):
2035 """ Compute backbone only lDDT scores for ref/mdl
2037 Uses the pairwise decomposable property of backbone only lDDT and
2038 implements a caching mechanism to efficiently enumerate different
2073 def _SetupScorer(self):
2074 for ch
in self.
refref.chains:
2076 query = f
"{self.inclusion_radius} <> "
2077 query += f
"[cname={mol.QueryQuoteName(ch.GetName())}] "
2078 query += f
"and cname!={mol.QueryQuoteName(ch.GetName())}"
2079 for close_ch
in self.
refref.Select(query).chains:
2080 k1 = (ch.GetName(), close_ch.GetName())
2081 k2 = (close_ch.GetName(), ch.GetName())
2084 dimer_ref = _CSel(self.
refref, [k1[0], k1[1]])
2088 self.
nn += sum([len(x)
for x
in self.
interface_scorerinterface_scorer[k1].ref_indices_ic])
2094 self.
nn += s.GetNChainContacts(ch.GetName(),
2096 self.
ref_chainsref_chains.append(ch.GetName())
2099 self.
nn += s.GetNChainContacts(close_ch.GetName(),
2101 self.
ref_chainsref_chains.append(close_ch.GetName())
2104 for ch
in self.
refref.chains:
2106 single_chain_ref = _CSel(self.
refref, [ch.GetName()])
2110 self.
ref_chainsref_chains.append(ch.GetName())
2112 def lDDT(self, ref_chain_groups, mdl_chain_groups):
2115 for ref_chains, mdl_chains
in zip(ref_chain_groups, mdl_chain_groups):
2116 for ref_ch, mdl_ch
in zip(ref_chains, mdl_chains):
2117 flat_map[ref_ch] = mdl_ch
2127 if ref_ch
in flat_map
and flat_map[ref_ch]
is not None:
2128 conserved += self.
SCCountsSCCounts(ref_ch, flat_map[ref_ch])
2132 if ref_ch1
in flat_map
and ref_ch2
in flat_map:
2133 mdl_ch1 = flat_map[ref_ch1]
2134 mdl_ch2 = flat_map[ref_ch2]
2135 if mdl_ch1
is not None and mdl_ch2
is not None:
2136 conserved += self.
IntCountsIntCounts(ref_ch1, ref_ch2, mdl_ch1,
2139 return conserved / (len(self.
thresholdsthresholds) * self.
nn)
2144 alns[mdl_ch] = self.
ref_mdl_alnsref_mdl_alns[(ref_ch, mdl_ch)]
2145 mdl_sel = _CSel(self.
mdlmdl, [mdl_ch])
2147 _,_,_,conserved,_,_,_ = s.lDDT(mdl_sel,
2148 residue_mapping=alns,
2149 return_dist_test=
True,
2151 chain_mapping={mdl_ch: ref_ch},
2152 check_resnames=
False)
2157 k1 = ((ref_ch1, ref_ch2),(mdl_ch1, mdl_ch2))
2158 k2 = ((ref_ch2, ref_ch1),(mdl_ch2, mdl_ch1))
2161 alns[mdl_ch1] = self.
ref_mdl_alnsref_mdl_alns[(ref_ch1, mdl_ch1)]
2162 alns[mdl_ch2] = self.
ref_mdl_alnsref_mdl_alns[(ref_ch2, mdl_ch2)]
2163 mdl_sel = _CSel(self.
mdlmdl, [mdl_ch1, mdl_ch2])
2165 _,_,_,conserved,_,_,_ = s.lDDT(mdl_sel,
2166 residue_mapping=alns,
2167 return_dist_test=
True,
2169 chain_mapping={mdl_ch1: ref_ch1,
2171 check_resnames=
False)
2177 def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2178 ref_mdl_alns, inclusion_radius = 15.0,
2179 thresholds = [0.5, 1.0, 2.0, 4.0],
2180 steep_opt_rate = None):
2181 """ Greedy extension of already existing but incomplete chain mappings
2183 super().
__init__(ref, mdl, ref_mdl_alns,
2184 inclusion_radius = inclusion_radius,
2185 thresholds = thresholds)
2192 assert(len(ref_chem_groups) == len(mdl_chem_groups))
2197 for g_idx, (ref_g, mdl_g)
in enumerate(zip(ref_chem_groups,
2208 for ch
in self.
mdlmdl.chains:
2209 ch_name = ch.GetName()
2211 query = f
"{d} <> [cname={mol.QueryQuoteName(ch_name)}]"
2212 query += f
" and cname !={mol.QueryQuoteName(ch_name)}"
2213 for close_ch
in self.
mdlmdl.Select(query).chains:
2214 self.
mdl_neighborsmdl_neighbors[ch_name].add(close_ch.GetName())
2219 if len(mapping) == 0:
2220 raise RuntimError(
"Mapping must contain a starting point")
2227 for ref_ch
in mapping.keys():
2228 map_targets.update(self.
neighborsneighbors[ref_ch])
2231 for ref_ch
in mapping.keys():
2232 map_targets.discard(ref_ch)
2234 if len(map_targets) == 0:
2238 free_mdl_chains = list()
2240 tmp = [x
for x
in chem_group
if x
not in mapping.values()]
2241 free_mdl_chains.append(set(tmp))
2244 newly_mapped_ref_chains = list()
2246 something_happened =
True
2247 while something_happened:
2248 something_happened=
False
2251 n_chains = len(newly_mapped_ref_chains)
2252 if n_chains > 0
and n_chains % self.
steep_opt_ratesteep_opt_rate == 0:
2253 mapping = self.
_SteepOpt_SteepOpt(mapping, newly_mapped_ref_chains)
2255 if max_ext
is not None and len(newly_mapped_ref_chains) >= max_ext:
2260 for ref_ch
in map_targets:
2262 for mdl_ch
in free_mdl_chains[chem_group_idx]:
2264 n_single = self.
SCCountsSCCounts(ref_ch, mdl_ch)
2267 for neighbor
in self.
neighborsneighbors[ref_ch]:
2268 if neighbor
in mapping
and mapping[neighbor]
in \
2270 n_inter += self.
IntCountsIntCounts(ref_ch, neighbor, mdl_ch,
2272 n = n_single + n_inter
2274 if n_inter > 0
and n > max_n:
2279 max_mapping = (ref_ch, mdl_ch)
2282 something_happened =
True
2284 mapping[max_mapping[0]] = max_mapping[1]
2288 for neighbor
in self.
neighborsneighbors[max_mapping[0]]:
2289 if neighbor
not in mapping:
2290 map_targets.add(neighbor)
2293 map_targets.remove(max_mapping[0])
2297 free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2300 newly_mapped_ref_chains.append(max_mapping[0])
2304 def _SteepOpt(self, mapping, chains_to_optimize=None):
2307 if chains_to_optimize
is None:
2308 chains_to_optimize = mapping.keys()
2311 ref_chains = [x
for x
in chains_to_optimize
if mapping[x]
is not None]
2315 for ch
in ref_chains:
2317 if chem_group_idx
in tmp:
2318 tmp[chem_group_idx].append(ch)
2320 tmp[chem_group_idx] = [ch]
2321 chem_groups = list(tmp.values())
2326 something_happened =
True
2327 while something_happened:
2328 something_happened =
False
2329 for chem_group
in chem_groups:
2330 if something_happened:
2332 for ch1, ch2
in itertools.combinations(chem_group, 2):
2333 swapped_mapping = dict(mapping)
2334 swapped_mapping[ch1] = mapping[ch2]
2335 swapped_mapping[ch2] = mapping[ch1]
2337 if score > current_lddt:
2338 something_happened =
True
2339 mapping = swapped_mapping
2340 current_lddt = score
2346 def _lDDTNaive(trg, mdl, inclusion_radius, thresholds, chem_groups,
2347 chem_mapping, ref_mdl_alns, n_max_naive):
2348 """ Naively iterates all possible chain mappings and returns the best
2356 if _NMappingsWithin(chem_groups, chem_mapping, 24):
2359 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2361 lddt_chain_mapping = dict()
2363 for ref_chem_group, mdl_chem_group
in zip(chem_groups, mapping):
2364 for ref_ch, mdl_ch
in zip(ref_chem_group, mdl_chem_group):
2366 if mdl_ch
is not None:
2367 lddt_chain_mapping[mdl_ch] = ref_ch
2368 lddt_alns[mdl_ch] = ref_mdl_alns[(ref_ch, mdl_ch)]
2369 lDDT, _ = lddt_scorer.lDDT(mdl, thresholds=thresholds,
2370 chain_mapping=lddt_chain_mapping,
2371 residue_mapping = lddt_alns,
2372 check_resnames =
False)
2373 if lDDT > best_lddt:
2374 best_mapping = mapping
2380 inclusion_radius=inclusion_radius,
2381 thresholds = thresholds)
2382 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2383 lDDT = lddt_scorer.lDDT(chem_groups, mapping)
2384 if lDDT > best_lddt:
2385 best_mapping = mapping
2388 return (best_mapping, best_lddt)
2391 def _GetSeeds(ref_chem_groups, mdl_chem_groups, mapped_ref_chains = set(),
2392 mapped_mdl_chains = set()):
2394 for ref_chains, mdl_chains
in zip(ref_chem_groups,
2396 for ref_ch
in ref_chains:
2397 if ref_ch
not in mapped_ref_chains:
2398 for mdl_ch
in mdl_chains:
2399 if mdl_ch
not in mapped_mdl_chains:
2400 seeds.append((ref_ch, mdl_ch))
2404 def _lDDTGreedyFast(the_greed):
2406 something_happened =
True
2409 while something_happened:
2410 something_happened =
False
2411 seeds = _GetSeeds(the_greed.ref_chem_groups,
2412 the_greed.mdl_chem_groups,
2413 mapped_ref_chains = set(mapping.keys()),
2414 mapped_mdl_chains = set(mapping.values()))
2419 n = the_greed.SCCounts(seed[0], seed[1])
2426 mapping[best_seed[0]] = best_seed[1]
2427 mapping = the_greed.ExtendMapping(mapping)
2428 something_happened =
True
2431 final_mapping = list()
2432 for ref_chains
in the_greed.ref_chem_groups:
2433 mapped_mdl_chains = list()
2434 for ref_ch
in ref_chains:
2435 if ref_ch
in mapping:
2436 mapped_mdl_chains.append(mapping[ref_ch])
2438 mapped_mdl_chains.append(
None)
2439 final_mapping.append(mapped_mdl_chains)
2441 return final_mapping
2444 def _lDDTGreedyFull(the_greed):
2445 """ Uses each reference chain as starting point for expansion
2448 seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2449 best_overall_score = -1.0
2450 best_overall_mapping = dict()
2455 mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2458 something_happened =
True
2459 while something_happened:
2460 something_happened =
False
2461 remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
2462 the_greed.mdl_chem_groups,
2463 mapped_ref_chains = set(mapping.keys()),
2464 mapped_mdl_chains = set(mapping.values()))
2465 if len(remnant_seeds) > 0:
2469 for remnant_seed
in remnant_seeds:
2470 tmp_mapping = dict(mapping)
2471 tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2472 tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2473 score = the_greed.lDDTFromFlatMap(tmp_mapping)
2474 if score > best_score:
2476 best_mapping = tmp_mapping
2477 if best_mapping
is not None:
2478 something_happened =
True
2479 mapping = best_mapping
2481 score = the_greed.lDDTFromFlatMap(mapping)
2482 if score > best_overall_score:
2483 best_overall_score = score
2484 best_overall_mapping = mapping
2486 mapping = best_overall_mapping
2489 final_mapping = list()
2490 for ref_chains
in the_greed.ref_chem_groups:
2491 mapped_mdl_chains = list()
2492 for ref_ch
in ref_chains:
2493 if ref_ch
in mapping:
2494 mapped_mdl_chains.append(mapping[ref_ch])
2496 mapped_mdl_chains.append(
None)
2497 final_mapping.append(mapped_mdl_chains)
2499 return final_mapping
2502 def _lDDTGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
2503 """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2504 respective chem groups and compute single chain lDDTs. The
2505 *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2506 and the best scoring one is exhaustively extended.
2509 if seed_size
is None or seed_size < 1:
2510 raise RuntimeError(f
"seed_size must be an int >= 1 (got {seed_size})")
2512 if blocks_per_chem_group
is None or blocks_per_chem_group < 1:
2513 raise RuntimeError(f
"blocks_per_chem_group must be an int >= 1 "
2514 f
"(got {blocks_per_chem_group})")
2516 max_ext = seed_size - 1
2518 ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2519 mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2522 something_happened =
True
2523 while something_happened:
2524 something_happened =
False
2525 starting_blocks = list()
2526 for ref_chains, mdl_chains
in zip(ref_chem_groups, mdl_chem_groups):
2527 if len(mdl_chains) == 0:
2529 ref_chains_copy = list(ref_chains)
2530 for i
in range(blocks_per_chem_group):
2531 if len(ref_chains_copy) == 0:
2534 for ref_ch
in ref_chains_copy:
2535 seeds += [(ref_ch, mdl_ch)
for mdl_ch
in mdl_chains]
2542 seed = dict(mapping)
2543 seed.update({s[0]: s[1]})
2544 seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2545 seed_lddt = the_greed.lDDTFromFlatMap(seed)
2546 if seed_lddt > best_score:
2547 best_score = seed_lddt
2550 if best_mapping !=
None:
2551 starting_blocks.append(best_mapping)
2552 if best_seed[0]
in ref_chains_copy:
2554 ref_chains_copy.remove(best_seed[0])
2559 for seed
in starting_blocks:
2560 seed = the_greed.ExtendMapping(seed)
2561 seed_lddt = the_greed.lDDTFromFlatMap(seed)
2562 if seed_lddt > best_lddt:
2563 best_lddt = seed_lddt
2566 if best_lddt == 0.0:
2569 something_happened =
True
2570 mapping.update(best_mapping)
2571 for ref_ch, mdl_ch
in best_mapping.items():
2572 for group_idx
in range(len(ref_chem_groups)):
2573 if ref_ch
in ref_chem_groups[group_idx]:
2574 ref_chem_groups[group_idx].remove(ref_ch)
2575 if mdl_ch
in mdl_chem_groups[group_idx]:
2576 mdl_chem_groups[group_idx].remove(mdl_ch)
2579 final_mapping = list()
2580 for ref_chains
in the_greed.ref_chem_groups:
2581 mapped_mdl_chains = list()
2582 for ref_ch
in ref_chains:
2583 if ref_ch
in mapping:
2584 mapped_mdl_chains.append(mapping[ref_ch])
2586 mapped_mdl_chains.append(
None)
2587 final_mapping.append(mapped_mdl_chains)
2589 return final_mapping
2593 def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2594 ref_mdl_alns, contact_d = 12.0,
2595 steep_opt_rate = None, greedy_prune_contact_map=False):
2596 """ Greedy extension of already existing but incomplete chain mappings
2598 super().
__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2599 contact_d = contact_d)
2605 if greedy_prune_contact_map:
2607 for p
in self.
qsent1qsent1.interacting_chains:
2608 if np.count_nonzero(self.
qsent1qsent1.PairDist(p[0], p[1])<=8) >= 3:
2613 for p
in self.
qsent2qsent2.interacting_chains:
2614 if np.count_nonzero(self.
qsent2qsent2.PairDist(p[0], p[1])<=8) >= 3:
2620 self.
neighborsneighbors = {k: set()
for k
in self.
qsent1qsent1.chain_names}
2621 for p
in self.
qsent1qsent1.interacting_chains:
2626 for p
in self.
qsent2qsent2.interacting_chains:
2630 assert(len(ref_chem_groups) == len(mdl_chem_groups))
2635 for g_idx, (ref_g, mdl_g)
in enumerate(zip(ref_chem_groups,
2647 for ch
in self.
refref.chains:
2648 single_chain_ref = _CSel(self.
refref, [ch.GetName()])
2655 alns[mdl_ch] = self.
ref_mdl_alnsref_mdl_alns[(ref_ch, mdl_ch)]
2656 mdl_sel = _CSel(self.
mdlmdl, [mdl_ch])
2658 _,_,_,conserved,_,_,_ = s.lDDT(mdl_sel,
2659 residue_mapping=alns,
2660 return_dist_test=
True,
2662 chain_mapping={mdl_ch: ref_ch},
2663 check_resnames=
False)
2669 if len(mapping) == 0:
2670 raise RuntimError(
"Mapping must contain a starting point")
2677 for ref_ch
in mapping.keys():
2678 map_targets.update(self.
neighborsneighbors[ref_ch])
2681 for ref_ch
in mapping.keys():
2682 map_targets.discard(ref_ch)
2684 if len(map_targets) == 0:
2688 free_mdl_chains = list()
2690 tmp = [x
for x
in chem_group
if x
not in mapping.values()]
2691 free_mdl_chains.append(set(tmp))
2694 newly_mapped_ref_chains = list()
2696 something_happened =
True
2697 while something_happened:
2698 something_happened=
False
2701 n_chains = len(newly_mapped_ref_chains)
2702 if n_chains > 0
and n_chains % self.
steep_opt_ratesteep_opt_rate == 0:
2703 mapping = self.
_SteepOpt_SteepOpt(mapping, newly_mapped_ref_chains)
2705 if max_ext
is not None and len(newly_mapped_ref_chains) >= max_ext:
2709 old_score = score_result.QS_global
2710 nominator = score_result.weighted_scores
2711 denominator = score_result.weight_sum + score_result.weight_extra_all
2715 for ref_ch
in map_targets:
2717 for mdl_ch
in free_mdl_chains[chem_group_idx]:
2720 nominator_diff = 0.0
2721 denominator_diff = 0.0
2722 for neighbor
in self.
neighborsneighbors[ref_ch]:
2723 if neighbor
in mapping
and mapping[neighbor]
in \
2727 int1 = (ref_ch, neighbor)
2728 int2 = (mdl_ch, mapping[neighbor])
2731 denominator_diff += b
2732 denominator_diff += d
2738 if nominator_diff > 0:
2741 new_nominator = nominator + nominator_diff
2742 new_denominator = denominator + denominator_diff
2744 if new_denominator != 0.0:
2745 new_score = new_nominator/new_denominator
2746 diff = new_score - old_score
2749 max_mapping = (ref_ch, mdl_ch)
2751 if max_mapping
is not None:
2752 something_happened =
True
2754 mapping[max_mapping[0]] = max_mapping[1]
2758 for neighbor
in self.
neighborsneighbors[max_mapping[0]]:
2759 if neighbor
not in mapping:
2760 map_targets.add(neighbor)
2763 map_targets.remove(max_mapping[0])
2767 free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2770 newly_mapped_ref_chains.append(max_mapping[0])
2774 def _SteepOpt(self, mapping, chains_to_optimize=None):
2777 if chains_to_optimize
is None:
2778 chains_to_optimize = mapping.keys()
2781 ref_chains = [x
for x
in chains_to_optimize
if mapping[x]
is not None]
2785 for ch
in ref_chains:
2787 if chem_group_idx
in tmp:
2788 tmp[chem_group_idx].append(ch)
2790 tmp[chem_group_idx] = [ch]
2791 chem_groups = list(tmp.values())
2796 current_score = score_result.QS_global
2797 something_happened =
True
2798 while something_happened:
2799 something_happened =
False
2800 for chem_group
in chem_groups:
2801 if something_happened:
2803 for ch1, ch2
in itertools.combinations(chem_group, 2):
2804 swapped_mapping = dict(mapping)
2805 swapped_mapping[ch1] = mapping[ch2]
2806 swapped_mapping[ch2] = mapping[ch1]
2808 if score_result.QS_global > current_score:
2809 something_happened =
True
2810 mapping = swapped_mapping
2811 current_score = score_result.QS_global
2816 def _QSScoreNaive(trg, mdl, chem_groups, chem_mapping, ref_mdl_alns, contact_d,
2823 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2824 score_result = qs_scorer.Score(mapping, check=
False)
2825 if score_result.QS_global > best_score:
2826 best_mapping = mapping
2827 best_score = score_result.QS_global
2828 return (best_mapping, best_score)
2831 def _QSScoreGreedyFast(the_greed):
2833 something_happened =
True
2835 while something_happened:
2836 something_happened =
False
2840 seeds = _GetSeeds(the_greed.ref_chem_groups,
2841 the_greed.mdl_chem_groups,
2842 mapped_ref_chains = set(mapping.keys()),
2843 mapped_mdl_chains = set(mapping.values()))
2845 n = the_greed.SCCounts(seed[0], seed[1])
2852 mapping[best_seed[0]] = best_seed[1]
2853 mapping = the_greed.ExtendMapping(mapping)
2854 something_happened =
True
2857 final_mapping = list()
2858 for ref_chains
in the_greed.ref_chem_groups:
2859 mapped_mdl_chains = list()
2860 for ref_ch
in ref_chains:
2861 if ref_ch
in mapping:
2862 mapped_mdl_chains.append(mapping[ref_ch])
2864 mapped_mdl_chains.append(
None)
2865 final_mapping.append(mapped_mdl_chains)
2867 return final_mapping
2870 def _QSScoreGreedyFull(the_greed):
2871 """ Uses each reference chain as starting point for expansion
2874 seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2875 best_overall_score = -1.0
2876 best_overall_mapping = dict()
2881 mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2884 something_happened =
True
2885 while something_happened:
2886 something_happened =
False
2887 remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
2888 the_greed.mdl_chem_groups,
2889 mapped_ref_chains = set(mapping.keys()),
2890 mapped_mdl_chains = set(mapping.values()))
2891 if len(remnant_seeds) > 0:
2895 for remnant_seed
in remnant_seeds:
2896 tmp_mapping = dict(mapping)
2897 tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2898 tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2899 score_result = the_greed.FromFlatMapping(tmp_mapping)
2900 if score_result.QS_global > best_score:
2901 best_score = score_result.QS_global
2902 best_mapping = tmp_mapping
2903 if best_mapping
is not None:
2904 something_happened =
True
2905 mapping = best_mapping
2907 score_result = the_greed.FromFlatMapping(mapping)
2908 if score_result.QS_global > best_overall_score:
2909 best_overall_score = score_result.QS_global
2910 best_overall_mapping = mapping
2912 mapping = best_overall_mapping
2915 final_mapping = list()
2916 for ref_chains
in the_greed.ref_chem_groups:
2917 mapped_mdl_chains = list()
2918 for ref_ch
in ref_chains:
2919 if ref_ch
in mapping:
2920 mapped_mdl_chains.append(mapping[ref_ch])
2922 mapped_mdl_chains.append(
None)
2923 final_mapping.append(mapped_mdl_chains)
2925 return final_mapping
2928 def _QSScoreGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
2929 """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2930 respective chem groups and compute single chain lDDTs. The
2931 *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2932 and the best scoring one with respect to QS score is exhaustively extended.
2935 if seed_size
is None or seed_size < 1:
2936 raise RuntimeError(f
"seed_size must be an int >= 1 (got {seed_size})")
2938 if blocks_per_chem_group
is None or blocks_per_chem_group < 1:
2939 raise RuntimeError(f
"blocks_per_chem_group must be an int >= 1 "
2940 f
"(got {blocks_per_chem_group})")
2942 max_ext = seed_size - 1
2944 ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2945 mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2949 something_happened =
True
2950 while something_happened:
2951 something_happened =
False
2952 starting_blocks = list()
2953 for ref_chains, mdl_chains
in zip(ref_chem_groups, mdl_chem_groups):
2954 if len(mdl_chains) == 0:
2956 ref_chains_copy = list(ref_chains)
2957 for i
in range(blocks_per_chem_group):
2958 if len(ref_chains_copy) == 0:
2961 for ref_ch
in ref_chains_copy:
2962 seeds += [(ref_ch, mdl_ch)
for mdl_ch
in mdl_chains]
2969 seed = dict(mapping)
2970 seed.update({s[0]: s[1]})
2971 seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2972 score_result = the_greed.FromFlatMapping(seed)
2973 if score_result.QS_global > best_score:
2974 best_score = score_result.QS_global
2977 if best_mapping !=
None:
2978 starting_blocks.append(best_mapping)
2979 if best_seed[0]
in ref_chains_copy:
2981 ref_chains_copy.remove(best_seed[0])
2986 for seed
in starting_blocks:
2987 seed = the_greed.ExtendMapping(seed)
2988 score_result = the_greed.FromFlatMapping(seed)
2989 if score_result.QS_global > best_score:
2990 best_score = score_result.QS_global
2993 if best_mapping
is not None and len(best_mapping) > len(mapping):
2996 something_happened =
True
2997 mapping.update(best_mapping)
2998 for ref_ch, mdl_ch
in best_mapping.items():
2999 for group_idx
in range(len(ref_chem_groups)):
3000 if ref_ch
in ref_chem_groups[group_idx]:
3001 ref_chem_groups[group_idx].remove(ref_ch)
3002 if mdl_ch
in mdl_chem_groups[group_idx]:
3003 mdl_chem_groups[group_idx].remove(mdl_ch)
3006 final_mapping = list()
3007 for ref_chains
in the_greed.ref_chem_groups:
3008 mapped_mdl_chains = list()
3009 for ref_ch
in ref_chains:
3010 if ref_ch
in mapping:
3011 mapped_mdl_chains.append(mapping[ref_ch])
3013 mapped_mdl_chains.append(
None)
3014 final_mapping.append(mapped_mdl_chains)
3016 return final_mapping
3018 def _SingleRigidRMSD(initial_transforms, initial_mappings, chem_groups,
3019 chem_mapping, trg_group_pos, mdl_group_pos):
3021 Takes initial transforms and sequentially adds chain pairs with lowest RMSD.
3022 The mapping from the transform that leads to lowest overall RMSD is
3025 best_mapping = dict()
3026 best_ssd = float(
"inf")
3030 for transform
in initial_transforms:
3032 mapped_mdl_chains = set()
3034 for trg_chains, mdl_chains, trg_pos, mdl_pos,
in zip(chem_groups,
3038 if len(trg_pos) == 0
or len(mdl_pos) == 0:
3042 for m_pos
in mdl_pos:
3044 t_m_pos.ApplyTransform(transform)
3045 t_mdl_pos.append(t_m_pos)
3046 for t_pos, t
in zip(trg_pos, trg_chains):
3047 for t_m_pos, m
in zip(t_mdl_pos, mdl_chains):
3048 ssd = t_pos.GetSummedSquaredDistances(t_m_pos)
3049 ssds.append((ssd, (t,m)))
3053 if p[0]
not in mapping
and p[1]
not in mapped_mdl_chains:
3054 mapping[p[0]] = p[1]
3055 mapped_mdl_chains.add(p[1])
3060 best_mapping = mapping
3064 def _IterativeRigidRMSD(initial_transforms, initial_mappings, chem_groups,
3065 chem_mapping, trg_group_pos, mdl_group_pos):
3066 """ Takes initial transforms and sequentially adds chain pairs with
3067 lowest RMSD. With each added chain pair, the transform gets updated.
3068 Thus the naming iterative. The mapping from the initial transform that
3069 leads to best overall RMSD score is returned.
3073 trg_pos_dict = dict()
3074 for trg_pos, trg_chains
in zip(trg_group_pos, chem_groups):
3075 for t_pos, t
in zip(trg_pos, trg_chains):
3076 trg_pos_dict[t] = t_pos
3077 mdl_pos_dict = dict()
3078 for mdl_pos, mdl_chains
in zip(mdl_group_pos, chem_mapping):
3079 for m_pos, m
in zip(mdl_pos, mdl_chains):
3080 mdl_pos_dict[m] = m_pos
3082 best_mapping = dict()
3083 best_rmsd = float(
"inf")
3084 for initial_transform, initial_mapping
in zip(initial_transforms,
3086 mapping = {initial_mapping[0]: initial_mapping[1]}
3087 transform =
geom.Mat4(initial_transform)
3088 mapped_trg_pos =
geom.Vec3List(trg_pos_dict[initial_mapping[0]])
3089 mapped_mdl_pos =
geom.Vec3List(mdl_pos_dict[initial_mapping[1]])
3093 trg_chain_groups = [set(group)
for group
in chem_groups]
3094 mdl_chain_groups = [set(group)
for group
in chem_mapping]
3097 for group
in trg_chain_groups:
3098 if initial_mapping[0]
in group:
3099 group.remove(initial_mapping[0])
3101 for group
in mdl_chain_groups:
3102 if initial_mapping[1]
in group:
3103 group.remove(initial_mapping[1])
3106 something_happened =
True
3107 while something_happened:
3109 something_happened=
False
3110 best_sc_mapping =
None
3111 best_sc_group_idx =
None
3112 best_sc_rmsd = float(
"inf")
3114 for trg_chains, mdl_chains
in zip(trg_chain_groups, mdl_chain_groups):
3115 for t
in trg_chains:
3116 t_pos = trg_pos_dict[t]
3117 for m
in mdl_chains:
3118 m_pos = mdl_pos_dict[m]
3120 t_m_pos.ApplyTransform(transform)
3121 rmsd = t_pos.GetRMSD(t_m_pos)
3122 if rmsd < best_sc_rmsd:
3124 best_sc_mapping = (t,m)
3125 best_sc_group_idx = group_idx
3128 if best_sc_mapping
is not None:
3129 something_happened =
True
3130 mapping[best_sc_mapping[0]] = best_sc_mapping[1]
3131 mapped_trg_pos.extend(trg_pos_dict[best_sc_mapping[0]])
3132 mapped_mdl_pos.extend(mdl_pos_dict[best_sc_mapping[1]])
3133 trg_chain_groups[best_sc_group_idx].remove(best_sc_mapping[0])
3134 mdl_chain_groups[best_sc_group_idx].remove(best_sc_mapping[1])
3136 transform = _GetTransform(mapped_mdl_pos, mapped_trg_pos,
3140 mapped_mdl_pos.ApplyTransform(transform)
3141 rmsd = mapped_trg_pos.GetRMSD(mapped_mdl_pos)
3143 if rmsd < best_rmsd:
3145 best_mapping = mapping
3149 def _NaiveRMSD(chem_groups, chem_mapping, trg_group_pos, mdl_group_pos,
3153 trg_pos_dict = dict()
3154 for trg_pos, trg_chains
in zip(trg_group_pos, chem_groups):
3155 for t_pos, t
in zip(trg_pos, trg_chains):
3156 trg_pos_dict[t] = t_pos
3157 mdl_pos_dict = dict()
3158 for mdl_pos, mdl_chains
in zip(mdl_group_pos, chem_mapping):
3159 for m_pos, m
in zip(mdl_pos, mdl_chains):
3160 mdl_pos_dict[m] = m_pos
3162 best_mapping = dict()
3163 best_rmsd = float(
"inf")
3165 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
3168 for trg_group, mdl_group
in zip(chem_groups, mapping):
3169 for trg_ch, mdl_ch
in zip(trg_group, mdl_group):
3170 if trg_ch
is not None and mdl_ch
is not None:
3171 trg_pos.extend(trg_pos_dict[trg_ch])
3172 mdl_pos.extend(mdl_pos_dict[mdl_ch])
3173 superpose_res = mol.alg.SuperposeSVD(mdl_pos, trg_pos)
3174 if superpose_res.rmsd < best_rmsd:
3175 best_rmsd = superpose_res.rmsd
3176 best_mapping = mapping
3180 for chem_group, mapping
in zip(chem_groups, best_mapping):
3181 for trg_ch, mdl_ch
in zip(chem_group, mapping):
3182 tmp[trg_ch] = mdl_ch
3187 def _GetRefPos(trg, mdl, trg_msas, mdl_alns, max_pos = None):
3188 """ Extracts reference positions which are present in trg and mdl
3193 bb_trg = trg.Select(
"aname=\"CA\",\"C3'\"")
3194 bb_mdl = mdl.Select(
"aname=\"CA\",\"C3'\"")
3198 for aln_list
in mdl_alns:
3199 if len(aln_list) > 0:
3200 tmp = aln_list[0].GetSequence(0)
3201 ref_seq = seq.CreateSequence(tmp.GetName(), tmp.GetGaplessString())
3202 mdl_msas.append(seq.alg.MergePairwiseAlignments(aln_list, ref_seq))
3204 mdl_msas.append(seq.CreateAlignment())
3209 for trg_msa, mdl_msa
in zip(trg_msas, mdl_msas):
3211 if mdl_msa.GetCount() > 0:
3213 assert(trg_msa.GetSequence(0).GetGaplessString() == \
3214 mdl_msa.GetSequence(0).GetGaplessString())
3223 trg_indices = _GetFullyCoveredIndices(trg_msa)
3224 mdl_indices = _GetFullyCoveredIndices(mdl_msa)
3227 indices = sorted(list(trg_indices.intersection(mdl_indices)))
3230 if max_pos
is not None and len(indices) > max_pos:
3231 step = int(len(indices)/max_pos)
3232 indices = [indices[i]
for i
in range(0, len(indices), step)]
3235 trg_indices = _RefIndicesToColumnIndices(trg_msa, indices)
3236 mdl_indices = _RefIndicesToColumnIndices(mdl_msa, indices)
3239 trg_pos.append(list())
3240 mdl_pos.append(list())
3241 for s_idx
in range(trg_msa.GetCount()):
3242 trg_pos[-1].append(_ExtractMSAPos(trg_msa, s_idx, trg_indices,
3245 for s_idx
in range(1, mdl_msa.GetCount()):
3246 mdl_pos[-1].append(_ExtractMSAPos(mdl_msa, s_idx, mdl_indices,
3249 return (trg_pos, mdl_pos)
3251 def _GetFullyCoveredIndices(msa):
3252 """ Helper for _GetRefPos
3254 Returns a set containing the indices relative to first sequence in msa which
3255 are fully covered in all other sequences
3266 if sum([1
for olc
in col
if olc !=
'-']) == col.GetRowCount():
3267 indices.add(ref_idx)
3272 def _RefIndicesToColumnIndices(msa, indices):
3273 """ Helper for _GetRefPos
3275 Returns a list of mapped indices. indices refer to non-gap one letter
3276 codes in the first msa sequence. The returnes mapped indices are translated
3277 to the according msa column indices
3281 for col_idx, col
in enumerate(msa):
3283 mapping[ref_idx] = col_idx
3285 return [mapping[i]
for i
in indices]
3287 def _ExtractMSAPos(msa, s_idx, indices, view):
3288 """ Helper for _GetRefPos
3290 Returns a geom.Vec3List containing positions refering to given msa sequence.
3291 => Chain with corresponding name is mapped onto sequence and the position of
3292 the first atom of each residue specified in indices is extracted.
3293 Indices refers to column indices in msa!
3295 s = msa.GetSequence(s_idx)
3296 s_v = _CSel(view, [s.GetName()])
3299 assert(len(s.GetGaplessString()) == len(s_v.residues))
3301 residue_idx = [s.GetResidueIndex(i)
for i
in indices]
3302 return geom.Vec3List([s_v.residues[i].atoms[0].pos
for i
in residue_idx])
3304 def _NChemGroupMappings(ref_chains, mdl_chains):
3305 """ Number of mappings within one chem group
3307 :param ref_chains: Reference chains
3308 :type ref_chains: :class:`list` of :class:`str`
3309 :param mdl_chains: Model chains that are mapped onto *ref_chains*
3310 :type mdl_chains: :class:`list` of :class:`str`
3311 :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3313 n_ref = len(ref_chains)
3314 n_mdl = len(mdl_chains)
3316 return factorial(n_ref)
3318 n_choose_k = binom(n_ref, n_mdl)
3319 return n_choose_k * factorial(n_mdl)
3321 n_choose_k = binom(n_mdl, n_ref)
3322 return n_choose_k * factorial(n_ref)
3324 def _NMappings(ref_chains, mdl_chains):
3325 """ Number of mappings for a full chem mapping
3327 :param ref_chains: Chem groups of reference
3328 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3329 :param mdl_chains: Model chains that map onto those chem groups
3330 :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3331 :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3333 assert(len(ref_chains) == len(mdl_chains))
3335 for a,b
in zip(ref_chains, mdl_chains):
3336 n *= _NChemGroupMappings(a,b)
3339 def _NMappingsWithin(ref_chains, mdl_chains, max_mappings):
3340 """ Check whether total number of mappings is smaller than given maximum
3342 In principle the same as :func:`_NMappings` but it stops as soon as the
3345 :param ref_chains: Chem groups of reference
3346 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3347 :param mdl_chains: Model chains that map onto those chem groups
3348 :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3349 :param max_mappings: Number of max allowed mappings
3350 :returns: Whether number of possible mappings of *mdl_chains* onto
3351 *ref_chains* is below or equal *max_mappings*.
3353 assert(len(ref_chains) == len(mdl_chains))
3355 for a,b
in zip(ref_chains, mdl_chains):
3356 n *= _NChemGroupMappings(a,b)
3357 if n > max_mappings:
3361 def _RefSmallerGenerator(ref_chains, mdl_chains):
3362 """ Returns all possible ways to map mdl_chains onto ref_chains
3364 Specific for the case where len(ref_chains) < len(mdl_chains)
3366 for c
in itertools.combinations(mdl_chains, len(ref_chains)):
3367 for p
in itertools.permutations(c):
3370 def _RefLargerGenerator(ref_chains, mdl_chains):
3371 """ Returns all possible ways to map mdl_chains onto ref_chains
3373 Specific for the case where len(ref_chains) > len(mdl_chains)
3374 Ref chains without mapped mdl chain are assigned None
3376 n_ref = len(ref_chains)
3377 n_mdl = len(mdl_chains)
3378 for c
in itertools.combinations(range(n_ref), n_mdl):
3379 for p
in itertools.permutations(mdl_chains):
3380 ret_list = [
None] * n_ref
3381 for idx, ch
in zip(c, p):
3385 def _RefEqualGenerator(ref_chains, mdl_chains):
3386 """ Returns all possible ways to map mdl_chains onto ref_chains
3388 Specific for the case where len(ref_chains) == len(mdl_chains)
3390 for p
in itertools.permutations(mdl_chains):
3393 def _ConcatIterators(iterators):
3394 for item
in itertools.product(*iterators):
3397 def _ChainMappings(ref_chains, mdl_chains, n_max=None):
3398 """Returns all possible ways to map *mdl_chains* onto fixed *ref_chains*
3400 :param ref_chains: List of list of chemically equivalent chains in reference
3401 :type ref_chains: :class:`list` of :class:`list`
3402 :param mdl_chains: Equally long list of list of chemically equivalent chains
3403 in model that map on those ref chains.
3404 :type mdl_chains: :class:`list` of :class:`list`
3405 :param n_max: Aborts and raises :class:`RuntimeError` if max number of
3406 mappings is above this threshold.
3407 :type n_max: :class:`int`
3408 :returns: Iterator over all possible mappings of *mdl_chains* onto fixed
3409 *ref_chains*. Potentially contains None as padding when number of
3410 model chains for a certain mapping is smaller than the according
3412 Example: _ChainMappings([['A', 'B', 'C'], ['D', 'E']],
3413 [['x', 'y'], ['i', 'j']])
3414 gives an iterator over: [[['x', 'y', None], ['i', 'j']],
3415 [['x', 'y', None], ['j', 'i']],
3416 [['y', 'x', None], ['i', 'j']],
3417 [['y', 'x', None], ['j', 'i']],
3418 [['x', None, 'y'], ['i', 'j']],
3419 [['x', None, 'y'], ['j', 'i']],
3420 [['y', None, 'x'], ['i', 'j']],
3421 [['y', None, 'x'], ['j', 'i']],
3422 [[None, 'x', 'y'], ['i', 'j']],
3423 [[None, 'x', 'y'], ['j', 'i']],
3424 [[None, 'y', 'x'], ['i', 'j']],
3425 [[None, 'y', 'x'], ['j', 'i']]]
3427 assert(len(ref_chains) == len(mdl_chains))
3429 if n_max
is not None:
3430 if not _NMappingsWithin(ref_chains, mdl_chains, n_max):
3431 raise RuntimeError(f
"Too many mappings. Max allowed: {n_max}")
3436 for ref, mdl
in zip(ref_chains, mdl_chains):
3438 raise RuntimeError(
"Expext at least one chain in ref chem group")
3439 if len(ref) == len(mdl):
3440 iterators.append(_RefEqualGenerator(ref, mdl))
3441 elif len(ref) < len(mdl):
3442 iterators.append(_RefSmallerGenerator(ref, mdl))
3444 iterators.append(_RefLargerGenerator(ref, mdl))
3446 return _ConcatIterators(iterators)
3449 def _GetTransform(pos_one, pos_two, iterative):
3450 """ Computes minimal RMSD superposition for pos_one onto pos_two
3452 :param pos_one: Positions that should be superposed onto *pos_two*
3453 :type pos_one: :class:`geom.Vec3List`
3454 :param pos_two: Reference positions
3455 :type pos_two: :class:`geom.Vec3List`
3456 :iterative: Whether iterative superposition should be used. Iterative
3457 potentially raises, uses standard superposition as fallback.
3458 :type iterative: :class:`bool`
3459 :returns: Transformation matrix to superpose *pos_one* onto *pos_two*
3460 :rtype: :class:`geom.Mat4`
3465 res = mol.alg.IterativeSuperposeSVD(pos_one, pos_two)
3469 res = mol.alg.SuperposeSVD(pos_one, pos_two)
3470 return res.transformation
3473 __all__ = (
'ChainMapper',
'ReprResult',
'MappingResult')
def __init__(self, pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-5, pep_gap_ext=-2, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, resnum_aln=False)
def Align(self, s1, s2, chem_type=None)
def NWAlign(self, s1, s2, chem_type)
def ResNumAlign(self, s1, s2)
def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, contact_d=12.0, steep_opt_rate=None, greedy_prune_contact_map=False)
def SCCounts(self, ref_ch, mdl_ch)
def ExtendMapping(self, mapping, max_ext=None)
def _SteepOpt(self, mapping, chains_to_optimize=None)
def __init__(self, ref, mdl, ref_mdl_alns, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0])
def lDDT(self, ref_chain_groups, mdl_chain_groups)
def lDDTFromFlatMap(self, flat_map)
def SCCounts(self, ref_ch, mdl_ch)
def IntCounts(self, ref_ch1, ref_ch2, mdl_ch1, mdl_ch2)
def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], steep_opt_rate=None)
def ExtendMapping(self, mapping, max_ext=None)
def _SteepOpt(self, mapping, chains_to_optimize=None)
def chem_group_ref_seqs(self)
def chem_group_alignments(self)
def GetlDDTMapping(self, model, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic", steep_opt_rate=None, block_seed_size=5, block_blocks_per_chem_group=5, chem_mapping_result=None, heuristic_n_max_naive=40320)
def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False, only_interchain=False, chem_mapping_result=None, global_mapping=None)
def GetNMappings(self, model)
def GetChemMapping(self, model)
def GetMapping(self, model, n_max_naive=40320)
def __init__(self, target, resnum_alignments=False, pep_seqid_thr=95., pep_gap_thr=1.0, nuc_seqid_thr=95., nuc_gap_thr=1.0, pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-11, pep_gap_ext=-1, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, min_pep_length=6, min_nuc_length=4, n_max_naive=1e8)
def ProcessStructure(self, ent)
def Align(self, s1, s2, stype)
def GetQSScoreMapping(self, model, contact_d=12.0, strategy="heuristic", block_seed_size=5, block_blocks_per_chem_group=5, heuristic_n_max_naive=40320, steep_opt_rate=None, chem_mapping_result=None, greedy_prune_contact_map=True)
def chem_group_types(self)
def GetRMSDMapping(self, model, strategy="heuristic", subsampling=50, chem_mapping_result=None, heuristic_n_max_naive=120)
def GetFlatMapping(self, mdl_as_key=False)
def __init__(self, target, model, chem_groups, chem_mapping, mapping, alns, opt_score=None)
def mdl_full_bb_pos(self)
def _GetFullBBPos(self, residues)
def superposed_mdl_bb_pos(self)
def _GetBBPos(self, residues)
def inconsistent_residues(self)
def GetFlatChainMapping(self, mdl_as_key=False)
def ref_full_bb_pos(self)
def __init__(self, lDDT, substructure, ref_view, mdl_view)
def _GetInconsistentResidues(self, ref_residues, mdl_residues)
def _InterfacePenalty1(self, interface)
def _InterfacePenalty2(self, interface)
def _MappedInterfaceScores(self, int1, int2)
def FromFlatMapping(self, flat_mapping)