OpenStructure
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
kclust.py
Go to the documentation of this file.
1 import subprocess, os, tempfile
2 from ost import io, seq, settings
3 
4 """
5 Wrapper for kClust a protein sequence clustering algorithm
6 
7 unpublished, but mentioned in:
8 
9 Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding
10 HHblits: lighting-fast iterative protein sequence searching by
11 HMM-HMM alignment
12 Nature Methods 9, 173-175 (2012)
13 
14 source code can be downloaded from:
15 
16 ftp://toolkit.genzentrum.lmu.de/pub/
17 """
18 
19 class cluster:
20 
21  """
22  Holds the information of one cluster
23 
24  .. attribute:: sequences
25 
26  SequenceList containing all sequences of the cluster
27 
28  .. attribute:: representative_id
29 
30  a string, that contains the name of the representative sequence of the cluster
31  as estimated by kClust.
32 
33  .. attribute:: alignment
34 
35  alignment handle containing a multiple sequence alignment of all sequences of
36  the cluster. Gets only calculated when binding is called with appropriate flag.
37  """
38 
39  def __init__(self, sequences, representative_id):
40 
41  self.sequences=sequences
42  self.representative_id=representative_id
43  self.alignment=None
44 
45 def _SetupFiles(sequences):
46 
47  # create temporary directory
48  tmp_dir_name=tempfile.mkdtemp()
49  io.SaveSequenceList(sequences, os.path.join(tmp_dir_name,'fastadb.fasta'))
50  return tmp_dir_name
51 
52 def _CleanupFiles(tmp_dir_name):
53 
54  import shutil
55  shutil.rmtree(tmp_dir_name)
56 
57 def _ParseOutput(tmp_dir_name):
58 
59  header_data=open(os.path.join(tmp_dir_name,'headers.dmp'),'r').readlines()
60  cluster_data=open(os.path.join(tmp_dir_name,'clusters.dmp'),'r').readlines()
61  sequences=io.LoadSequenceList(os.path.join(tmp_dir_name,'fastadb.fasta'))
62 
63  clusters=dict()
64  header_mapper=dict()
65  for line in header_data:
66  header_mapper[int(line.split()[0])]=line.split()[1].strip().strip('>')
67 
68  #find numeric ids of the representatives of the clusters
69  unique_representatives=list()
70  for line in cluster_data[1:]:
71  actual_cluster=int(line.split()[1])
72  try:
73  unique_representatives.index(actual_cluster)
74  except:
75  unique_representatives.append(actual_cluster)
76 
77  #assign every header to its corresponding cluster, where the
78  #cluster id is given by the id of the representative of the cluster
79  for idx in unique_representatives:
80  clusters[idx]=seq.CreateSequenceList()
81  for line in cluster_data[1:]:
82  clusters[int(line.split()[1])].AddSequence(sequences.FindSequence(header_mapper[int(line.split()[0])]))
83 
84  #translate into final output
85 
86  res=list()
87  for k, v in clusters.iteritems():
88  res.append(cluster(v, header_mapper[k]))
89 
90  return res
91 
92 def _RunkClust(tmp_dir_name, clustering_thresh, create_alignments):
93 
94  bitscore=clustering_thresh*0.060269-0.68498
95 
96  executable=settings.Locate('kClust')
97 
98  cmd=[]
99  cmd.append(executable)
100  cmd.append('-i')
101  cmd.append(os.path.join(tmp_dir_name,'fastadb.fasta'))
102  cmd.append('-d')
103  cmd.append(tmp_dir_name)
104  cmd.append('-s')
105  cmd.append(str(bitscore))
106 
107  cmd=' '.join(cmd)
108  ps=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
109  stdout, stderr = ps.communicate()
110 
111  result=_ParseOutput(tmp_dir_name)
112 
113  if(create_alignments):
114  from ost.bindings import clustalw
115  for c in result:
116  if len(c.sequences)>1:
117  c.alignment=clustalw.ClustalW(c.sequences)
118  else:
119  aln=seq.CreateAlignment()
120  aln.AddSequence(c.sequences[0])
121  c.alignment=aln
122 
123  return result
124 
125 def kClust(sequences, clustering_thresh=30, create_alignments=False):
126 
127  """
128  Uses kClust to generate clusters of amino acid sequences.
129 
130  :param sequences: All sequences you want to cluster.
131  :type sequences: :class:`ost.seq.SequenceList`
132 
133  :param clustering_thres: Sequence identity threshold to build clusters. Note,
134  that clustering_thresh is more a rule of thumb, since
135  compositional bias in the sequence can also play a role.
136  The value gets transformed in a bitscore, that is used
137  as an input parameter of kClust
138 
139  :param create_alignments: Flag, wether the alignments of the clusters get calculated.
140  Requires clustalw in the path.
141 
142  :returns: A list of cluster instances
143 
144  :raises: :class:`~ost.settings.FileNotFound` if kClust could not be located.
145  """
146 
147  tmp_dir_name=_SetupFiles(sequences)
148  result=_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)
149  _CleanupFiles(tmp_dir_name)
150  return result