OpenStructure
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
table.py
Go to the documentation of this file.
1 import csv
2 import re
3 import math
4 from ost import stutil
5 import itertools
6 import operator
7 import cPickle
8 from ost import LogError, LogWarning, LogInfo, LogVerbose
9 
def MakeTitle(col_name):
    """
    Derive a human readable title from a column name.

    Every underscore in *col_name* is turned into a single space; all other
    characters are kept as they are.
    """
    pieces = col_name.split('_')
    return ' '.join(pieces)
12 
def IsStringLike(value):
    """
    Tell whether *value* behaves like a string.

    Table columns and column expressions are never string-like. For anything
    else the test is whether concatenating an empty string succeeds.

    :returns: *True* if ``value + ''`` works, *False* otherwise
    """
    if isinstance(value, (TableCol, BinaryColExpr)):
        return False
    try:
        value + ''
    except Exception:
        # Not a bare ``except:`` so that KeyboardInterrupt and SystemExit are
        # not silently turned into "not a string".
        return False
    return True
21 
def IsNullString(value):
    """
    Check whether the string *value* denotes a missing entry.

    Surrounding whitespace and letter case are ignored; the empty string,
    ``NULL``, ``NONE`` and ``NA`` count as missing.
    """
    null_markers = ('', 'NULL', 'NONE', 'NA')
    normalized = value.strip().upper()
    return normalized in null_markers
25 
def IsScalar(value):
    """
    Tell whether *value* is a single value rather than a sequence of values.

    Strings count as scalars although they are iterable. Table columns and
    column expressions are never scalars. Everything else is a scalar exactly
    when it cannot be iterated.
    """
    if IsStringLike(value):
        return True
    if isinstance(value, (TableCol, BinaryColExpr)):
        return False
    try:
        iter(value)
    except Exception:
        # Not a bare ``except:`` so that KeyboardInterrupt and SystemExit
        # still propagate.
        return True
    return False
36 
def GuessColumnType(iterator):
    """
    Guess the column type from the values produced by *iterator*.

    Each value is converted to an upper-case string. Null strings (see
    :func:`IsNullString`) are skipped. The candidates *int*, *float* and
    *bool* are eliminated as soon as a value does not fit them.

    :returns: ``'int'``, ``'float'`` or ``'bool'`` if all non-null values fit
              that type, ``'string'`` if no candidate survives or if there
              are no non-null values at all
    """
    # built once instead of once per element
    bool_tokens = frozenset(['YES', 'NO', 'TRUE', 'FALSE'])
    possibilities = set(['bool', 'int', 'float'])
    empty = True
    for ele in iterator:
        str_ele = str(ele).upper()
        if IsNullString(str_ele):
            continue
        empty = False
        if 'int' in possibilities:
            try:
                int(str_ele)
            except ValueError:
                possibilities.remove('int')

        if 'float' in possibilities:
            try:
                float(str_ele)
            except ValueError:
                possibilities.remove('float')

        if 'bool' in possibilities and str_ele not in bool_tokens:
            possibilities.remove('bool')

        if not possibilities:
            # every candidate is ruled out; further values cannot change that
            return 'string'

    # 'int' and 'float' both surviving means every value parsed as an integer
    if len(possibilities) == 2:
        return 'int'
    if empty:
        return 'string'
    # return the last element available
    return possibilities.pop()
68 
class BinaryColExpr:
    """
    Element-wise binary operation on two operands, evaluated while iterating.

    Operands may be table columns, other expressions, arbitrary iterables or
    scalars. A scalar operand is repeated for every element of the other
    operand.

    NOTE(review): the ``class`` line is absent from the listing this block was
    taken from; the name is restored from the way the class is referenced by
    the other definitions in this file.
    """

    def __init__(self, op, lhs, rhs):
        """
        :param op: callable taking two arguments, e.g. ``operator.add``
        :param lhs: left operand (scalar or iterable)
        :param rhs: right operand (scalar or iterable)
        """
        self.op = op
        self.lhs = lhs
        self.rhs = rhs
        if IsScalar(lhs):
            # was misspelled ``itertools.cyle`` and raised AttributeError
            self.lhs = itertools.cycle([self.lhs])
        if IsScalar(rhs):
            self.rhs = itertools.cycle([self.rhs])

    def __iter__(self):
        """
        Yield ``op(l, r)`` for each pair of operand values. If either value of
        a pair is None, None is yielded for that pair.
        """
        for l, r in zip(self.lhs, self.rhs):
            if l is not None and r is not None:
                yield self.op(l, r)
            else:
                yield None

    def __add__(self, rhs):
        return BinaryColExpr(operator.add, self, rhs)

    def __sub__(self, rhs):
        return BinaryColExpr(operator.sub, self, rhs)

    def __mul__(self, rhs):
        return BinaryColExpr(operator.mul, self, rhs)

    def __div__(self, rhs):
        # mirrors TableCol.__div__ so that expressions can be divided as well
        return BinaryColExpr(operator.div, self, rhs)
92 
class TableCol:
    """
    View on a single column of a table.

    The view keeps a reference to the table and the index of the column. It
    supports iteration, ``len()``, reading and writing of single cells by row
    index, and arithmetic which produces :class:`BinaryColExpr` objects.
    """

    def __init__(self, table, col):
        """
        :param table: the table the column belongs to
        :param col: column name (looked up with ``table.GetColIndex``) or the
                    column index itself
        """
        self._table = table
        if type(col) is str:
            self.col_index = self._table.GetColIndex(col)
        else:
            self.col_index = col

    def __iter__(self):
        for record in self._table.rows:
            yield record[self.col_index]

    def __len__(self):
        all_rows = self._table.rows
        return len(all_rows)

    def __getitem__(self, index):
        record = self._table.rows[index]
        return record[self.col_index]

    def __setitem__(self, index, value):
        record = self._table.rows[index]
        record[self.col_index] = value

    def _combine(self, op, rhs):
        # all arithmetic operators build the same kind of lazy expression
        return BinaryColExpr(op, self, rhs)

    def __add__(self, rhs):
        return self._combine(operator.add, rhs)

    def __sub__(self, rhs):
        return self._combine(operator.sub, rhs)

    def __mul__(self, rhs):
        return self._combine(operator.mul, rhs)

    def __div__(self, rhs):
        return self._combine(operator.div, rhs)
124 
125 
126 class Table(object):
127  """
128 
129  The table class provides convenient access to data in tabular form. An empty
130  table can be easily constructed as follows
131 
132  .. code-block:: python
133 
134  tab=Table()
135 
136  If you want to add columns directly when creating the table, column names
137  and *column types* can be specified as follows
138 
139  .. code-block:: python
140 
141  tab=Table(['nameX','nameY','nameZ'], 'sfb')
142 
143  this will create three columns called nameX, nameY and nameZ of type string,
144  float and bool, respectively. There will be no data in the table and thus,
145  the table will not contain any rows.
146 
147  The following *column types* are supported:
148 
149  ======= ========
150  name abbrev
151  ======= ========
152  string s
153  float f
154  int i
155  bool b
156  ======= ========
157 
158  If you want to add data to the table in addition, use the following:
159 
160  .. code-block:: python
161 
162  tab=Table(['nameX','nameY','nameZ'],
163  'sfb',
164  nameX=['a','b','c'],
165  nameY=[0.1, 1.2, 3.414],
166  nameZ=[True, False, False])
167 
168  If the values for a column are left out, they will be filled with NA, but
169  if values are specified, all of them must be specified (i.e. the same
170  number of values per column).
171 
172  """
173 
174  SUPPORTED_TYPES=('int', 'float', 'bool', 'string',)
175 
176 
177  def __init__(self, col_names=None, col_types=None, **kwargs):
178  self.col_names=col_names
179  self.comment=''
180  self.name=''
181 
182  self.col_types = self._ParseColTypes(col_types)
183  self.rows=[]
184  if len(kwargs)>=0:
185  if not col_names:
186  self.col_names=[v for v in kwargs.keys()]
187  if not self.col_types:
188  self.col_types=['string' for u in range(len(self.col_names))]
189  if len(kwargs)>0:
190  self._AddRowsFromDict(kwargs)
191 
@staticmethod
def _ParseColTypes(types, exp_num=None):
    """
    Convert a column type specification into a list of long type names.

    Accepted forms are

    - a single long or short type name (``'float'``, ``'f'``)
    - a comma separated string of long or short names (``'string,f,int'``)
    - a string of short names (``'sfi'``)
    - an iterable of long or short names (``['string', 'f']``)

    Names are case-insensitive.

    :param types: the type specification, or None
    :param exp_num: if given, the number of types that must result

    :returns: list of long type names, or None if *types* is None

    :raises: :class:`ValueError` for unknown type names, for specifications
             that are neither strings nor iterables of strings, and if the
             number of types differs from *exp_num*
    """
    if types is None:
        return None

    short2long = {'s' : 'string', 'i': 'int', 'b' : 'bool', 'f' : 'float'}
    allowed_short = short2long.keys()
    allowed_long = short2long.values()

    def _long_name(t):
        # map one long or short name onto the long name
        if t in allowed_long:
            return t
        if t in allowed_short:
            return short2long[t]
        raise ValueError('Unknown type %s in types %s' % (t, types))

    type_list = []

    if IsScalar(types):
        if type(types) != str:
            raise ValueError('Col type %s must be string or list' % types)
        types = types.lower()

        if types in allowed_long or types in allowed_short:
            # single value
            type_list.append(_long_name(types))
        elif types.find(',') != -1:
            # comma separated list of long or short types; blanks around the
            # separators are tolerated
            for t in types.split(','):
                type_list.append(_long_name(t.strip()))
        else:
            # string of short types
            for t in types:
                type_list.append(_long_name(t))
    else:
        for t in types:
            if type(t) != str:
                # wrap in a tuple so that a tuple of types formats correctly
                raise ValueError('Col type %s must be string or list'
                                 % (types,))
            type_list.append(_long_name(t.lower()))

    if exp_num:
        if len(type_list) != exp_num:
            # The literals are joined before formatting. Previously '%' bound
            # to the second literal only and the raise failed with TypeError.
            raise ValueError('Parsed number of col types (%i) differs from '
                             'expected (%i) in types %s'
                             % (len(type_list), exp_num, types))

    return type_list
259 
260  def SetName(self, name):
261  '''
262  Set name of the table
263  :param name: name
264  :type name: :class:`str`
265  '''
266  self.name = name
267 
268  def GetName(self):
269  '''
270  Get name of table
271  '''
272  return self.name
273 
274  def _Coerce(self, value, ty):
275  '''
276  Try to convert values (e.g. from :class:`str` type) to the specified type
277 
278  :param value: the value
279  :type value: any type
280 
281  :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
282  *bool*)
283  :type ty: :class:`str`
284  '''
285  if value=='NA' or value==None:
286  return None
287  if ty=='int':
288  return int(value)
289  if ty=='float':
290  return float(value)
291  if ty=='string':
292  return str(value)
293  if ty=='bool':
294  if isinstance(value, str) or isinstance(value, unicode):
295  if value.upper() in ('FALSE', 'NO',):
296  return False
297  return True
298  return bool(value)
299  raise ValueError('Unknown type %s' % ty)
300 
301  def GetColIndex(self, col):
302  '''
303  Returns the column index for the column with the given name.
304 
305  :raises: ValueError if no column with the name is found
306  '''
307  if col not in self.col_names:
308  raise ValueError('Table has no column named "%s"' % col)
309  return self.col_names.index(col)
310 
311  def GetColNames(self):
312  '''
313  Returns a list containing all column names.
314  '''
315  return self.col_names
316 
317  def HasCol(self, col):
318  '''
319  Checks if the column with a given name is present in the table.
320  '''
321  return col in self.col_names
322 
def __getitem__(self, k):
    '''
    Return a :class:`TableCol` for the column given by name or by index.

    An integer index is first translated into the column name at that
    position; the column is then looked up by name.
    '''
    column = self.col_names[k] if type(k) is int else k
    return TableCol(self, column)
328 
def __setitem__(self, k, value):
    '''
    Overwrite the column given by name or index.

    A scalar *value* is written into every row. An iterable is written row by
    row, stopping at whichever of rows and values ends first.
    '''
    col_index = k if type(k) is int else self.GetColIndex(k)
    values = itertools.cycle([value]) if IsScalar(value) else value
    for target_row, item in zip(self.rows, values):
        target_row[col_index] = item
337 
338  def ToString(self, float_format='%.3f', int_format='%d', rows=None):
339  '''
340  Convert the table into a string representation.
341 
342  The output format can be modified for int and float type columns by
343  specifying a formatting string for the parameters 'float_format' and
344  'int_format'.
345 
346  The option 'rows' specify the range of rows to be printed. The parameter
347  must be a type that supports indexing (e.g. a :class:`list`) containing the
348  start and end row *index*, e.g. [start_row_idx, end_row_idx].
349 
350  :param float_format: formatting string for float columns
351  :type float_format: :class:`str`
352 
353  :param int_format: formatting string for int columns
354  :type int_format: :class:`str`
355 
356  :param rows: iterable containing start and end row *index*
357  :type rows: iterable containing :class:`ints <int>`
358  '''
359  widths=[len(cn) for cn in self.col_names]
360  sel_rows=self.rows
361  if rows:
362  sel_rows=self.rows[rows[0]:rows[1]]
363  for row in sel_rows:
364  for i, (ty, col) in enumerate(zip(self.col_types, row)):
365  if col==None:
366  widths[i]=max(widths[i], len('NA'))
367  elif ty=='float':
368  widths[i]=max(widths[i], len(float_format % col))
369  elif ty=='int':
370  widths[i]=max(widths[i], len(int_format % col))
371  else:
372  widths[i]=max(widths[i], len(str(col)))
373  s=''
374  if self.comment:
375  s+=''.join(['# %s\n' % l for l in self.comment.split('\n')])
376  total_width=sum(widths)+2*len(widths)
377  for width, col_name in zip(widths, self.col_names):
378  s+=col_name.center(width+2)
379  s+='\n%s\n' % ('-'*total_width)
380  for row in sel_rows:
381  for width, ty, col in zip(widths, self.col_types, row):
382  cs=''
383  if col==None:
384  cs='NA'.center(width+2)
385  elif ty=='float':
386  cs=(float_format % col).rjust(width+2)
387  elif ty=='int':
388  cs=(int_format % col).rjust(width+2)
389  else:
390  cs=' '+str(col).ljust(width+1)
391  s+=cs
392  s+='\n'
393  return s
394 
395  def __str__(self):
396  return self.ToString()
397 
def _AddRowsFromDict(self, d, overwrite=None):
    '''
    Add one or more rows from a :class:`dictionary <dict>`.

    The keys of the dictionary are column names; the values are either
    scalars (one row is added) or list-like objects of equal length (one row
    per item is added). Columns not present in the dictionary are set to
    None. The dictionary itself is not modified.

    If *overwrite* is not None and set to an existing column name, the
    specified column in the table is searched for the first occurrence of a
    value matching the value of the column with the same name in the
    dictionary. If a matching value is found, the row is overwritten with the
    dictionary; cells for which the dictionary has no value keep their old
    content. If no matching row is found, a new row is appended to the table.

    :param d: dictionary containing the data
    :type d: :class:`dict`

    :param overwrite: column name to overwrite existing row if value in
                      column *overwrite* matches
    :type overwrite: :class:`str`

    :raises: :class:`ValueError` if multiple rows are added but the number of
             data items is different for different columns, or if a column
             name is unknown.
    '''
    # Collect column indices and values side by side. Scalars are wrapped in
    # a local list instead of being written back into the caller's dict.
    idxs = []
    columns = []
    num_items = None
    for key, values in d.items():
        idxs.append(self.GetColIndex(key))
        if IsScalar(values):
            values = [values]
        # 'is None' rather than 'not num_items': a first column of length 0
        # must still be compared against the remaining columns
        if num_items is None:
            num_items = len(values)
        elif num_items != len(values):
            raise ValueError("Cannot add rows: length of data must be equal "
                             "for all columns in %s" % str(d))
        columns.append(values)

    # the column to match on is the same for every row
    overwrite_idx = None
    if overwrite:
        overwrite_idx = self.GetColIndex(overwrite)

    # convert column based data to rows and add them
    for data in zip(*columns):
        new_row = [None for a in range(len(self.col_names))]
        for idx, v in zip(idxs, data):
            new_row[idx] = self._Coerce(v, self.col_types[idx])

        # partially overwrite existing row with new data
        replaced = False
        if overwrite:
            for i, r in enumerate(self.rows):
                if r[overwrite_idx] == new_row[overwrite_idx]:
                    for j, e in enumerate(r):
                        if new_row[j] is None:
                            new_row[j] = e
                    self.rows[i] = new_row
                    replaced = True
                    break

        # no overwrite requested, or no matching row found
        if not replaced:
            self.rows.append(new_row)
455 
456 
457  def AddRow(self, data, overwrite=None):
458  """
459  Add a row to the table.
460 
461  *data* may either be a dictionary or a list-like object:
462 
463  - If *data* is a dictionary the keys in the dictionary must match the
464  column names. Columns not found in the dict will be initialized to None.
465  If the dict contains list-like objects, multiple rows will be added, if
466  the number of items in all list-like objects is the same, otherwise a
467  :class:`ValueError` is raised.
468 
469  - If *data* is a list-like object, the row is initialized from the values
470  in *data*. The number of items in *data* must match the number of
471  columns in the table. A :class:`ValuerError` is raised otherwise. The
472  values are added in the order specified in the list, thus, the order of
473  the data must match the columns.
474 
475  If *overwrite* is not None and set to an existing column name, the specified
476  column in the table is searched for the first occurrence of a value matching
477  the value of the column with the same name in the dictionary. If a matching
478  value is found, the row is overwritten with the dictionary. If no matching
479  row is found, a new row is appended to the table.
480 
481  :param data: data to add
482  :type data: :class:`dict` or *list-like* object
483 
484  :param overwrite: column name to overwrite existing row if value in
485  column *overwrite* matches
486  :type overwrite: :class:`str`
487 
488  :raises: :class:`ValueError` if *list-like* object is used and number of
489  items does *not* match number of columns in table.
490 
491  :raises: :class:`ValueError` if *dict* is used and multiple rows are added
492  but the number of data items is different for different columns.
493 
494  **Example:** add multiple data rows to a subset of columns using a dictionary
495 
496  .. code-block:: python
497 
498  # create table with three float columns
499  tab = Table(['x','y','z'], 'fff')
500 
501  # add rows from dict
502  data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
503  tab.AddRow(data)
504  print tab
505 
506  '''
507  will produce the table
508 
509  ==== ==== ====
510  x y z
511  ==== ==== ====
512  1.20 NA 1.60
513  1.60 NA 5.30
514  ==== ==== ====
515  '''
516 
517  # overwrite the row with x=1.2 and add row with x=1.9
518  data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
519  tab.AddRow(data, overwrite='x')
520  print tab
521 
522  '''
523  will produce the table
524 
525  ==== ==== ====
526  x y z
527  ==== ==== ====
528  1.20 NA 7.90
529  1.60 NA 5.30
530  1.90 NA 3.50
531  ==== ==== ====
532  '''
533  """
534  if type(data)==dict:
535  self._AddRowsFromDict(data, overwrite)
536  else:
537  if len(data)!=len(self.col_names):
538  print data, self.col_names
539  msg='data array must have %d elements, not %d'
540  raise ValueError(msg % (len(self.col_names), len(data)))
541  new_row = [self._Coerce(v, t) for v, t in zip(data, self.col_types)]
542 
543  # fully overwrite existing row with new data
544  if overwrite:
545  overwrite_idx = self.GetColIndex(overwrite)
546  added = False
547  for i,r in enumerate(self.rows):
548  if r[overwrite_idx]==new_row[overwrite_idx]:
549  self.rows[i] = new_row
550  added = True
551  break
552 
553  # if not overwrite or overwrite did not find appropriate row
554  if not overwrite or not added:
555  self.rows.append(new_row)
556 
557  def RemoveCol(self, col):
558  """
559  Remove column with the given name from the table
560 
561  :param col: name of column to remove
562  :type col: :class:`str`
563  """
564  idx = self.GetColIndex(col)
565  del self.col_names[idx]
566  del self.col_types[idx]
567  for row in self.rows:
568  del row[idx]
569 
def AddCol(self, col_name, col_type, data=None):
    """
    Add a column to the right of the table.

    :param col_name: name of new column
    :type col_name: :class:`str`

    :param col_type: type of new column (long versions: *int*, *float*,
                     *bool*, *string* or short versions: *i*, *f*, *b*, *s*)
    :type col_type: :class:`str`

    :param data: data to add to new column.
    :type data: scalar or iterable

    **Example:**

    .. code-block:: python

      tab=Table(['x'], 'f', x=range(5))
      tab.AddCol('even', 'bool', itertools.cycle([True, False]))
      print tab

      '''
      will produce the table

      ====  ====
      x     even
      ====  ====
        0   True
        1   False
        2   True
        3   False
        4   True
      ====  ====
      '''

    If data is a constant instead of an iterable object, its value
    will be written into each row:

    .. code-block:: python

      tab=Table(['x'], 'f', x=range(5))
      tab.AddCol('num', 'i', 1)
      print tab

      '''
      will produce the table

      ====  ====
      x     num
      ====  ====
        0   1
        1   1
        2   1
        3   1
        4   1
      ====  ====
      '''

    If an iterable has fewer items than the table has rows, the remaining
    rows get None (shown as NA) in the new column. Surplus items are ignored.

    .. warning::

      :meth:`AddCol` only adds data to existing rows and does *not*
      add new rows. Use :meth:`AddRow` to do this. Therefore, the following
      code snippet does not add any data items:

    .. code-block:: python

      tab=Table()
      tab.AddCol('even', 'int', [1,2,3,4,5])
      print tab

      '''
      will produce the empty table

      ====
      even
      ====
      '''

    """
    col_type = self._ParseColTypes(col_type, exp_num=1)[0]
    self.col_names.append(col_name)
    self.col_types.append(col_type)
    if IsScalar(data):
        for row in self.rows:
            row.append(data)
    else:
        # Every row must gain exactly one cell. Stopping at the end of a short
        # iterable would leave the remaining rows one cell too short.
        values = iter(data)
        for row in self.rows:
            row.append(next(values, None))
659 
def Filter(self, *args, **kwargs):
    """
    Returns a filtered table only containing rows matching all the predicates
    in kwargs and args. For example,

    .. code-block:: python

      tab.Filter(town='Basel')

    will return all the rows where the value of the column "town" is equal to
    "Basel". Several predicates may be combined, i.e.

    .. code-block:: python

      tab.Filter(town='Basel', male=True)

    will return the rows with "town" equal to "Basel" and "male" equal to
    true. args are unary callables which receive the row (a list of values)
    and return true if the row should be included in the result and false if
    not.

    :raises: ValueError (from ``GetColIndex``) if a keyword is not a column
             name
    """
    # Copies, so that changing the columns of the result cannot change the
    # columns of this table.
    filt_tab = Table(list(self.col_names), list(self.col_types))
    # resolve the column of each keyword predicate once, not once per row
    conditions = [(self.GetColIndex(key), val) for key, val in kwargs.items()]
    for row in self.rows:
        if not all(func(row) for func in args):
            continue
        if any(row[idx] != val for idx, val in conditions):
            continue
        filt_tab.AddRow(row)
    return filt_tab
694 
@staticmethod
def _LoadOST(stream_or_filename):
    """
    Read a table in ost format from a stream or from the file with the given
    name.

    The first line that is neither empty nor a comment (starting with ``#``)
    is the header, made of whitespace separated ``name[type]`` items; columns
    without a type are of type *string*. Every following such line is one
    row; values are separated by blanks and may be enclosed in double quotes.

    A file opened by this function is closed again; a stream passed in by the
    caller is left open.

    :raises: IOError if there is no header line
    """
    fieldname_pattern = re.compile(r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
    values_pattern = re.compile("([^\" ]+|\"[^\"]*\")+")
    opened_here = not hasattr(stream_or_filename, 'read')
    if opened_here:
        stream = open(stream_or_filename, 'r')
    else:
        stream = stream_or_filename
    tab = None
    try:
        for line in stream:
            line = line.strip()
            if len(line) == 0 or line.startswith('#'):
                continue
            if tab is None:
                # header line
                fieldnames = []
                fieldtypes = []
                for col in line.split():
                    match = fieldname_pattern.match(col)
                    if match:
                        if match.group('type'):
                            fieldtypes.append(match.group('type'))
                        else:
                            fieldtypes.append('string')
                        fieldnames.append(match.group('name'))
                tab = Table(fieldnames, fieldtypes)
                continue
            tab.AddRow([x.strip('"') for x in values_pattern.findall(line)])
    finally:
        if opened_here:
            stream.close()
    if tab is None:
        raise IOError("Cannot read table from empty stream")
    return tab
730 
def _GuessColumnTypes(self):
    """
    Replace the type of every column by the type guessed from its values and
    convert all cells to the guessed types.
    """
    for col_idx, col_name in enumerate(self.col_names):
        self.col_types[col_idx] = GuessColumnType(self[col_name])
    for record in self.rows:
        for idx, cell in enumerate(record):
            record[idx] = self._Coerce(cell, self.col_types[idx])
737 
@staticmethod
def _LoadCSV(stream_or_filename, sep):
    """
    Read a table from a stream or file of character separated values.

    The first record gives the column names. All values are read as strings;
    afterwards the column types are guessed and the values converted.

    A file opened by this function is closed again; a stream passed in by the
    caller is left open.

    :param sep: the delimiter between values

    :raises: IOError if there is not even a header record
    """
    opened_here = not hasattr(stream_or_filename, 'read')
    if opened_here:
        stream = open(stream_or_filename, 'r')
    else:
        stream = stream_or_filename
    tab = None
    try:
        reader = csv.reader(stream, delimiter=sep)
        for row in reader:
            if tab is None:
                # header record: one string column per name
                tab = Table(row, 's' * len(row))
            else:
                tab.AddRow(row)
    finally:
        if opened_here:
            stream.close()
    if tab is None:
        raise IOError('trying to load table from empty CSV stream/file')

    tab._GuessColumnTypes()
    return tab
759 
@staticmethod
def _LoadPickle(stream_or_filename):
    """
    Deserialize an object from a pickled byte stream or from the file with
    the given name and return it.

    A file opened by this function is closed again; a stream passed in by the
    caller is left open.
    """
    # NOTE(review): unpickling can execute arbitrary code. Only load pickles
    # from trusted sources.
    if hasattr(stream_or_filename, 'read'):
        return cPickle.load(stream_or_filename)
    stream = open(stream_or_filename, 'rb')
    try:
        return cPickle.load(stream)
    finally:
        stream.close()
767 
768  @staticmethod
769  def _GuessFormat(filename):
770  try:
771  filename = filename.name
772  except AttributeError, e:
773  pass
774  if filename.endswith('.csv'):
775  return 'csv'
776  elif filename.endswith('.pickle'):
777  return 'pickle'
778  else:
779  return 'ost'
780 
781 
782  @staticmethod
783  def Load(stream_or_filename, format='auto', sep=','):
784  """
785  Load table from stream or file with given name.
786 
787  By default, the file format is set to *auto*, which tries to guess the file
788  format from the file extension. The following file extensions are
789  recognized:
790 
791  ============ ======================
792  extension recognized format
793  ============ ======================
794  .csv comma separated values
795  .pickle pickled byte stream
796  <all others> ost-specific format
797  ============ ======================
798 
799  Thus, *format* must be specified for reading file with different filename
800  extensions.
801 
802  The following file formats are understood:
803 
804  - ost
805 
806  This is an ost-specific, but still human readable file format. The file
807  (stream) must start with header line of the form
808 
809  col_name1[type1] <col_name2[type2]>...
810 
811  The types given in brackets must be one of the data types the
812  :class:`Table` class understands. Each following line in the file then must
813  contains exactly the same number of data items as listed in the header. The
814  data items are automatically converted to the column format. Lines starting
815  with a '#' and empty lines are ignored.
816 
817  - pickle
818 
819  Deserializes the table from a pickled byte stream
820 
821  - csv
822 
823  Reads the table from comma separated values stream. Since there is no
824  explicit type information in the csv file, the column types are guessed,
825  using the following simple rules:
826 
827  * if all values are either NA/NULL/NONE the type is set to string
828  * if all non-null values are convertible to float/int the type is set to
829  float/int
830  * if all non-null values are true/false/yes/no, the value is set to bool
831  * for all other cases, the column type is set to string
832 
833  :returns: A new :class:`Table` instance
834  """
835  format=format.lower()
836  if format=='auto':
837  format = Table._GuessFormat(stream_or_filename)
838 
839  if format=='ost':
840  return Table._LoadOST(stream_or_filename)
841  if format=='csv':
842  return Table._LoadCSV(stream_or_filename, sep=sep)
843  if format=='pickle':
844  return Table._LoadPickle(stream_or_filename)
845  raise ValueError('unknown format ""' % format)
846 
847  def Sort(self, by, order='+'):
848  """
849  Performs an in-place sort of the table, based on column *by*.
850 
851  :param by: column name by which to sort
852  :type by: :class:`str`
853 
854  :param order: ascending (``-``) or descending (``+``) order
855  :type order: :class:`str` (i.e. *+*, *-*)
856  """
857  sign=-1
858  if order=='-':
859  sign=1
860  key_index=self.GetColIndex(by)
861  def _key_cmp(lhs, rhs):
862  return sign*cmp(lhs[key_index], rhs[key_index])
863  self.rows=sorted(self.rows, _key_cmp)
864 
865  def GetUnique(self, col, ignore_nan=True):
866  """
867  Extract a list of all unique values from one column
868 
869  :param col: column name
870  :type col: :class:`str`
871 
872  :param ignore_nan: ignore all *None* values
873  :type ignore_nan: :class:`bool`
874  """
875  idx = self.GetColIndex(col)
876  seen = {}
877  result = []
878  for row in self.rows:
879  item = row[idx]
880  if item!=None or ignore_nan==False:
881  if item in seen: continue
882  seen[item] = 1
883  result.append(item)
884  return result
885 
def Zip(self, *args):
    """
    Iterate over several columns in parallel, e.g.

    .. code-block:: python

      tab=Table.Load('...')
      for col1, col2 in tab.Zip('col1', 'col2'):
        print col1, col2

    is a shortcut for

    .. code-block:: python

      tab=Table.Load('...')
      for col1, col2 in zip(tab['col1'], tab['col2']):
        print col1, col2
    """
    selected_columns = [self[arg] for arg in args]
    return zip(*selected_columns)
905 
def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
         z_title=None, x_range=None, y_range=None, z_range=None,
         color=None, plot_if=None, legend=None,
         num_z_levels=10, diag_line=False, labels=None, max_num_labels=None,
         title=None, clear=True, save=False, **kwargs):
    r"""
    Function to plot values from your table in 1, 2 or 3 dimensions using
    `Matplotlib <http://matplotlib.sourceforge.net>`__

    With only *x* given, the values of column *x* are plotted against their
    position. With *x* and *y*, *y* is plotted against *x*. With *x*, *y* and
    *z*, a filled contour plot of *z* over *x* and *y* is drawn. Rows in which
    one of the plotted columns is None are skipped.

    :param x: column name for first dimension
    :type x: :class:`str`

    :param y: column name for second dimension
    :type y: :class:`str`

    :param z: column name for third dimension
    :type z: :class:`str`

    :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\**). For a
                  complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
    :type style: :class:`str`

    :param x_title: title for first dimension, if not specified it is
                    automatically derived from column name
    :type x_title: :class:`str`

    :param y_title: title for second dimension, if not specified it is
                    automatically derived from column name
    :type y_title: :class:`str`

    :param z_title: title for third dimension, if not specified it is
                    automatically derived from column name
    :type z_title: :class:`str`

    :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
    :type x_range: :class:`list` of length two

    :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
    :type y_range: :class:`list` of length two

    :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
    :type z_range: :class:`list` of length two

    :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
                  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
    :type color: :class:`str`

    :param plot_if: callable which returns *True* if row should be plotted. Is
                    invoked like ``plot_if(self, row)``
    :type plot_if: callable

    :param legend: legend label for data series
    :type legend: :class:`str`

    :param num_z_levels: number of levels for third dimension
    :type num_z_levels: :class:`int`

    :param diag_line: draw diagonal line
    :type diag_line: :class:`bool`

    :param labels: column name containing labels to put on x-axis for one
                   dimensional plot
    :type labels: :class:`str`

    :param max_num_labels: limit maximum number of labels
    :type max_num_labels: :class:`int`

    :param title: plot title, if not specified it is automatically derived from
                  plotted column names
    :type title: :class:`str`

    :param clear: clear old data from plot
    :type clear: :class:`bool`

    :param save: filename for saving plot
    :type save: :class:`str`

    :param \*\*kwargs: additional arguments passed to matplotlib

    :returns: the ``matplotlib.pyplot`` module

    :raises: :class:`ValueError` if one of the ranges does not contain exactly
             two elements; ImportError if matplotlib or numpy is missing

    **Examples:** simple plotting functions

    .. code-block:: python

      tab=Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
                                          b=[x/2.0 for x in range(1,6)],
                                          c=[math.cos(x) for x in range(0,5)],
                                          d=range(3,8))

      # one dimensional plot of column 'd' vs. index
      plt=tab.Plot('d')
      plt.show()

      # two dimensional plot of 'a' vs. 'c'
      plt=tab.Plot('a', y='c', style='o-')
      plt.show()

      # three dimensional plot of 'a' vs. 'c' with values 'b'
      plt=tab.Plot('a', y='c', z='b')
      # manually save plot to file
      plt.savefig("plot.png")
    """
    try:
        import matplotlib.pyplot as plt
        import matplotlib.mlab as mlab
        import numpy as np
        idx1 = self.GetColIndex(x)
        xs = []
        ys = []
        zs = []

        if clear:
            plt.figure(figsize=[8, 6])

        # axis titles: explicit title wins, otherwise derived from the name
        if x_title:
            nice_x = x_title
        else:
            nice_x = MakeTitle(x)

        if y_title:
            nice_y = y_title
        else:
            if y:
                nice_y = MakeTitle(y)
            else:
                nice_y = None

        if z_title:
            nice_z = z_title
        else:
            if z:
                nice_z = MakeTitle(z)
            else:
                nice_z = None

        if x_range and (IsScalar(x_range) or len(x_range) != 2):
            raise ValueError('parameter x_range must contain exactly two elements')
        if y_range and (IsScalar(y_range) or len(y_range) != 2):
            raise ValueError('parameter y_range must contain exactly two elements')
        if z_range and (IsScalar(z_range) or len(z_range) != 2):
            raise ValueError('parameter z_range must contain exactly two elements')

        if color:
            kwargs['color'] = color
        if legend:
            kwargs['label'] = legend
        if y and z:
            # three dimensions: contour plot of z over x and y
            idx3 = self.GetColIndex(z)
            idx2 = self.GetColIndex(y)
            for row in self.rows:
                if row[idx1] != None and row[idx2] != None and row[idx3] != None:
                    if plot_if and not plot_if(self, row):
                        continue
                    xs.append(row[idx1])
                    ys.append(row[idx2])
                    zs.append(row[idx3])
            levels = []
            # float() keeps the spacing from being truncated by integer
            # division when the range or the column values are integers
            if z_range:
                z_spacing = float(z_range[1] - z_range[0]) / num_z_levels
                l = z_range[0]
            else:
                l = self.Min(z)
                z_spacing = float(self.Max(z) - l) / num_z_levels

            for i in range(0, num_z_levels + 1):
                levels.append(l)
                l += z_spacing

            xi = np.linspace(min(xs) - 0.1, max(xs) + 0.1, len(xs) * 10)
            yi = np.linspace(min(ys) - 0.1, max(ys) + 0.1, len(ys) * 10)
            zi = mlab.griddata(xs, ys, zs, xi, yi)

            plt.contour(xi, yi, zi, levels, linewidths=0.5, colors='k')
            plt.contourf(xi, yi, zi, levels, cmap=plt.cm.jet)
            plt.colorbar(ticks=levels)

        elif y:
            # two dimensions: y against x
            idx2 = self.GetColIndex(y)
            for row in self.rows:
                if row[idx1] != None and row[idx2] != None:
                    if plot_if and not plot_if(self, row):
                        continue
                    xs.append(row[idx1])
                    ys.append(row[idx2])
            plt.plot(xs, ys, style, **kwargs)

        else:
            # one dimension: values against their position
            label_vals = []

            if labels:
                label_idx = self.GetColIndex(labels)
            for row in self.rows:
                if row[idx1] != None:
                    if plot_if and not plot_if(self, row):
                        continue
                    xs.append(row[idx1])
                    if labels:
                        label_vals.append(row[label_idx])
            plt.plot(xs, style, **kwargs)
            if labels:
                interval = 1
                if max_num_labels:
                    if len(label_vals) > max_num_labels:
                        # show only every interval-th label
                        interval = int(math.ceil(float(len(label_vals)) / max_num_labels))
                        label_vals = label_vals[::interval]
                plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
                           size='x-small')

        if title is None:
            if nice_z:
                title = '%s of %s vs. %s' % (nice_z, nice_x, nice_y)
            elif nice_y:
                title = '%s vs. %s' % (nice_x, nice_y)
            else:
                title = nice_x

        plt.title(title, size='x-large', fontweight='bold',
                  verticalalignment='bottom')

        if legend:
            plt.legend(loc=0)

        if x and y:
            plt.xlabel(nice_x, size='x-large')
            if x_range:
                plt.xlim(x_range[0], x_range[1])
            if y_range:
                plt.ylim(y_range[0], y_range[1])
            if diag_line:
                plt.plot(x_range, y_range, '-')

            plt.ylabel(nice_y, size='x-large')
        else:
            if y_range:
                plt.ylim(y_range[0], y_range[1])
            if x_title:
                plt.xlabel(x_title, size='x-large')
            plt.ylabel(nice_y, size='x-large')
        if save:
            plt.savefig(save)
        return plt
    except ImportError:
        LogError("Function needs numpy and matplotlib, but I could not import it.")
        raise
1151 
def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
                  histtype='stepfilled', align='mid', x_title=None,
                  y_title=None, title=None, clear=True, save=False):
  """
  Draw a histogram of the values stored in column *col* using Matplotlib.
  Cells containing None are left out.

  :param col: name of the column holding the data
  :type col: :class:`str`

  :param x_range: lower and upper limit of the histogram
                  (e.g. [start_x, end_x]), forwarded as ``range``
  :type x_range: :class:`list` of length two

  :param num_bins: number of bins, forwarded as ``bins``
  :type num_bins: :class:`int`

  :param normed: normalize the histogram, forwarded as ``normed``
  :type normed: :class:`bool`

  :param histtype: type of histogram (e.g. *bar*, *barstacked*, *step*,
                   *stepfilled*), forwarded to ``pyplot.hist``
  :type histtype: :class:`str`

  :param align: bar alignment (*left*, *mid*, *right*), forwarded to
                ``pyplot.hist``
  :type align: :class:`str`

  :param x_title: label of the x axis; derived from the column name if not
                  given
  :type x_title: :class:`str`

  :param y_title: label of the y axis; "bin count" if not given
  :type y_title: :class:`str`

  :param title: plot title; "Histogram of <x label>" if not given
  :type title: :class:`str`

  :param clear: clear the current figure before plotting
  :type clear: :class:`bool`

  :param save: filename to save the plot to (nothing is saved if false)
  :type save: :class:`str`

  :returns: the ``matplotlib.pyplot`` module, or None if the table has no
            rows

  :raises: :class:`ImportError` if matplotlib or numpy cannot be imported

  **Example:**

  .. code-block:: python

    tab=Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])

    # histogram of the values in column 'a'
    plt=tab.PlotHistogram('a')
    plt.show()

  """
  try:
    import matplotlib.pyplot as plt
    import numpy as np

    if len(self.rows) == 0:
      return None

    col_idx = self.GetColIndex(col)
    values = [row[col_idx] for row in self.rows if row[col_idx] is not None]

    if clear:
      plt.clf()

    # NOTE(review): newer matplotlib releases replaced `normed` by `density`;
    # confirm which matplotlib versions have to be supported.
    plt.hist(values, bins=num_bins, range=x_range, normed=normed,
             histtype=histtype, align=align)

    x_label = x_title if x_title else MakeTitle(col)
    plt.xlabel(x_label, size='x-large')

    y_label = y_title if y_title else "bin count"
    plt.ylabel(y_label, size='x-large')

    plot_title = title if title else "Histogram of %s" % x_label
    plt.title(plot_title, size='x-large', fontweight='bold')

    if save:
      plt.savefig(save)
    return plt
  except ImportError:
    LogError("Function needs numpy and matplotlib, but I could not import it.")
    raise
1251 
1252  def _Max(self, col):
1253  if len(self.rows)==0:
1254  return None, None
1255  idx = self.GetColIndex(col)
1256  col_type = self.col_types[idx]
1257  if col_type=='int' or col_type=='float':
1258  max_val = -float('inf')
1259  elif col_type=='bool':
1260  max_val = False
1261  elif col_type=='string':
1262  max_val = chr(0)
1263  max_idx = None
1264  for i in range(0, len(self.rows)):
1265  if self.rows[i][idx]>max_val:
1266  max_val = self.rows[i][idx]
1267  max_idx = i
1268  return max_val, max_idx
1269 
def MaxRow(self, col):
  """
  Returns the row containing the cell with the maximal value in col. If
  several rows have the highest value, only the first one is returned.
  None values are ignored.

  :param col: column name
  :type col: :class:`str`

  :returns: the row, or None if no maximum could be determined (e.g. the
            table has no rows)
  """
  val, idx = self._Max(col)
  # _Max reports index None when there is nothing to pick; indexing the row
  # list with None would raise a TypeError
  if idx is None:
    return None
  return self.rows[idx]
1281 
def Max(self, col):
  """
  Returns the maximum value found in col, i.e. the value part of the
  ``(value, row index)`` pair computed by :meth:`_Max`.

  :param col: column name
  :type col: :class:`str`
  """
  max_val, _ = self._Max(col)
  return max_val
1292 
def MaxIdx(self, col):
  """
  Returns the row index of the cell with the maximal value in col, i.e. the
  index part of the ``(value, row index)`` pair computed by :meth:`_Max`.

  :param col: column name
  :type col: :class:`str`
  """
  _, max_idx = self._Max(col)
  return max_idx
1304 
1305  def _Min(self, col):
1306  if len(self.rows)==0:
1307  return None, None
1308  idx=self.GetColIndex(col)
1309  col_type = self.col_types[idx]
1310  if col_type=='int' or col_type=='float':
1311  min_val=float('inf')
1312  elif col_type=='bool':
1313  min_val=True
1314  elif col_type=='string':
1315  min_val=chr(255)
1316  min_idx=None
1317  for i,row in enumerate(self.rows):
1318  if row[idx]!=None and row[idx]<min_val:
1319  min_val=row[idx]
1320  min_idx=i
1321  return min_val, min_idx
1322 
def Min(self, col):
  """
  Returns the minimal value found in col, i.e. the value part of the
  ``(value, row index)`` pair computed by :meth:`_Min`.

  :param col: column name
  :type col: :class:`str`
  """
  min_val, _ = self._Min(col)
  return min_val
1333 
def MinRow(self, col):
  """
  Returns the row containing the cell with the minimal value in col. If
  several rows have the lowest value, only the first one is returned.
  None values are ignored.

  :param col: column name
  :type col: :class:`str`

  :returns: the row, or None if no minimum could be determined (e.g. the
            table has no rows)
  """
  val, idx = self._Min(col)
  # _Min reports index None when there is nothing to pick; indexing the row
  # list with None would raise a TypeError
  if idx is None:
    return None
  return self.rows[idx]
1345 
def MinIdx(self, col):
  """
  Returns the row index of the cell with the minimal value in col, i.e. the
  index part of the ``(value, row index)`` pair computed by :meth:`_Min`.

  :param col: column name
  :type col: :class:`str`
  """
  _, min_idx = self._Min(col)
  return min_idx
1357 
def Sum(self, col):
  """
  Adds up all cells of the given column that are not None. The result is a
  float and equals 0.0 if there is nothing to add. The column must be of
  type 'float', 'int' or 'bool'.

  :param col: column name
  :type col: :class:`str`

  :raises: :class:`TypeError` if the column has any other type (e.g.
           ``string``)
  """
  idx = self.GetColIndex(col)
  if self.col_types[idx] not in ('int', 'float', 'bool'):
    raise TypeError("Sum can only be used on numeric column types")
  # start value 0.0 makes the result a float even for int/bool columns
  return sum((row[idx] for row in self.rows if row[idx] is not None), 0.0)
1378 
def Mean(self, col):
  """
  Returns the mean of the given column, computed by ``stutil.Mean`` over all
  cells that are not None. Returns None if ``stutil.Mean`` fails, which is
  expected when the column doesn't contain any values. Col must be of
  numeric ('float', 'int') or boolean column type.

  If column type is *bool*, the function returns the ratio of
  number of 'Trues' by total number of elements.

  :param col: column name
  :type col: :class:`str`

  :raises: :class:`TypeError` if column type is ``string``
  """
  idx = self.GetColIndex(col)
  if self.col_types[idx] not in ('int', 'float', 'bool'):
    raise TypeError("Mean can only be used on numeric or bool column types")

  vals = [v for v in self[col] if v is not None]
  try:
    return stutil.Mean(vals)
  except Exception:
    # best effort: a failing mean is reported as None; KeyboardInterrupt
    # and SystemExit are no longer swallowed
    return None
1406 
def RowMean(self, mean_col_name, cols):
  """
  Adds a new column of type 'float' with a specified name (*mean_col_name*),
  containing the mean of all specified columns for each row.

  Cols are specified by their names and must be of numeric column
  type ('float', 'int') or boolean column type. Cells with None are ignored.
  None is added for rows whose mean cannot be computed by ``stutil.Mean``
  (expected for rows without any value in the selected columns).

  :param mean_col_name: name of new column containing mean values
  :type mean_col_name: :class:`str`

  :param cols: name or list of names of columns to include in computation of
               mean
  :type cols: :class:`str` or :class:`list` of strings

  :raises: :class:`TypeError` if column type of columns in *cols* is
           ``string``

  **Example:**

  Starting with the following table:

  ==== ==== ====
  x    y    u
  ==== ==== ====
  1    10   100
  2    15   None
  3    20   400
  ==== ==== ====

  the code here adds a column with the name 'mean' to yield the table below:

  .. code-block:: python

    tab.RowMean('mean', ['x', 'u'])


  ==== ==== ==== =====
  x    y    u    mean
  ==== ==== ==== =====
  1    10   100  50.5
  2    15   None 2
  3    20   400  201.5
  ==== ==== ==== =====

  """
  # a single column name is accepted as shorthand for a one-element list
  if IsScalar(cols):
    cols = [cols]

  cols_idxs = []
  for col in cols:
    idx = self.GetColIndex(col)
    if self.col_types[idx] not in ('int', 'float', 'bool'):
      raise TypeError("RowMean can only be used on numeric column types")
    cols_idxs.append(idx)

  mean_rows = []
  for row in self.rows:
    vals = [row[idx] for idx in cols_idxs if row[idx] is not None]
    try:
      mean_rows.append(stutil.Mean(vals))
    except Exception:
      # best effort: rows without a computable mean get None;
      # KeyboardInterrupt and SystemExit are no longer swallowed
      mean_rows.append(None)

  self.AddCol(mean_col_name, 'f', mean_rows)
1479 
def Median(self, col):
  """
  Returns the median of the given column, computed by ``stutil.Median`` over
  all cells that are not None. Returns None if ``stutil.Median`` fails, which
  is expected when the column doesn't contain any values. Col must be of
  numeric column type ('float', 'int') or boolean column type.

  :param col: column name
  :type col: :class:`str`

  :raises: :class:`TypeError` if column type is ``string``
  """
  idx = self.GetColIndex(col)
  if self.col_types[idx] not in ('int', 'float', 'bool'):
    raise TypeError("Median can only be used on numeric column types")

  vals = [v for v in self[col] if v is not None]
  # the median is computed exactly once and inside the try block, so that a
  # failure yields the documented None instead of propagating
  try:
    return stutil.Median(vals)
  except Exception:
    return None
1505 
def StdDev(self, col):
  """
  Returns the standard deviation of the given column, computed by
  ``stutil.StdDev`` over all cells that are not None. Returns None if
  ``stutil.StdDev`` fails, which is expected when the column doesn't contain
  any values. Col must be of numeric column type ('float', 'int') or boolean
  column type.

  :param col: column name
  :type col: :class:`str`

  :raises: :class:`TypeError` if column type is ``string``
  """
  idx = self.GetColIndex(col)
  if self.col_types[idx] not in ('int', 'float', 'bool'):
    raise TypeError("StdDev can only be used on numeric column types")

  vals = [v for v in self[col] if v is not None]
  try:
    return stutil.StdDev(vals)
  except Exception:
    # best effort: a failing computation is reported as None;
    # KeyboardInterrupt and SystemExit are no longer swallowed
    return None
1530 
def Count(self, col, ignore_nan=True):
  """
  Count the cells of a column. By default only cells that are not None are
  counted; with ``ignore_nan=False`` every row counts.

  :param col: column name
  :type col: :class:`str`

  :param ignore_nan: skip cells that are *None*
  :type ignore_nan: :class:`bool`
  """
  idx = self.GetColIndex(col)
  return sum(1 for row in self.rows
             if not ignore_nan or row[idx] is not None)
1550 
def Correl(self, col1, col2):
  """
  Calculate the Pearson correlation coefficient between *col1* and *col2*
  with ``stutil.Correl``, only taking rows into account where both of the
  values are not equal to *None*. If the coefficient cannot be computed
  (e.g. not enough data points), *None* is returned.

  :param col1: column name for first column
  :type col1: :class:`str`

  :param col2: column name for second column
  :type col2: :class:`str`
  """
  # names are translated to column indices; anything else is used as is
  if IsStringLike(col1) and IsStringLike(col2):
    col1 = self.GetColIndex(col1)
    col2 = self.GetColIndex(col2)
  vals1, vals2 = [], []
  for v1, v2 in zip(self[col1], self[col2]):
    if v1 is not None and v2 is not None:
      vals1.append(v1)
      vals2.append(v2)
  try:
    return stutil.Correl(vals1, vals2)
  except Exception:
    # best effort: a failing computation is reported as None;
    # KeyboardInterrupt and SystemExit are no longer swallowed
    return None
1576 
def SpearmanCorrel(self, col1, col2):
  """
  Calculate the Spearman correlation coefficient between col1 and col2, only
  taking rows into account where both of the values are not equal to None. If
  the coefficient cannot be computed or is not a number (e.g. not enough
  data points), None is returned.

  :warning: The function depends on the following module: *scipy.stats.mstats*

  :param col1: column name for first column
  :type col1: :class:`str`

  :param col2: column name for second column
  :type col2: :class:`str`

  :raises: :class:`ImportError` if *scipy.stats.mstats* cannot be imported
  """
  try:
    import scipy.stats.mstats

    # names are translated to column indices; anything else is used as is
    if IsStringLike(col1) and IsStringLike(col2):
      col1 = self.GetColIndex(col1)
      col2 = self.GetColIndex(col2)
    vals1, vals2 = [], []
    for v1, v2 in zip(self[col1], self[col2]):
      if v1 is not None and v2 is not None:
        vals1.append(v1)
        vals2.append(v2)
    try:
      correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
      # math.isnan instead of the deprecated scipy.isnan alias: where the
      # alias is missing, the resulting AttributeError used to be swallowed
      # below and every correlation came back as None
      if math.isnan(correl):
        return None
      return correl
    except Exception:
      return None

  except ImportError:
    LogError("Function needs scipy.stats.mstats, but I could not import it.")
    raise
1614 
1615 
def Save(self, stream_or_filename, format='ost', sep=','):
  """
  Write the table to a stream or file. The format name is matched
  case-insensitively against the following three formats (for more
  information on file formats, see :meth:`Load`):

  ============= =======================================
  ost           ost-specific format (human readable)
  csv           comma separated values (human readable)
  pickle        pickled byte stream (binary)
  ============= =======================================

  :param stream_or_filename: filename or stream for writing output
  :type stream_or_filename: :class:`str` or :class:`file`

  :param format: output format (i.e. *ost*, *csv*, *pickle*)
  :type format: :class:`str`

  :param sep: field delimiter, only used for the *csv* format
  :type sep: :class:`str`

  :returns: whatever the format-specific writer returns

  :raises: :class:`ValueError` if format is unknown
  """
  fmt = format.lower()
  if fmt not in ('ost', 'csv', 'pickle'):
    raise ValueError('unknown format "%s"' % fmt)
  if fmt == 'csv':
    return self._SaveCSV(stream_or_filename, sep=sep)
  writer = self._SaveOST if fmt == 'ost' else self._SavePickle
  return writer(stream_or_filename)
1643 
def _SavePickle(self, stream):
  """
  Pickle the table with the highest available pickle protocol.

  :param stream: object with a ``write`` method, or a filename. A filename
                 is opened in binary mode and closed again after writing;
                 a stream passed in by the caller is left open.
  """
  owns_stream = not hasattr(stream, 'write')
  if owns_stream:
    stream = open(stream, 'wb')
  try:
    cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
  finally:
    # only close what was opened here
    if owns_stream:
      stream.close()
1648 
1649  def _SaveCSV(self, stream, sep):
1650  if not hasattr(stream, 'write'):
1651  stream=open(stream, 'wb')
1652 
1653  writer=csv.writer(stream, delimiter=sep)
1654  writer.writerow(['%s' % n for n in self.col_names])
1655  for row in self.rows:
1656  row=list(row)
1657  for i, c in enumerate(row):
1658  if c==None:
1659  row[i]='NA'
1660  writer.writerow(row)
1661 
1662  def _SaveOST(self, stream):
1663  if hasattr(stream, 'write'):
1664  writer=csv.writer(stream, delimiter=' ')
1665  else:
1666  stream=open(stream, 'w')
1667  writer=csv.writer(stream, delimiter=' ')
1668  if self.comment:
1669  stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
1670  writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
1671  for row in self.rows:
1672  row=list(row)
1673  for i, c in enumerate(row):
1674  if c==None:
1675  row[i]='NA'
1676  writer.writerow(row)
1677 
1678 
def GetNumpyMatrix(self, *args):
  '''
  Build a numpy matrix from the selected table columns. Each selected column
  becomes one column of the matrix, in the order given.

  Only columns of type *int* or *float* are accepted. The cell values are
  handed to numpy as they are stored in the table.

  :param args: names of the columns to include in the numpy matrix

  :raises: :class:`RuntimeError` if no column is given,
           :class:`TypeError` if a column is not of type *int* or *float*,
           :class:`ImportError` if numpy cannot be imported

  :warning: The function depends on *numpy*
  '''
  try:
    import numpy as np

    if not args:
      raise RuntimeError("At least one column must be specified.")

    col_indices = []
    for name in args:
      col_idx = self.GetColIndex(name)
      if self.col_types[col_idx] not in ('int', 'float'):
        raise TypeError("Numpy matrix can only be generated from numeric column types")
      col_indices.append(col_idx)

    # the table columns are stacked as matrix rows first, hence the transpose
    stacked = np.matrix([list(self[i]) for i in col_indices])
    return stacked.T

  except ImportError:
    LogError("Function needs numpy, but I could not import it.")
    raise
1709 
def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
  '''
  This returns the optimal prefactor values (i.e. a, b, c, ...) for the
  following equation

  .. math::
    :label: op1

    a*u + b*v + c*w + ... = z

  where u, v, w and z are vectors. In matrix notation

  .. math::
    :label: op2

    A*p = z

  where A contains the data from the table (u,v,w,...), p are the prefactors
  to optimize (a,b,c,...) and z is the vector containing the result of
  equation :eq:`op1`. The prefactors are obtained from the normal equations
  ``p = (A^T A)^-1 A^T z``.

  The parameter ref_col equals to z in both equations, and the positional
  arguments are the columns u, v and w (or A in :eq:`op2`). All columns must
  be specified by their names.

  **Example:**

  .. code-block:: python

    tab.GetOptimalPrefactors('colC', 'colA', 'colB')

  The function returns a list containing the prefactors a, b, c, ... in
  the correct order (i.e. same as the columns were specified).

  Weighting:
  If the kwarg weights="columX" is specified, the equations are weighted by
  the values in that column. Each row is multiplied by the weight in that row,
  which leads to :eq:`op3`:

  .. math::
    :label: op3

    weight*a*u + weight*b*v + weight*c*w + ... = weight*z

  Weights must be float or int and can have any value. A value of 0 ignores
  this equation, a value of 1 means the same as no weight. If all weights are
  the same for each row, the same result will be obtained as with no weights.

  **Example:**

  .. code-block:: python

    tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')

  :raises: :class:`RuntimeError` if no column is given or if keyword
           arguments are passed without a ``weights`` key,
           :class:`ImportError` if numpy cannot be imported
  '''
  try:
    import numpy as np

    if len(args) == 0:
      raise RuntimeError("At least one column must be specified.")

    b = self.GetNumpyMatrix(ref_col)
    a = self.GetNumpyMatrix(*args)

    if len(kwargs) != 0:
      # `in` instead of dict.has_key, which no longer exists on Python 3
      if 'weights' in kwargs:
        w = self.GetNumpyMatrix(kwargs['weights'])
        # element-wise scaling of every equation (row) by its weight
        b = np.multiply(b, w)
        a = np.multiply(a, w)
      else:
        raise RuntimeError("specified unrecognized kwargs, use weights as key")

    # least-squares solution via the normal equations
    k = (a.T*a).I*a.T*b
    return list(np.array(k.T).reshape(-1))

  except ImportError:
    LogError("Function needs numpy, but I could not import it.")
    raise
1788 
def PlotEnrichment(self, score_col, class_col, score_dir='-',
                   class_dir='-', class_cutoff=2.0,
                   style='-', title=None, x_title=None, y_title=None,
                   clear=True, save=None):
  '''
  Plot an enrichment curve using matplotlib of column *score_col* classified
  according to *class_col*.

  For more information about parameters of the enrichment, see
  :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.

  :returns: the ``matplotlib.pyplot`` module, or None if
            :meth:`ComputeEnrichment` yields no curve

  :raises: :class:`ImportError` if matplotlib cannot be imported

  :warning: The function depends on *matplotlib*
  '''
  try:
    import matplotlib.pyplot as plt

    enrichment = self.ComputeEnrichment(score_col, class_col, score_dir,
                                        class_dir, class_cutoff)
    # same convention as PlotROC: nothing to plot without a curve
    if not enrichment:
      return None

    enrx, enry = enrichment

    if not title:
      title = 'Enrichment of %s'%score_col

    if not x_title:
      x_title = '% database'

    if not y_title:
      y_title = '% positives'

    if clear:
      plt.clf()

    plt.plot(enrx, enry, style)

    plt.title(title, size='x-large', fontweight='bold')
    plt.ylabel(y_title, size='x-large')
    plt.xlabel(x_title, size='x-large')

    if save:
      plt.savefig(save)

    return plt
  except ImportError:
    LogError("Function needs matplotlib, but I could not import it.")
    raise
1833 
def ComputeEnrichment(self, score_col, class_col, score_dir='-',
                      class_dir='-', class_cutoff=2.0):
  '''
  Computes the enrichment of column *score_col* classified according to
  *class_col*.

  For this it is necessary, that the datapoints are classified into positive
  and negative points. This can be done in two ways:

   - by using one 'bool' type column (*class_col*) which contains *True* for
     positives and *False* for negatives

   - by specifying a classification column (*class_col*), a cutoff value
     (*class_cutoff*) and the classification columns direction (*class_dir*).
     This will generate the classification on the fly

     * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
     * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives

  During the calculation, the table itself is sorted by calling
  ``Sort(score_col, score_dir)``, where a '-' value means smallest values
  first and therefore, the smaller the value, the better.

  Rows whose classification value is None are never counted as positives.

  :returns: tuple ``(x, y)`` of two lists, both starting with 0: *x* holds
            the fraction of rows processed, *y* the fraction of positives
            found so far. None is returned if the enrichment is undefined,
            i.e. the table has no rows or contains no positives.

  :raises: :class:`TypeError` if *score_col* is not numeric or *class_col*
           is neither numeric nor bool, :class:`ValueError` if a direction
           is not '+' or '-'
  '''

  ALLOWED_DIR = ['+','-']

  score_idx = self.GetColIndex(score_col)
  score_type = self.col_types[score_idx]
  if score_type not in ('int', 'float'):
    raise TypeError("Score column must be numeric type")

  class_idx = self.GetColIndex(class_col)
  class_type = self.col_types[class_idx]
  if class_type not in ('int', 'float', 'bool'):
    raise TypeError("Classifier column must be numeric or bool type")

  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
    raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))

  self.Sort(score_col, score_dir)

  x = [0]
  y = [0]
  enr = 0
  for i, row in enumerate(self.rows):
    class_val = row[class_idx]
    if class_val is not None:
      if class_type == 'bool':
        is_positive = (class_val == True)
      elif class_dir == '-':
        is_positive = class_val <= class_cutoff
      else:
        is_positive = class_val >= class_cutoff
      if is_positive:
        enr += 1
    x.append(i+1)
    y.append(enr)

  # normalising by zero rows or zero positives would divide by zero; the
  # curve is undefined in that case (same convention as ComputeROC)
  if x[-1] == 0 or y[-1] == 0:
    return None

  num_rows = x[-1]
  num_positives = y[-1]
  x = [float(v)/num_rows for v in x]
  y = [float(v)/num_positives for v in y]
  return x,y
1893 
def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
                         class_dir='-', class_cutoff=2.0):
  '''
  Computes the area under the curve of the enrichment using the trapezoidal
  rule.

  For more information about parameters of the enrichment, see
  :meth:`ComputeEnrichment`.

  :returns: the area under the enrichment curve, or None if
            :meth:`ComputeEnrichment` yields no curve

  :raises: :class:`ImportError` if numpy cannot be imported

  :warning: The function depends on *numpy*
  '''
  try:
    import numpy as np

    enrichment = self.ComputeEnrichment(score_col, class_col, score_dir,
                                        class_dir, class_cutoff)
    # same convention as ComputeROCAUC: no curve, no area
    if not enrichment:
      return None
    enrx, enry = enrichment

    # newer numpy releases call the trapezoidal rule `trapezoid`; fall back
    # to the older `trapz` name where it is not available
    integrate = getattr(np, 'trapezoid', None)
    if integrate is None:
      integrate = np.trapz
    return integrate(enry, enrx)
  except ImportError:
    LogError("Function needs numpy, but I could not import it.")
    raise
1915 
def ComputeROC(self, score_col, class_col, score_dir='-',
               class_dir='-', class_cutoff=2.0):
  '''
  Computes the receiver operating characteristics (ROC) of column *score_col*
  classified according to *class_col*.

  The datapoints have to be split into positives and negatives, which can be
  done in two ways:

   - *class_col* is a 'bool' column containing True for positives and False
     for negatives
   - *class_col* is a non-bool column that is classified on the fly using a
     cutoff value (*class_cutoff*) and a direction (*class_dir*)

     - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
     - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives

  During the calculation, the table itself is sorted by calling
  ``Sort(score_col, score_dir)``, where a '-' value means smallest values
  first and therefore, the smaller the value, the better. Rows whose
  classification value is None are skipped. A point is added to the curve
  whenever the score value changes, so rows with equal scores are combined.

  :returns: tuple ``(x, y)`` of two lists holding the false positive rate
            and the true positive rate, both starting with 0. If the
            classification contains no positives or no negatives the ROC is
            not defined and *None* is returned.

  :raises: :class:`TypeError` if *score_col* is not numeric or *class_col*
           is neither numeric nor bool, :class:`ValueError` if a direction
           is not '+' or '-'
  '''

  ALLOWED_DIR = ['+','-']

  score_idx = self.GetColIndex(score_col)
  score_type = self.col_types[score_idx]
  if score_type not in ('int', 'float'):
    raise TypeError("Score column must be numeric type")

  class_idx = self.GetColIndex(class_col)
  class_type = self.col_types[class_idx]
  if class_type not in ('int', 'float', 'bool'):
    raise TypeError("Classifier column must be numeric or bool type")

  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
    raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))

  self.Sort(score_col, score_dir)

  x = [0]
  y = [0]
  tp = 0
  fp = 0
  old_score_val = None

  for row in self.rows:
    class_val = row[class_idx]
    score_val = row[score_idx]
    if class_val is None:
      continue
    if old_score_val is None:
      old_score_val = score_val
    # a new score value closes the group of equally scored rows
    if score_val != old_score_val:
      x.append(fp)
      y.append(tp)
      old_score_val = score_val
    if class_type == 'bool':
      is_positive = (class_val == True)
    else:
      is_positive = ((class_dir=='-' and class_val<=class_cutoff) or
                     (class_dir=='+' and class_val>=class_cutoff))
    if is_positive:
      tp += 1
    else:
      fp += 1
  x.append(fp)
  y.append(tp)

  # without any negatives (x) or any positives (y) the rates are undefined
  if x[-1] == 0 or y[-1] == 0:
    return None

  num_negatives = x[-1]
  num_positives = y[-1]
  x = [float(v)/num_negatives for v in x]
  y = [float(v)/num_positives for v in y]
  return x,y
1997 
def ComputeROCAUC(self, score_col, class_col, score_dir='-',
                  class_dir='-', class_cutoff=2.0):
  '''
  Computes the area under the curve of the receiver operating characteristics
  using the trapezoidal rule.

  For more information about parameters of the ROC, see
  :meth:`ComputeROC`.

  :returns: the area under the ROC curve, or None if :meth:`ComputeROC`
            yields no curve

  :raises: :class:`ImportError` if numpy cannot be imported

  :warning: The function depends on *numpy*
  '''
  try:
    import numpy as np

    roc = self.ComputeROC(score_col, class_col, score_dir,
                          class_dir, class_cutoff)

    if not roc:
      return None

    # newer numpy releases call the trapezoidal rule `trapezoid`; fall back
    # to the older `trapz` name where it is not available
    integrate = getattr(np, 'trapezoid', None)
    if integrate is None:
      integrate = np.trapz
    return integrate(roc[1], roc[0])
  except ImportError:
    LogError("Function needs numpy, but I could not import it.")
    raise
2021 
def PlotROC(self, score_col, class_col, score_dir='-',
            class_dir='-', class_cutoff=2.0,
            style='-', title=None, x_title=None, y_title=None,
            clear=True, save=None):
  '''
  Draw the ROC curve of *score_col* classified by *class_col* using
  matplotlib.

  The curve is obtained from :meth:`ComputeROC`, see there for the meaning of
  the classification parameters. For the plotting parameters see
  :meth:`Plot`.

  :returns: the ``matplotlib.pyplot`` module, or None if :meth:`ComputeROC`
            yields no curve

  :raises: :class:`ImportError` if matplotlib cannot be imported

  :warning: The function depends on *matplotlib*
  '''

  try:
    import matplotlib.pyplot as plt

    roc = self.ComputeROC(score_col, class_col, score_dir,
                          class_dir, class_cutoff)
    if not roc:
      return None
    fp_rates, tp_rates = roc

    # fall back to default labels where none were given
    plot_title = title if title else 'ROC of %s'%score_col
    x_label = x_title if x_title else 'false positive rate'
    y_label = y_title if y_title else 'true positive rate'

    if clear:
      plt.clf()

    plt.plot(fp_rates, tp_rates, style)

    plt.title(plot_title, size='x-large', fontweight='bold')
    plt.ylabel(y_label, size='x-large')
    plt.xlabel(x_label, size='x-large')

    if save:
      plt.savefig(save)

    return plt
  except ImportError:
    LogError("Function needs matplotlib, but I could not import it.")
    raise
2071 
def ComputeMCC(self, score_col, class_col, score_dir='-',
               class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
  '''
  Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
  with the points classified into true positives, false positives, true
  negatives and false negatives according to a specified classification
  column (*class_col*).

  The datapoints in *score_col* and *class_col* are classified into
  positive and negative points. This can be done in two ways:

   - by using 'bool' columns which contains True for positives and False
     for negatives

   - by using 'float' or 'int' columns and specifying a cutoff value and the
     columns direction. This will generate the classification on the fly

     * if ``class_dir``/``score_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
     * if ``class_dir``/``score_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives

  The two possibilities can be used together, i.e. 'bool' type for one column
  and 'float'/'int' type and cutoff/direction for the other column.

  Rows in which the classification value or the score value is None are
  ignored.

  :returns: the MCC, or None (after logging a warning) if one of the factors
            in the denominator is zero and the MCC is therefore not defined

  :raises: :class:`TypeError` if one of the columns is neither numeric nor
           bool, :class:`ValueError` if a direction is not '+' or '-'
  '''
  ALLOWED_DIR = ['+','-']

  score_idx = self.GetColIndex(score_col)
  score_type = self.col_types[score_idx]
  if score_type not in ('int', 'float', 'bool'):
    raise TypeError("Score column must be numeric or bool type")

  class_idx = self.GetColIndex(class_col)
  class_type = self.col_types[class_idx]
  if class_type not in ('int', 'float', 'bool'):
    raise TypeError("Classifier column must be numeric or bool type")

  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
    raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))

  def _IsPositive(val, col_type, direction, cutoff):
    # bool columns carry the classification directly, numeric columns are
    # classified by comparing against the cutoff in the given direction
    if col_type == 'bool':
      return val == True
    if direction == '-':
      return val <= cutoff
    return val >= cutoff

  tp = 0
  fp = 0
  fn = 0
  tn = 0

  for row in self.rows:
    class_val = row[class_idx]
    score_val = row[score_idx]
    # a missing value cannot be classified; comparing None against the
    # cutoff gave direction-dependent results on Python 2 and fails on
    # Python 3
    if class_val is None or score_val is None:
      continue
    actual = _IsPositive(class_val, class_type, class_dir, class_cutoff)
    predicted = _IsPositive(score_val, score_type, score_dir, score_cutoff)
    if actual and predicted:
      tp += 1
    elif actual:
      fn += 1
    elif predicted:
      fp += 1
    else:
      tn += 1

  mcc = None
  msg = None
  if (tp+fn)==0:
    msg = 'factor (tp + fn) is zero'
  elif (tp+fp)==0:
    msg = 'factor (tp + fp) is zero'
  elif (tn+fn)==0:
    msg = 'factor (tn + fn) is zero'
  elif (tn+fp)==0:
    msg = 'factor (tn + fp) is zero'

  if msg:
    LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
  else:
    mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
  return mcc
2146 
2147 
def IsEmpty(self, col_name=None, ignore_nan=True):
  '''
  Checks if a table is empty.

  If no column name is specified, the whole table is checked for being empty,
  whereas if a column name is specified, only this column is checked.

  By default, all NAN (i.e. None) values are ignored, and thus, a table
  containing only None values is considered as empty. By specifying the
  option ignore_nan=False, None values are counted as 'normal' values.

  :param col_name: name of the column to check, or None for the whole table
  :param ignore_nan: if True, None cells do not count as content. For a
                     single column the flag is forwarded to :meth:`Count`.

  :returns: True if the table (or the column) holds no values, else False

  :raises: :class:`ValueError` if *col_name* is given but the table has no
           columns at all
  '''

  # table with no columns and no rows
  if len(self.col_names)==0:
    if col_name:
      raise ValueError('Table has no column named "%s"' % col_name)
    return True

  # column name specified -> only this column decides
  if col_name:
    return self.Count(col_name, ignore_nan=ignore_nan)==0

  # no column name specified -> test whole table
  if ignore_nan:
    # empty unless at least one cell holds something other than None
    return all(cell is None for row in self.rows for cell in row)
  # every cell counts: empty only if there is not a single cell
  return not any(True for row in self.rows for _ in row)
2183 
def Extend(self, tab, overwrite=None):
  """
  Append each row of *tab* to the current table. The data is appended based
  on the column names, thus the order of the table columns is *not* relevant,
  only the header names.

  If there is a column in *tab* that is not present in the current table,
  it is added to the current table (via :meth:`AddCol`) before the rows are
  appended.

  If the type of any column in *tab* is not the same as in the current table
  a *TypeError* is raised. The check is done before anything is modified, so
  the current table is left untouched in that case.

  Every row of *tab* is converted to a dictionary mapping column name to
  value and handed to :meth:`AddRow` together with *overwrite*; see
  :meth:`AddRow` for how an *overwrite* column name is used to replace an
  existing row instead of appending a new one.

  :param tab: table whose rows are appended
  :param overwrite: forwarded unchanged to :meth:`AddRow` for every row

  :raises: :class:`TypeError` if a column present in both tables has
           different types
  """
  # check that column types are the same in current and new table; done
  # first so that a mismatch does not leave half-added columns behind
  for name in self.col_names:
    if name in tab.col_names:
      curr_type = self.col_types[self.GetColIndex(name)]
      new_type = tab.col_types[tab.GetColIndex(name)]
      if curr_type!=new_type:
        raise TypeError('cannot extend table, column %s in new '%name +\
                        'table different type (%s) than in '%new_type +\
                        'current table (%s)'%curr_type)

  # add column to current table if it doesn't exist
  for name,typ in zip(tab.col_names, tab.col_types):
    if name not in self.col_names:
      self.AddCol(name, typ)

  # the row count is fixed up front so that the loop terminates even when
  # *tab* is the current table itself and grows while being extended
  num_rows = len(tab.rows)
  for i in range(0,num_rows):
    row = tab.rows[i]
    data = dict(zip(tab.col_names,row))
    self.AddRow(data, overwrite)
2223 
2224 
def Merge(table1, table2, by, only_matching=False):
  """
  Returns a new table containing the data from both tables. The rows are
  combined based on the common values in the column(s) by. The option 'by' can
  be a list of column names. When this is the case, merging is based on
  multiple columns.
  For example, the two tables below

  ==== ====
  x     y
  ==== ====
   1    10
   2    15
   3    20
  ==== ====

  ==== ====
  x     u
  ==== ====
    1  100
    3  200
    4  400
  ==== ====

  when merged by column x, produce the following output:

  ===== ===== =====
  x      y     u
  ===== ===== =====
  1      10    100
  2      15    None
  3      20    200
  4      None  400
  ===== ===== =====

  Columns of *table2* (other than the 'by' columns) whose name already exists
  in *table1* are renamed by appending ``_2``, ``_3``, ... until the name is
  unique with respect to *table1*. The order of the rows in the result follows
  dictionary iteration order and is therefore not guaranteed.

  :param table1: first table, its columns come first in the result
  :param table2: second table
  :param by: column name or iterable of column names present in both tables
  :param only_matching: if True, only rows whose key occurs in both tables
                        are put into the result

  :returns: a new :class:`Table`

  :raises: :class:`ValueError` if a 'by' column is missing in one of the
           tables or if a key occurs more than once within one table
  """
  def _key(row, indices):
    # values of the 'by' columns of a row, usable as dictionary key
    return tuple([row[i] for i in indices])

  def _index_rows(table, indices, which):
    # map key -> row for a table, refusing ambiguous (duplicate) keys
    keyed={}
    for row in table.rows:
      key=_key(row, indices)
      if key in keyed:
        raise ValueError('duplicate key "%s" in %s table' % (str(key), which))
      keyed[key]=row
    return keyed

  # materialise 'by' once: it is needed for both tables and may be a
  # one-shot iterable
  if isinstance(by, str):
    by_names=[by]
  else:
    by_names=list(by)

  # columns of table2 that are carried over, i.e. everything but the keys
  common2_indices=[table2.col_names.index(b) for b in by_names]
  new_index=[i for i in range(len(table2.col_names))
             if i not in common2_indices]
  col_names=[table2.col_names[i] for i in new_index]
  col_types=[table2.col_types[i] for i in new_index]

  # avoid name clashes with the columns of table1
  for i, name in enumerate(col_names):
    try_name=name
    counter=1
    while try_name in table1.col_names:
      counter+=1
      try_name='%s_%d' % (name, counter)
    col_names[i]=try_name

  common1_indices=[table1.col_names.index(b) for b in by_names]
  common1=_index_rows(table1, common1_indices, 'first')
  common2=_index_rows(table2, common2_indices, 'second')

  num_cols1=len(table1.col_names)
  new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)

  # rows of table1, completed with the matching row of table2 if there is one
  for key, row1 in common1.items():
    if key in common2:
      row2=common2[key]
      extra=[row2[index] for index in new_index]
    elif only_matching:
      continue
    else:
      extra=[None for index in new_index]
    new_tab.AddRow(row1+extra)
  if only_matching:
    return new_tab

  # rows that only exist in table2: the key values go into the 'by' columns
  # of table1, all other table1 columns stay None
  for key, row2 in common2.items():
    if key in common1:
      continue
    row=[None for i in range(num_cols1)]+[row2[index] for index in new_index]
    for common1_index, common2_index in zip(common1_indices, common2_indices):
      row[common1_index]=row2[common2_index]
    new_tab.AddRow(row)
  return new_tab
2325 
2326