OpenStructure
table.py
1 import csv
2 import re
3 import math
4 from ost import stutil
5 import itertools
6 import operator
7 import cPickle
8 import weakref
9 from ost import LogError, LogWarning, LogInfo, LogVerbose
10 
11 def MakeTitle(col_name):
12  return col_name.replace('_', ' ')
13 
14 def IsStringLike(value):
15  if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
16  return False
17  try:
18  value+''
19  return True
20  except:
21  return False
22 
23 def IsNullString(value):
24  value=value.strip().upper()
25  return value in ('', 'NULL', 'NONE', 'NA')
26 
27 def IsScalar(value):
28  if IsStringLike(value):
29  return True
30  try:
31  if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
32  return False
33  iter(value)
34  return False
35  except:
36  return True
37 
38 def GuessColumnType(iterator):
39  empty=True
40  possibilities=set(['bool', 'int', 'float'])
41  for ele in iterator:
42  str_ele=str(ele).upper()
43  if IsNullString(str_ele):
44  continue
45  empty=False
46  if 'int' in possibilities:
47  try:
48  int(str_ele)
49  except ValueError:
50  possibilities.remove('int')
51 
52  if 'float' in possibilities:
53  try:
54  float(str_ele)
55  except ValueError:
56  possibilities.remove('float')
57  if 'bool' in possibilities:
58  if str_ele not in set(['YES', 'NO', 'TRUE', 'FALSE']):
59  possibilities.remove('bool')
60 
61  if len(possibilities)==0:
62  return 'string'
63  if len(possibilities)==2:
64  return 'int'
65  if empty:
66  return 'string'
67  # return the last element available
68  return possibilities.pop()
69 
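# Illustrative examples (assumed inputs) of the guessing rules above:
#   GuessColumnType(['1', '2', 'NA'])  -> 'int'
#   GuessColumnType(['1.5', 'NONE'])   -> 'float'
#   GuessColumnType(['yes', 'NO'])     -> 'bool'
#   GuessColumnType(['foo', '1'])      -> 'string'
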
70 class BinaryColExpr:
71  def __init__(self, op, lhs, rhs):
72  self.op=op
73  self.lhs=lhs
74  self.rhs=rhs
75  if IsScalar(lhs):
76  self.lhs=itertools.cycle([self.lhs])
77  if IsScalar(rhs):
78  self.rhs=itertools.cycle([self.rhs])
79  def __iter__(self):
80  for l, r in zip(self.lhs, self.rhs):
81  if l!=None and r!=None:
82  yield self.op(l, r)
83  else:
84  yield None
85  def __add__(self, rhs):
86  return BinaryColExpr(operator.add, self, rhs)
87 
88  def __sub__(self, rhs):
89  return BinaryColExpr(operator.sub, self, rhs)
90 
91  def __mul__(self, rhs):
92  return BinaryColExpr(operator.mul, self, rhs)
93 
94  def __div__(self, rhs):
95  return BinaryColExpr(operator.div, self, rhs)
96 
97 class TableCol:
98  def __init__(self, table, col):
99  self._table=table
100  if type(col)==str:
101  self.col_index=self._table.GetColIndex(col)
102  else:
103  self.col_index=col
104 
105  def __iter__(self):
106  for row in self._table.rows:
107  yield row[self.col_index]
108 
109  def __len__(self):
110  return len(self._table.rows)
111 
112  def __getitem__(self, index):
113  return self._table.rows[index][self.col_index]
114 
115  def __setitem__(self, index, value):
116  self._table.rows[index][self.col_index]=value
117 
118  def __add__(self, rhs):
119  return BinaryColExpr(operator.add, self, rhs)
120 
121  def __sub__(self, rhs):
122  return BinaryColExpr(operator.sub, self, rhs)
123 
124  def __mul__(self, rhs):
125  return BinaryColExpr(operator.mul, self, rhs)
126 
127  def __div__(self, rhs):
128  return BinaryColExpr(operator.div, self, rhs)
129 
130 class TableRow:
131  """
132  Essentially a named tuple, but allows column names that are not valid
133  python variable names.
134  """
135  def __init__(self, row_data, tab):
136  self.__dict__['tab'] = weakref.proxy(tab)
137  self.__dict__['row_data'] = row_data
138 
139  def __getitem__(self, col_name):
140  if type(col_name)==int:
141  return self.row_data[col_name]
142  return self.row_data[self.tab.GetColIndex(col_name)]
143 
144  def __str__(self):
145  s = []
146  for k, v in zip(self.__dict__['tab'].col_names, self.__dict__['row_data']):
147  s.append('%s=%s' % (k, str(v)))
148  return ', '.join(s)
149 
150 
151  def __len__(self):
152  return len(self.row_data)
153 
154  def __setitem__(self, col_name, val):
155  if type(col_name)==int:
156  self.row_data[col_name] = val
157  else:
158  self.row_data[self.tab.GetColIndex(col_name)] = val
159 
160  def __getattr__(self, col_name):
161  if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
162  raise AttributeError(col_name)
163  return self.row_data[self.tab.GetColIndex(col_name)]
164 
165  def __setattr__(self, col_name, val):
166  if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
167  raise AttributeError(col_name)
168  self.row_data[self.tab.GetColIndex(col_name)] = val
169 
170 class Table(object):
171  """
172 
173  The table class provides convenient access to data in tabular form. An empty
174  table can be easily constructed as follows
175 
176  .. code-block:: python
177 
178  tab = Table()
179 
180  If you want to add columns directly when creating the table, column names
181  and *column types* can be specified as follows
182 
183  .. code-block:: python
184 
185  tab = Table(['nameX','nameY','nameZ'], 'sfb')
186 
187  This will create three columns called nameX, nameY and nameZ of type string,
188  float and bool, respectively. There will be no data in the table and thus,
189  the table will not contain any rows.
190 
191  The following *column types* are supported:
192 
193  ======= ========
194  name    abbrev
195  ======= ========
196  string  s
197  float   f
198  int     i
199  bool    b
200  ======= ========
201 
202  If you want to add data to the table in addition, use the following:
203 
204  .. code-block:: python
205 
206  tab=Table(['nameX','nameY','nameZ'],
207  'sfb',
208  nameX = ['a','b','c'],
209  nameY = [0.1, 1.2, 3.414],
210  nameZ = [True, False, False])
211 
212  If values for one column are left out, they will be filled with NA. If
213  values are specified, the same number of values must be provided for
214  every column, as sketched below.
215 
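 For example, the following is a minimal sketch (column names are arbitrary)
 in which 'nameY' is left out and therefore filled with NA:

 .. code-block:: python

   tab = Table(['nameX','nameY'], 'sf', nameX=['a','b'])
   print tab   # the 'nameY' column shows NA in both rows
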
216  """
217 
218  SUPPORTED_TYPES=('int', 'float', 'bool', 'string',)
219 
220 
221  def __init__(self, col_names=[], col_types=None, **kwargs):
222 
223  self.col_names=list(col_names)
224  self.comment=''
225  self.name=''
226 
227  self.col_types = self._ParseColTypes(col_types)
228  self.rows=[]
229  if len(kwargs)>=0:
230  if not col_names:
231  self.col_names=[v for v in kwargs.keys()]
232  if not self.col_types:
233  self.col_types=['string' for u in range(len(self.col_names))]
234  if len(kwargs)>0:
235  self._AddRowsFromDict(kwargs)
236 
237  def __getattr__(self, col_name):
238  # pickling doesn't call the standard __init__ defined above and thus
239  # col_names might not be defined. This leads to infinite recursions.
240  # Protect against it by checking that col_names is contained in
241  # __dict__
242  if 'col_names' not in self.__dict__ or col_name not in self.col_names:
243  raise AttributeError(col_name)
244  return TableCol(self, col_name)
245 
246  @staticmethod
247  def _ParseColTypes(types, exp_num=None):
248  if types==None:
249  return None
250 
251  short2long = {'s' : 'string', 'i': 'int', 'b' : 'bool', 'f' : 'float'}
252  allowed_short = short2long.keys()
253  allowed_long = short2long.values()
254 
255  type_list = []
256 
257  # string type
258  if IsScalar(types):
259  if type(types)==str:
260  types = types.lower()
261 
262  # single value
263  if types in allowed_long:
264  type_list.append(types)
265  elif types in allowed_short:
266  type_list.append(short2long[types])
267 
268  # comma separated list of long or short types
269  elif types.find(',')!=-1:
270  for t in types.split(','):
271  if t in allowed_long:
272  type_list.append(t)
273  elif t in allowed_short:
274  type_list.append(short2long[t])
275  else:
276  raise ValueError('Unknown type %s in types %s'%(t,types))
277 
278  # string of short types
279  else:
280  for t in types:
281  if t in allowed_short:
282  type_list.append(short2long[t])
283  else:
284  raise ValueError('Unknown type %s in types %s'%(t,types))
285 
286  # non-string type
287  else:
288  raise ValueError('Col type %s must be string or list'%types)
289 
290  # list type
291  else:
292  for t in types:
293  # must be string type
294  if type(t)==str:
295  t = t.lower()
296  if t in allowed_long:
297  type_list.append(t)
298  elif t in allowed_short:
299  type_list.append(short2long[t])
300  else:
301  raise ValueError('Unknown type %s in types %s'%(t,types))
302 
303  # non-string type
304  else:
305  raise ValueError('Col type %s must be string or list'%types)
306 
307  if exp_num:
308  if len(type_list)!=exp_num:
309  raise ValueError(('Parsed number of col types (%i) differs from ' + \
310  'expected (%i) in types %s')%(len(type_list),exp_num,types))
311 
312  return type_list
313 
314  def SetName(self, name):
315  '''
316  Set name of the table
317 
318  :param name: name
319  :type name: :class:`str`
320  '''
321  self.name = name
322 
323  def GetName(self):
324  '''
325  Get name of table
326  '''
327  return self.name
328 
329  def RenameCol(self, old_name, new_name):
330  """
331  Rename column *old_name* to *new_name*.
332 
333  :param old_name: Name of the old column
334  :param new_name: Name of the new column
335  :raises: :exc:`ValueError` when *old_name* is not a valid column
336  """
337  if old_name==new_name:
338  return
339  self.AddCol(new_name, self.col_types[self.GetColIndex(old_name)],
340  self[old_name])
341  self.RemoveCol(old_name)
342  def _Coerce(self, value, ty):
343  '''
344  Try to convert values (e.g. from :class:`str` type) to the specified type
345 
346  :param value: the value
347  :type value: any type
348 
349  :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
350  *bool*)
351  :type ty: :class:`str`
352  '''
353  if value=='NA' or value==None:
354  return None
355  if ty=='int':
356  return int(value)
357  if ty=='float':
358  return float(value)
359  if ty=='string':
360  return str(value)
361  if ty=='bool':
362  if isinstance(value, str) or isinstance(value, unicode):
363  if value.upper() in ('FALSE', 'NO',):
364  return False
365  return True
366  return bool(value)
367  raise ValueError('Unknown type %s' % ty)
368 
369  def GetColIndex(self, col):
370  '''
371  Returns the column index for the column with the given name.
372 
373  :raises: ValueError if no column with the name is found.
374  '''
375  if col not in self.col_names:
376  raise ValueError('Table has no column named "%s"' % col)
377  return self.col_names.index(col)
378 
379  def GetColNames(self):
380  '''
381  Returns a list containing all column names.
382  '''
383  return self.col_names
384 
385  def SearchColNames(self, regex):
386  '''
387  Returns a list of column names matching the regex.
388 
389  :param regex: regex pattern
390  :type regex: :class:`str`
391 
392  :returns: :class:`list` of column names (:class:`str`)
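 A minimal usage sketch (the column names are illustrative):

 .. code-block:: python

   tab = Table(['rmsd_a', 'rmsd_b', 'name'], 'ffs')
   print tab.SearchColNames('^rmsd')   # ['rmsd_a', 'rmsd_b']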
393  '''
394  matching_names = []
395  for name in self.col_names:
396  matches = re.search(regex, name)
397  if matches:
398  matching_names.append(name)
399  return matching_names
400 
401  def HasCol(self, col):
402  '''
403  Checks if the column with a given name is present in the table.
404  '''
405  return col in self.col_names
406 
407  def __getitem__(self, k):
408  if type(k)==int:
409  return TableCol(self, self.col_names[k])
410  else:
411  return TableCol(self, k)
412 
413  def __setitem__(self, k, value):
414  col_index=k
415  if type(k)!=int:
416  col_index=self.GetColIndex(k)
417  if IsScalar(value):
418  value=itertools.cycle([value])
419  for r, v in zip(self.rows, value):
420  r[col_index]=v
421 
422  def ToString(self, float_format='%.3f', int_format='%d', rows=None):
423  '''
424  Convert the table into a string representation.
425 
426  The output format can be modified for int and float type columns by
427  specifying a formatting string for the parameters *float_format* and
428  *int_format*.
429 
430  The option *rows* specifies the range of rows to be printed. The parameter
431  must be a type that supports indexing (e.g. a :class:`list`) containing the
432  start and end row *index*, e.g. [start_row_idx, end_row_idx].
433 
434  :param float_format: formatting string for float columns
435  :type float_format: :class:`str`
436 
437  :param int_format: formatting string for int columns
438  :type int_format: :class:`str`
439 
440  :param rows: iterable containing start and end row *index*
441  :type rows: iterable containing :class:`ints <int>`
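 A minimal sketch showing the *rows* and *float_format* options (values are
 illustrative):

 .. code-block:: python

   tab = Table(['x', 'y'], 'fs', x=[1.0, 2.0, 3.0], y=['a', 'b', 'c'])
   # print only the first two rows, with one digit after the decimal point
   print tab.ToString(float_format='%.1f', rows=[0, 2])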
442  '''
443  widths=[len(cn) for cn in self.col_names]
444  sel_rows=self.rows
445  if rows:
446  sel_rows=self.rows[rows[0]:rows[1]]
447  for row in sel_rows:
448  for i, (ty, col) in enumerate(zip(self.col_types, row)):
449  if col==None:
450  widths[i]=max(widths[i], len('NA'))
451  elif ty=='float':
452  widths[i]=max(widths[i], len(float_format % col))
453  elif ty=='int':
454  widths[i]=max(widths[i], len(int_format % col))
455  else:
456  widths[i]=max(widths[i], len(str(col)))
457  s=''
458  if self.comment:
459  s+=''.join(['# %s\n' % l for l in self.comment.split('\n')])
460  total_width=sum(widths)+2*len(widths)
461  for width, col_name in zip(widths, self.col_names):
462  s+=col_name.center(width+2)
463  s+='\n%s\n' % ('-'*total_width)
464  for row in sel_rows:
465  for width, ty, col in zip(widths, self.col_types, row):
466  cs=''
467  if col==None:
468  cs='NA'.center(width+2)
469  elif ty=='float':
470  cs=(float_format % col).rjust(width+2)
471  elif ty=='int':
472  cs=(int_format % col).rjust(width+2)
473  else:
474  cs=' '+str(col).ljust(width+1)
475  s+=cs
476  s+='\n'
477  return s
478 
479  def __str__(self):
480  return self.ToString()
481 
482  def Stats(self, col):
483  idx = self.GetColIndex(col)
484  text ='''
485 Statistics for column %(col)s
486 
487  Number of Rows : %(num)d
488  Number of Rows Not None: %(num_non_null)d
489  Mean : %(mean)f
490  Median : %(median)f
491  Standard Deviation : %(stddev)f
492  Min : %(min)f
493  Max : %(max)f
494 '''
495  data = {
496  'col' : col,
497  'num' : len(self.rows),
498  'num_non_null' : self.Count(col),
499  'median' : self.Median(col),
500  'mean' : self.Mean(col),
501  'stddev' : self.StdDev(col),
502  'min' : self.Min(col),
503  'max' : self.Max(col),
504  }
505  return text % data
506 
507  def _AddRowsFromDict(self, d, overwrite=None):
508  '''
509  Add one or more rows from a :class:`dictionary <dict>`.
510 
511  If *overwrite* is not None and set to an existing column name, the specified
512  column in the table is searched for the first occurrence of a value matching
513  the value of the column with the same name in the dictionary. If a matching
514  value is found, the row is overwritten with the dictionary. If no matching
515  row is found, a new row is appended to the table.
516 
517  :param d: dictionary containing the data
518  :type d: :class:`dict`
519 
520  :param overwrite: column name to overwrite existing row if value in
521  column *overwrite* matches
522  :type overwrite: :class:`str`
523 
524  :raises: :class:`ValueError` if multiple rows are added but the number of
525  data items is different for different columns.
526  '''
527  # get column indices
528  idxs = [self.GetColIndex(k) for k in d.keys()]
529 
530  # convert scalar values to list
531  old_len = None
532  for k,v in d.iteritems():
533  if IsScalar(v):
534  v = [v]
535  d[k] = v
536  if not old_len:
537  old_len = len(v)
538  elif old_len!=len(v):
539  raise ValueError("Cannot add rows: length of data must be equal " + \
540  "for all columns in %s"%str(d))
541 
542  # convert column based dict to row based dict and create row and add data
543  for i,data in enumerate(zip(*d.values())):
544  new_row = [None for a in range(len(self.col_names))]
545  for idx,v in zip(idxs,data):
546  new_row[idx] = self._Coerce(v, self.col_types[idx])
547 
548  # partially overwrite existing row with new data
549  if overwrite:
550  overwrite_idx = self.GetColIndex(overwrite)
551  added = False
552  for i,r in enumerate(self.rows):
553  if r[overwrite_idx]==new_row[overwrite_idx]:
554  for j,e in enumerate(self.rows[i]):
555  if new_row[j]==None:
556  new_row[j] = e
557  self.rows[i] = new_row
558  added = True
559  break
560 
561  # if not overwrite or overwrite did not find appropriate row
562  if not overwrite or not added:
563  self.rows.append(new_row)
564 
565  def PairedTTest(self, col_a, col_b):
566  """
567  Two-sided test for the null-hypothesis that two related samples
568  have the same average (expected values).
569 
570  :param col_a: First column
571  :param col_b: Second column
572 
573  :returns: P-value between 0 and 1 that the two columns have the
574  same average. The smaller the value, the stronger the evidence
575  that the two averages differ.
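 A minimal sketch (requires scipy; the column names and values are
 illustrative):

 .. code-block:: python

   tab = Table(['method_a', 'method_b'], 'ff',
               method_a=[1.0, 2.0, 3.0],
               method_b=[1.1, 2.2, 2.9])
   p_value = tab.PairedTTest('method_a', 'method_b')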
576  """
577  from scipy.stats import ttest_rel
578  xs = []
579  ys = []
580  for x, y in self.Zip(col_a, col_b):
581  if x!=None and y!=None:
582  xs.append(x)
583  ys.append(y)
584  result = ttest_rel(xs, ys)
585  return result[1]
586 
587  def AddRow(self, data, overwrite=None):
588  """
589  Add a row to the table.
590 
591  *data* may either be a dictionary or a list-like object:
592 
593  - If *data* is a dictionary, the keys in the dictionary must match the
594  column names. Columns not found in the dict will be initialized to None.
595  If the dict contains list-like objects, multiple rows will be added, if
596  the number of items in all list-like objects is the same, otherwise a
597  :class:`ValueError` is raised.
598 
599  - If *data* is a list-like object, the row is initialized from the values
600  in *data*. The number of items in *data* must match the number of
601  columns in the table. A :class:`ValueError` is raised otherwise. The
602  values are added in the order specified in the list, thus, the order of
603  the data must match the columns.
604 
605  If *overwrite* is not None and set to an existing column name, the specified
606  column in the table is searched for the first occurrence of a value matching
607  the value of the column with the same name in the dictionary. If a matching
608  value is found, the row is overwritten with the dictionary. If no matching
609  row is found, a new row is appended to the table.
610 
611  :param data: data to add
612  :type data: :class:`dict` or *list-like* object
613 
614  :param overwrite: column name to overwrite existing row if value in
615  column *overwrite* matches
616  :type overwrite: :class:`str`
617 
618  :raises: :class:`ValueError` if *list-like* object is used and number of
619  items does *not* match number of columns in table.
620 
621  :raises: :class:`ValueError` if *dict* is used and multiple rows are added
622  but the number of data items is different for different columns.
623 
624  **Example:** add multiple data rows to a subset of columns using a dictionary
625 
626  .. code-block:: python
627 
628  # create table with three float columns
629  tab = Table(['x','y','z'], 'fff')
630 
631  # add rows from dict
632  data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
633  tab.AddRow(data)
634  print tab
635 
636  '''
637  will produce the table
638 
639  ==== ==== ====
640  x    y    z
641  ==== ==== ====
642  1.20 NA   1.60
643  1.60 NA   5.30
644  ==== ==== ====
645  '''
646 
647  # overwrite the row with x=1.2 and add row with x=1.9
648  data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
649  tab.AddRow(data, overwrite='x')
650  print tab
651 
652  '''
653  will produce the table
654 
655  ==== ==== ====
656  x    y    z
657  ==== ==== ====
658  1.20 NA   7.90
659  1.60 NA   5.30
660  1.90 NA   3.50
661  ==== ==== ====
662  '''
663  """
664  if type(data)==dict:
665  self._AddRowsFromDict(data, overwrite)
666  else:
667  if len(data)!=len(self.col_names):
668  msg='data array must have %d elements, not %d'
669  raise ValueError(msg % (len(self.col_names), len(data)))
670  new_row = [self._Coerce(v, t) for v, t in zip(data, self.col_types)]
671 
672  # fully overwrite existing row with new data
673  if overwrite:
674  overwrite_idx = self.GetColIndex(overwrite)
675  added = False
676  for i,r in enumerate(self.rows):
677  if r[overwrite_idx]==new_row[overwrite_idx]:
678  self.rows[i] = new_row
679  added = True
680  break
681 
682  # if not overwrite or overwrite did not find appropriate row
683  if not overwrite or not added:
684  self.rows.append(new_row)
685 
686  def RemoveCol(self, col):
687  """
688  Remove column with the given name from the table.
689 
690  :param col: name of column to remove
691  :type col: :class:`str`
692  """
693  idx = self.GetColIndex(col)
694  del self.col_names[idx]
695  del self.col_types[idx]
696  for row in self.rows:
697  del row[idx]
698 
699  def AddCol(self, col_name, col_type, data=None):
700  """
701  Add a column to the right of the table.
702 
703  :param col_name: name of new column
704  :type col_name: :class:`str`
705 
706  :param col_type: type of new column (long versions: *int*, *float*, *bool*,
707  *string* or short versions: *i*, *f*, *b*, *s*)
708  :type col_type: :class:`str`
709 
710  :param data: data to add to new column
711  :type data: scalar or iterable
712 
713  **Example:**
714 
715  .. code-block:: python
716 
717  tab = Table(['x'], 'f', x=range(5))
718  tab.AddCol('even', 'bool', itertools.cycle([True, False]))
719  print tab
720 
721  '''
722  will produce the table
723 
724  ==== =====
725  x    even
726  ==== =====
727  0    True
728  1    False
729  2    True
730  3    False
731  4    True
732  ==== =====
733  '''
734 
735  If data is a constant instead of an iterable object, its value
736  will be written into each row:
737 
738  .. code-block:: python
739 
740  tab = Table(['x'], 'f', x=range(5))
741  tab.AddCol('num', 'i', 1)
742  print tab
743 
744  '''
745  will produce the table
746 
747  ==== ====
748  x    num
749  ==== ====
750  0    1
751  1    1
752  2    1
753  3    1
754  4    1
755  ==== ====
756  '''
757 
758  As a special case, if there are no previous rows, and data is not
759  None, rows are added for every item in data.
760  """
761 
762  if col_name in self.col_names:
763  raise ValueError('Column with name %s already exists'%col_name)
764 
765  col_type = self._ParseColTypes(col_type, exp_num=1)[0]
766  self.col_names.append(col_name)
767  self.col_types.append(col_type)
768 
769  if len(self.rows)>0:
770  if IsScalar(data):
771  for row in self.rows:
772  row.append(data)
773  else:
774  if hasattr(data, '__len__') and len(data)!=len(self.rows):
775  self.col_names.pop()
776  self.col_types.pop()
777  raise ValueError('Length of data (%i) must correspond to number of '%len(data) +\
778  'existing rows (%i)'%len(self.rows))
779  for row, d in zip(self.rows, data):
780  row.append(d)
781 
782  elif data!=None and len(self.col_names)==1:
783  if IsScalar(data):
784  self.AddRow({col_name : data})
785  else:
786  for v in data:
787  self.AddRow({col_name : v})
788 
789  def Filter(self, *args, **kwargs):
790  """
791  Returns a filtered table only containing rows matching all the predicates
792  in kwargs and args. For example,
793 
794  .. code-block:: python
795 
796  tab.Filter(town='Basel')
797 
798  will return all the rows where the value of the column "town" is equal to
799  "Basel". Several predicates may be combined, i.e.
800 
801  .. code-block:: python
802 
803  tab.Filter(town='Basel', male=True)
804 
805  will return the rows with "town" equal to "Basel" and "male" equal to true.
806  The args are unary callables returning True if the row should be included in
807  the result and False if not, as illustrated below.
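 A minimal sketch of a callable predicate (the column 'age' is assumed to
 exist; note that the callable receives the raw row, i.e. a plain list):

 .. code-block:: python

   age_idx = tab.GetColIndex('age')
   adults = tab.Filter(lambda row: row[age_idx] > 18, town='Basel')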
808  """
809  filt_tab=Table(list(self.col_names), list(self.col_types))
810  for row in self.rows:
811  matches=True
812  for func in args:
813  if not func(row):
814  matches=False
815  break
816  for key, val in kwargs.iteritems():
817  if row[self.GetColIndex(key)]!=val:
818  matches=False
819  break
820  if matches:
821  filt_tab.AddRow(row)
822  return filt_tab
823 
824 
825  def Select(self, query):
826 
827  """
828  Returns a new table object containing all rows matching a logical query expression.
829 
830  *query* is a string containing the logical expression that will be evaluated
831  for every row.
832 
833  Operands have to be the name of a column or an expression that can be parsed to
834  float, int, bool or string.
835  Valid operators are: and, or, !=, !, <=, >=, ==, =, <, >, +, -, *, /
836 
837  .. code-block:: python
838 
839  subtab = tab.Select('col_a>0.5 and (col_b=5 or col_c=5)')
840 
841  The selection query should be self-explanatory. Allowed parentheses are (), [] and {};
842  mismatched parentheses are detected and reported. Expressions like '3<=col_a>=col_b'
843  throw an error, since the evaluation order cannot be determined unambiguously.
844 
845  There are two special expressions:
846 
847  .. code-block:: python
848 
849  #selects rows, where 1.0<=col_a<=1.5
850  subtab = tab.Select('col_a=1.0:1.5')
851 
852  #selects rows, where col_a=1 or col_a=2 or col_a=3
853  subtab = tab.Select('col_a=1,2,3')
854 
855  Only columns of compatible types can be compared. If col_a is of type string and col_b is of type int,
856  the following expression would throw an error: 'col_a<col_b'
857 
858  """
859 
860  try:
861  from table_selector import TableSelector
862  except:
863  raise ImportError("Tried to import from the file table_selector.py, but could not find it!")
864 
865  selector=TableSelector(self.col_types, self.col_names, query)
866 
867  selected_tab=Table(list(self.col_names), list(self.col_types))
868 
869  for row in self.rows:
870  if selector.EvaluateRow(row):
871  selected_tab.AddRow(row)
872 
873  return selected_tab
874 
875 
876  @staticmethod
877  def _LoadOST(stream_or_filename):
878  fieldname_pattern=re.compile(r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
879  values_pattern=re.compile("([^\" ]+|\"[^\"]*\")+")
880  if not hasattr(stream_or_filename, 'read'):
881  stream=open(stream_or_filename, 'r')
882  else:
883  stream=stream_or_filename
884  header=False
885  num_lines=0
886  for line in stream:
887  line=line.strip()
888  if line.startswith('#'):
889  continue
890  if len(line)==0:
891  continue
892  num_lines+=1
893  if not header:
894  fieldnames=[]
895  fieldtypes=[]
896  for col in line.split():
897  match=fieldname_pattern.match(col)
898  if match:
899  if match.group('type'):
900  fieldtypes.append(match.group('type'))
901  else:
902  fieldtypes.append('string')
903  fieldnames.append(match.group('name'))
904  tab=Table(fieldnames, fieldtypes)
905  header=True
906  continue
907  tab.AddRow([x.strip('"') for x in values_pattern.findall(line)])
908  if num_lines==0:
909  raise IOError("Cannot read table from empty stream")
910  return tab
911 
912  def _GuessColumnTypes(self):
913  for col_idx in range(len(self.col_names)):
914  self.col_types[col_idx]=GuessColumnType(self[self.col_names[col_idx]])
915  for row in self.rows:
916  for idx in range(len(row)):
917  row[idx]=self._Coerce(row[idx], self.col_types[idx])
918 
919  @staticmethod
920  def _LoadCSV(stream_or_filename, sep):
921  if not hasattr(stream_or_filename, 'read'):
922  stream=open(stream_or_filename, 'r')
923  else:
924  stream=stream_or_filename
925  reader=csv.reader(stream, delimiter=sep)
926  first=True
927  for row in reader:
928  if first:
929  header=row
930  types='s'*len(row)
931  tab=Table(header, types)
932  first=False
933  else:
934  tab.AddRow(row)
935  if first:
936  raise IOError('trying to load table from empty CSV stream/file')
937 
938  tab._GuessColumnTypes()
939  return tab
940 
941  @staticmethod
942  def _LoadPickle(stream_or_filename):
943  if not hasattr(stream_or_filename, 'read'):
944  stream=open(stream_or_filename, 'rb')
945  else:
946  stream=stream_or_filename
947  return cPickle.load(stream)
948 
949  @staticmethod
950  def _GuessFormat(filename):
951  try:
952  filename = filename.name
953  except AttributeError, e:
954  pass
955  if filename.endswith('.csv'):
956  return 'csv'
957  elif filename.endswith('.pickle'):
958  return 'pickle'
959  else:
960  return 'ost'
961 
962 
963  @staticmethod
964  def Load(stream_or_filename, format='auto', sep=','):
965  """
966  Load table from stream or file with given name.
967 
968  By default, the file format is set to *auto*, which tries to guess the file
969  format from the file extension. The following file extensions are
970  recognized:
971 
972  ============ ======================
973  extension    recognized format
974  ============ ======================
975  .csv         comma separated values
976  .pickle      pickled byte stream
977  <all others> ost-specific format
978  ============ ======================
979 
980  Thus, *format* must be specified explicitly when reading files with other
981  filename extensions.
982 
983  The following file formats are understood:
984 
985  - ost
986 
987  This is an ost-specific, but still human readable file format. The file
988  (stream) must start with header line of the form
989 
990  col_name1[type1] <col_name2[type2]>...
991 
992  The types given in brackets must be one of the data types the
993  :class:`Table` class understands. Each following line in the file must then
994  contain exactly the same number of data items as listed in the header. The
995  data items are automatically converted to the column format. Lines starting
996  with a '#' and empty lines are ignored.
997 
998  - pickle
999 
1000  Deserializes the table from a pickled byte stream.
1001 
1002  - csv
1003 
1004  Reads the table from comma separated values stream. Since there is no
1005  explicit type information in the csv file, the column types are guessed,
1006  using the following simple rules:
1007 
1008  * if all values are either NA/NULL/NONE the type is set to string.
1009  * if all non-null values are convertible to float/int the type is set to
1010  float/int.
1011  * if all non-null values are true/false/yes/no, the type is set to bool.
1012  * for all other cases, the column type is set to string.
1013 
1014  :returns: A new :class:`Table` instance
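 A minimal usage sketch (the filenames are illustrative):

 .. code-block:: python

   # format guessed from the file extension
   tab = Table.Load('data.csv')
   # or force format and separator explicitly
   tab = Table.Load('data.txt', format='csv', sep='\t')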
1015  """
1016  format=format.lower()
1017  if format=='auto':
1018  format = Table._GuessFormat(stream_or_filename)
1019 
1020  if format=='ost':
1021  return Table._LoadOST(stream_or_filename)
1022  if format=='csv':
1023  return Table._LoadCSV(stream_or_filename, sep=sep)
1024  if format=='pickle':
1025  return Table._LoadPickle(stream_or_filename)
1026  raise ValueError('unknown format "%s"' % format)
1027 
1028  def Sort(self, by, order='+'):
1029  """
1030  Performs an in-place sort of the table, based on column *by*.
1031 
1032  :param by: column name by which to sort
1033  :type by: :class:`str`
1034 
1035  :param order: ascending (``-``) or descending (``+``) order
1036  :type order: :class:`str` (i.e. *+*, *-*)
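 A minimal sketch (values are illustrative):

 .. code-block:: python

   tab = Table(['x'], 'i', x=[3, 1, 2])
   tab.Sort('x', order='-')   # ascending: rows are now 1, 2, 3
   tab.Sort('x')              # default '+': descending, rows are now 3, 2, 1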
1037  """
1038  sign=-1
1039  if order=='-':
1040  sign=1
1041  key_index=self.GetColIndex(by)
1042  def _key_cmp(lhs, rhs):
1043  return sign*cmp(lhs[key_index], rhs[key_index])
1044  self.rows=sorted(self.rows, _key_cmp)
1045 
1046  def GetUnique(self, col, ignore_nan=True):
1047  """
1048  Extract a list of all unique values from one column.
1049 
1050  :param col: column name
1051  :type col: :class:`str`
1052 
1053  :param ignore_nan: ignore all *None* values
1054  :type ignore_nan: :class:`bool`
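 A minimal sketch (values are illustrative):

 .. code-block:: python

   tab = Table(['town'], 's', town=['Basel', 'Bern', 'Basel', None])
   print tab.GetUnique('town')                    # ['Basel', 'Bern']
   print tab.GetUnique('town', ignore_nan=False)  # ['Basel', 'Bern', None]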
1055  """
1056  idx = self.GetColIndex(col)
1057  seen = {}
1058  result = []
1059  for row in self.rows:
1060  item = row[idx]
1061  if item!=None or ignore_nan==False:
1062  if item in seen: continue
1063  seen[item] = 1
1064  result.append(item)
1065  return result
1066 
1067  def Zip(self, *args):
1068  """
1069  Allows convenient iteration over a selection of columns, e.g.
1070 
1071  .. code-block:: python
1072 
1073  tab = Table.Load('...')
1074  for col1, col2 in tab.Zip('col1', 'col2'):
1075  print col1, col2
1076 
1077  is a shortcut for
1078 
1079  .. code-block:: python
1080 
1081  tab = Table.Load('...')
1082  for col1, col2 in zip(tab['col1'], tab['col2']):
1083  print col1, col2
1084  """
1085  return zip(*[self[arg] for arg in args])
1086 
1087  def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
1088  z_title=None, x_range=None, y_range=None, z_range=None,
1089  color=None, plot_if=None, legend=None,
1090  num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False,
1091  labels=None, max_num_labels=None, title=None, clear=True, save=False,
1092  **kwargs):
1093  """
1094  Function to plot values from your table in 1, 2 or 3 dimensions using
1095  `Matplotlib <http://matplotlib.sourceforge.net>`__
1096 
1097  :param x: column name for first dimension
1098  :type x: :class:`str`
1099 
1100  :param y: column name for second dimension
1101  :type y: :class:`str`
1102 
1103  :param z: column name for third dimension
1104  :type z: :class:`str`
1105 
1106  :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\**). For a
1107  complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1108  :type style: :class:`str`
1109 
1110  :param x_title: title for first dimension, if not specified it is
1111  automatically derived from column name
1112  :type x_title: :class:`str`
1113 
1114  :param y_title: title for second dimension, if not specified it is
1115  automatically derived from column name
1116  :type y_title: :class:`str`
1117 
1118  :param z_title: title for third dimension, if not specified it is
1119  automatically derived from column name
1120  :type z_title: :class:`str`
1121 
1122  :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1123  :type x_range: :class:`list` of length two
1124 
1125  :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1126  :type y_range: :class:`list` of length two
1127 
1128  :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
1129  :type z_range: :class:`list` of length two
1130 
1131  :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
1132  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1133  :type color: :class:`str`
1134 
1135  :param plot_if: callable which returns *True* if the row should be plotted. It is
1136  invoked like ``plot_if(self, row)``
1137  :type plot_if: callable
1138 
1139  :param legend: legend label for data series
1140  :type legend: :class:`str`
1141 
1142  :param num_z_levels: number of levels for third dimension
1143  :type num_z_levels: :class:`int`
1144 
1145  :param diag_line: draw diagonal line
1146  :type diag_line: :class:`bool`
1147 
1148  :param labels: column name containing labels to put on x-axis for one
1149  dimensional plot
1150  :type labels: :class:`str`
1151 
1152  :param max_num_labels: limit maximum number of labels
1153  :type max_num_labels: :class:`int`
1154 
1155  :param title: plot title, if not specified it is automatically derived from
1156  plotted column names
1157  :type title: :class:`str`
1158 
1159  :param clear: clear old data from plot
1160  :type clear: :class:`bool`
1161 
1162  :param save: filename for saving plot
1163  :type save: :class:`str`
1164 
1165  :param z_contour: draw contour lines
1166  :type z_contour: :class:`bool`
1167 
1168  :param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
1169  'linear')
1170  :type z_interpol: :class:`str`
1171 
1172  :param \*\*kwargs: additional arguments passed to matplotlib
1173 
1174  :returns: the ``matplotlib.pyplot`` module
1175 
1176  **Examples:** simple plotting functions
1177 
1178  .. code-block:: python
1179 
1180  tab = Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
1181  b=[x/2.0 for x in range(1,6)],
1182  c=[math.cos(x) for x in range(0,5)],
1183  d=range(3,8))
1184 
1185  # one dimensional plot of column 'd' vs. index
1186  plt = tab.Plot('d')
1187  plt.show()
1188 
1189  # two dimensional plot of 'a' vs. 'c'
1190  plt = tab.Plot('a', y='c', style='o-')
1191  plt.show()
1192 
1193  # three dimensional plot of 'a' vs. 'c' with values 'b'
1194  plt = tab.Plot('a', y='c', z='b')
1195  # manually save plot to file
1196  plt.savefig("plot.png")
1197  """
1198  try:
1199  import matplotlib.pyplot as plt
1200  import matplotlib.mlab as mlab
1201  import numpy as np
1202  idx1 = self.GetColIndex(x)
1203  xs = []
1204  ys = []
1205  zs = []
1206 
1207  if clear:
1208  plt.figure(figsize=[8, 6])
1209 
1210  if x_title!=None:
1211  nice_x=x_title
1212  else:
1213  nice_x=MakeTitle(x)
1214 
1215  if y_title!=None:
1216  nice_y=y_title
1217  else:
1218  if y:
1219  nice_y=MakeTitle(y)
1220  else:
1221  nice_y=None
1222 
1223  if z_title!=None:
1224  nice_z = z_title
1225  else:
1226  if z:
1227  nice_z = MakeTitle(z)
1228  else:
1229  nice_z = None
1230 
1231  if x_range and (IsScalar(x_range) or len(x_range)!=2):
1232  raise ValueError('parameter x_range must contain exactly two elements')
1233  if y_range and (IsScalar(y_range) or len(y_range)!=2):
1234  raise ValueError('parameter y_range must contain exactly two elements')
1235  if z_range and (IsScalar(z_range) or len(z_range)!=2):
1236  raise ValueError('parameter z_range must contain exactly two elements')
1237 
1238  if color:
1239  kwargs['color']=color
1240  if legend:
1241  kwargs['label']=legend
1242  if y and z:
1243  idx3 = self.GetColIndex(z)
1244  idx2 = self.GetColIndex(y)
1245  for row in self.rows:
1246  if row[idx1]!=None and row[idx2]!=None and row[idx3]!=None:
1247  if plot_if and not plot_if(self, row):
1248  continue
1249  xs.append(row[idx1])
1250  ys.append(row[idx2])
1251  zs.append(row[idx3])
1252  levels = []
1253  if z_range:
1254  z_spacing = (z_range[1] - z_range[0]) / num_z_levels
1255  l = z_range[0]
1256  else:
1257  l = self.Min(z)
1258  z_spacing = (self.Max(z) - l) / num_z_levels
1259 
1260  for i in range(0,num_z_levels+1):
1261  levels.append(l)
1262  l += z_spacing
1263 
1264  xi = np.linspace(min(xs),max(xs),len(xs)*10)
1265  yi = np.linspace(min(ys),max(ys),len(ys)*10)
1266  zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)
1267 
1268  if z_contour:
1269  plt.contour(xi,yi,zi,levels,linewidths=0.5,colors='k')
1270 
1271  plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
1272  plt.colorbar(ticks=levels)
1273 
1274  elif y:
1275  idx2=self.GetColIndex(y)
1276  for row in self.rows:
1277  if row[idx1]!=None and row[idx2]!=None:
1278  if plot_if and not plot_if(self, row):
1279  continue
1280  xs.append(row[idx1])
1281  ys.append(row[idx2])
1282  plt.plot(xs, ys, style, **kwargs)
1283 
1284  else:
1285  label_vals=[]
1286 
1287  if labels:
1288  label_idx=self.GetColIndex(labels)
1289  for row in self.rows:
1290  if row[idx1]!=None:
1291  if plot_if and not plot_if(self, row):
1292  continue
1293  xs.append(row[idx1])
1294  if labels:
1295  label_vals.append(row[label_idx])
1296  plt.plot(xs, style, **kwargs)
1297  if labels:
1298  interval = 1
1299  if max_num_labels:
1300  if len(label_vals)>max_num_labels:
1301  interval = int(math.ceil(float(len(label_vals))/max_num_labels))
1302  label_vals = label_vals[::interval]
1303  plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
1304  size='x-small')
1305 
1306  if title==None:
1307  if nice_z:
1308  title = '%s of %s vs. %s' % (nice_z, nice_x, nice_y)
1309  elif nice_y:
1310  title = '%s vs. %s' % (nice_x, nice_y)
1311  else:
1312  title = nice_x
1313 
1314  plt.title(title, size='x-large', fontweight='bold',
1315  verticalalignment='bottom')
1316 
1317  if legend:
1318  plt.legend(loc=0)
1319 
1320  if x and y:
1321  plt.xlabel(nice_x, size='x-large')
1322  if x_range:
1323  plt.xlim(x_range[0], x_range[1])
1324  if y_range:
1325  plt.ylim(y_range[0], y_range[1])
1326  if diag_line:
1327  plt.plot(x_range, y_range, '-', color='black')
1328 
1329  plt.ylabel(nice_y, size='x-large')
1330  else:
1331  if y_range:
1332  plt.ylim(y_range[0], y_range[1])
1333  if x_title:
1334  plt.xlabel(x_title, size='x-large')
1335  plt.ylabel(nice_y, size='x-large')
1336  if save:
1337  plt.savefig(save)
1338  return plt
1339  except ImportError:
1340  LogError("Function needs numpy and matplotlib, but I could not import it.")
1341  raise
1342 
1343  def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
1344  histtype='stepfilled', align='mid', x_title=None,
1345  y_title=None, title=None, clear=True, save=False,
1346  color=None, y_range=None):
1347  """
1348  Create a histogram of the data in col for the range *x_range*, split into
1349  *num_bins* bins and plot it using Matplotlib.
1350 
1351  :param col: column name with data
1352  :type col: :class:`str`
1353 
1354  :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1355  :type x_range: :class:`list` of length two
1356 
1357  :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1358  :type y_range: :class:`list` of length two
1359 
1360  :param num_bins: number of bins in range
1361  :type num_bins: :class:`int`
1362 
1363  :param color: Color to be used for the histogram. If not set, color will be
1364  determined by matplotlib
1365  :type color: :class:`str`
1366 
1367  :param normed: normalize histogram
1368  :type normed: :class:`bool`
1369 
1370  :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
1371  *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1372  :type histtype: :class:`str`
1373 
1374  :param align: style of histogram (*left*, *mid*, *right*). See
1375  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1376  :type align: :class:`str`
1377 
1378  :param x_title: title for first dimension, if not specified it is
1379  automatically derived from column name
1380  :type x_title: :class:`str`
1381 
1382  :param y_title: title for second dimension, if not specified it is
1383  automatically derived from column name
1384  :type y_title: :class:`str`
1385 
1386  :param title: plot title, if not specified it is automatically derived from
1387  plotted column names
1388  :type title: :class:`str`
1389 
1390  :param clear: clear old data from plot
1391  :type clear: :class:`bool`
1392 
1393  :param save: filename for saving plot
1394  :type save: :class:`str`
1395 
1396  **Examples:** simple plotting functions
1397 
1398  .. code-block:: python
1399 
1400  tab = Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])
1401 
1402  # histogram of the values in column 'a'
1403  plt = tab.PlotHistogram('a')
1404  plt.show()
1405 
1406  """
1407  try:
1408  import matplotlib.pyplot as plt
1409  import numpy as np
1410 
1411  if len(self.rows)==0:
1412  return None
1413  kwargs={}
1414  if color:
1415  kwargs['color']=color
1416  idx = self.GetColIndex(col)
1417  data = []
1418  for r in self.rows:
1419  if r[idx]!=None:
1420  data.append(r[idx])
1421 
1422  if clear:
1423  plt.clf()
1424 
1425  n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
1426  normed=normed, histtype=histtype, align=align,
1427  **kwargs)
1428 
1429  if x_title!=None:
1430  nice_x=x_title
1431  else:
1432  nice_x=MakeTitle(col)
1433  plt.xlabel(nice_x, size='x-large')
1434  if y_range:
1435  plt.ylim(y_range)
1436  if y_title!=None:
1437  nice_y=y_title
1438  else:
1439  nice_y="bin count"
1440  plt.ylabel(nice_y, size='x-large')
1441 
1442  if title!=None:
1443  nice_title=title
1444  else:
1445  nice_title="Histogram of %s"%nice_x
1446  plt.title(nice_title, size='x-large', fontweight='bold')
1447 
1448  if save:
1449  plt.savefig(save)
1450  return plt
1451  except ImportError:
1452  LogError("Function needs numpy and matplotlib, but I could not import it.")
1453  raise
1454 
1455  def _Max(self, col):
1456  if len(self.rows)==0:
1457  return None, None
1458  idx = self.GetColIndex(col)
1459  col_type = self.col_types[idx]
1460  if col_type=='int' or col_type=='float':
1461  max_val = -float('inf')
1462  elif col_type=='bool':
1463  max_val = False
1464  elif col_type=='string':
1465  max_val = chr(0)
1466  max_idx = None
1467  for i in range(0, len(self.rows)):
1468  if self.rows[i][idx]>max_val:
1469  max_val = self.rows[i][idx]
1470  max_idx = i
1471  return max_val, max_idx
1472 
1473  def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True, xlabels_rotation='horizontal', y_title=None, title=None,
1474  colors=None, width=0.8, bottom=0, legend=False, legend_names=None, show=False, save=False):
1475 
1476  """
1477  Create a barplot of the data in cols. Every column will be represented
1478  at one position. If there are several rows, each column will be grouped
1479  together.
1480 
1481  :param cols: List of column names. Every column will be represented as a
1482  single bar. If cols is None, every column of the table gets
1483  plotted.
1484  :type cols: :class:`list`
1485 
1486  :param rows: List of row indices. Values from given rows will be plotted
1487  in parallel at one column position. If set to None, all rows
1488  of the table will be plotted. Note that the maximum number
1489  of rows is 7.
1490  :type rows: :class:`list`
1491 
1492  :param xlabels: Label for every col on x-axis. If set to None, the column
1493  names are used. The xlabel plotting can be suppressed by
1494  the parameter set_xlabels.
1495  :type xlabels: :class:`list`
1496 
1497  :param set_xlabels: Controls whether xlabels are plotted or not.
1498  :type set_xlabels: :class:`bool`
1499 
1500  :param xlabels_rotation: Can either be 'horizontal', 'vertical' or an
1501  integer that describes the rotation in degrees.
1502 
1503  :param y_title: Y-axis description
1504  :type y_title: :class:`str`
1505 
1506  :param title: Title of the plot. No title appears if set to None.
1507  :type title: :class:`str`
1508 
1509  :param colors: Colors of the different bars in each group. Must be a list
1510  of valid matplotlib colors. The lengths of colors and rows must
1511  be consistent.
1512  :type colors: :class:`list`
1513 
1514  :param width: The available space for the groups on the x-axis is divided
1515  by the exact number of groups. The parameter width is the
1516  fraction of this space that is actually used. If it were 1.0, the
1517  bars of neighbouring groups would touch each other.
1518  The value must lie in the range [0;1].
1519  :type width: :class:`float`
1520 
1521  :param bottom: Bottom
1522  :type bottom: :class:`float`
1523 
1524  :param legend: If set to True, a legend explaining which color corresponds
1525  to which row is drawn; legend_names must then be provided.
1526  :type legend: :class:`bool`
1527 
1528  :param legend_names: List of names that describe the differently colored
1529  bars. Length must be consistent with number of rows.
1530 
1531  :param show: If set to True, the plot is directly displayed.
1532 
1533  :param save: If set, a PNG image with the given filename is saved in the
1534  current working directory.
1535  :type save: :class:`str`
1536 
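 **Example:** a minimal sketch (column names and values are illustrative)

 .. code-block:: python

   tab = Table(['a', 'b', 'c'], 'fff',
               a=[1.0, 2.0], b=[0.5, 1.5], c=[2.0, 1.0])
   plt = tab.PlotBar(cols=['a', 'b', 'c'], legend=True,
                     legend_names=['row 0', 'row 1'])
   plt.show()
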
1537  """
1538  try:
1539  import numpy as np
1540  import matplotlib.pyplot as plt
1541  except:
1542  raise ImportError('PlotBar relies on numpy and matplotlib, but I could '\
1543  'not import it!')
1544 
1545  standard_colors=['b','g','y','c','m','r','k']
1546  data=[]
1547 
1548  if cols==None:
1549  cols=self.col_names
1550 
1551  if width<=0 or width>1:
1552  raise ValueError('Width must be in [0;1]')
1553 
1554  if rows==None:
1555  if len(self.rows)>7:
1556  raise ValueError('Table contains too many rows to represent them at one '\
1557  'bar position in parallel. You can Select a Subtable or '\
1558  'specify the parameter rows with a list of row indices '\
1559  '(max 7)')
1560  else:
1561  rows=range(len(self.rows))
1562  else:
1563  if not isinstance(rows,list):
1564  rows=[rows]
1565  if len(rows)>7:
1566  raise ValueError('Too many rows to represent (max 7). Please note, that '\
1567  'data from multiple rows from one column gets '\
1568  'represented at one position in parallel.')
1569 
1570  for r_idx in rows:
1571  row=self.rows[r_idx]
1572  temp=list()
1573  for c in cols:
1574  try:
1575  c_idx=self.GetColIndex(c)
1576  except:
1577  raise ValueError('Cannot find column with name '+str(c))
1578  temp.append(row[c_idx])
1579  data.append(temp)
1580 
1581  if colors==None:
1582  colors=standard_colors[:len(rows)]
1583 
1584  if len(rows)!=len(colors):
1585  raise ValueError("Number of rows and number of colors must be consistent!")
1586 
1587  ind=np.arange(len(data[0]))
1588  single_bar_width=float(width)/len(data)
1589 
1590  fig=plt.figure()
1591  ax=fig.add_subplot(111)
1592  legend_data=[]
1593 
1594  for i in range(len(data)):
1595  legend_data.append(ax.bar(ind+i*single_bar_width+(1-width)/2,data[i],single_bar_width,bottom=bottom,color=colors[i])[0])
1596 
1597  if title!=None:
1598  ax.set_title(title, size='x-large', fontweight='bold')
1599 
1600  if y_title!=None:
1601  nice_y=y_title
1602  else:
1603  nice_y="value"
1604  ax.set_ylabel(nice_y)
1605 
1606  if xlabels:
1607  if len(data[0])!=len(xlabels):
1608  raise ValueError('Number of xlabels is not consistent with number of cols!')
1609  else:
1610  xlabels=cols
1611 
1612  if set_xlabels:
1613  ax.set_xticks(ind+0.5)
1614  ax.set_xticklabels(xlabels, rotation = xlabels_rotation)
1615  else:
1616  ax.set_xticks([])
1617 
1618  if legend == True:
1619  if legend_names==None:
1620  raise ValueError('You must provide legend names! e.g. names for the rows, '\
1621  'that are printed in parallel.')
1622  if len(legend_names)!=len(data):
1623  raise ValueError('length of legend_names must be consistent with number '\
1624  'of plotted rows!')
1625  ax.legend(legend_data, legend_names)
1626 
1627  if save:
1628  plt.savefig(save)
1629 
1630  if show:
1631  plt.show()
1632 
1633  return plt
1634 
1635  def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log',
1636  colormap='jet', show_scalebar=False, scalebar_label=None, clear=True, save=False, show=False):
1637 
1638  """
1639  Create a heatplot of the data in col x vs the data in col y using matplotlib
1640 
1641  :param x: column name with x data
1642  :type x: :class:`str`
1643 
1644  :param y: column name with y data
1645  :type y: :class:`str`
1646 
1647  :param title: title of the plot, will be generated automatically if set to None
1648  :type title: :class:`str`
1649 
1650  :param x_title: label of x-axis, will be generated automatically if set to None
1651  :type x_title: :class:`str`
1652 
1653  :param y_title: label of y-axis, will be generated automatically if set to None
1654  :type y_title: :class:`str`
1655 
1656  :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1657  :type x_range: :class:`list` of length two
1658 
1659  :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1660  :type y_range: :class:`list` of length two
1661 
1662  :param binning: type of binning. If set to None, the value of a hexbin will
1663  correspond to the number of datapoints falling into it. If
1664  set to 'log', the value will be the log with base 10 of the above
1665  value (log(i+1)). If an integer is provided, the value of a
1666  hexbin is equal to the number of datapoints falling into it divided
1667  by the integer. If a list of values is provided, these values
1668  will be the lower bounds of the bins.
1669 
1670  :param colormap: colormap that will be used. The value can be any colormap
1671  defined in matplotlib or a custom colormap. You can either pass a
1672  string with the name of the matplotlib colormap or a colormap
1673  object.
1674 
1675  :param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
1676  :type show_scalebar: :class:`bool`
1677 
1678  :param scalebar_label: Label of the scalebar
1679  :type scalebar_label: :class:`str`
1680 
1681  :param clear: clear old data from plot
1682  :type clear: :class:`bool`
1683 
1684  :param save: filename for saving plot
1685  :type save: :class:`str`
1686 
1687  :param show: directly show plot
1688  :type show: :class:`bool`
1689 
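 **Example:** a minimal sketch (column names and values are illustrative)

 .. code-block:: python

   tab = Table(['x', 'y'], 'ff',
               x=[math.cos(i*0.01) for i in range(1000)],
               y=[math.sin(i*0.01) for i in range(1000)])
   plt = tab.PlotHexbin('x', 'y', binning='log', show_scalebar=True)
   plt.show()
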
1690  """
1691 
1692  try:
1693  import matplotlib.pyplot as plt
1694  import matplotlib.cm as cm
1695  except:
1696  raise ImportError('PlotHexbin relies on matplotlib, but I could not import it')
1697 
1698  idx=self.GetColIndex(x)
1699  idy=self.GetColIndex(y)
1700  xdata=[]
1701  ydata=[]
1702 
1703  for r in self.rows:
1704  if r[idx]!=None and r[idy]!=None:
1705  xdata.append(r[idx])
1706  ydata.append(r[idy])
1707 
1708  if clear:
1709  plt.clf()
1710 
1711  if x_title!=None:
1712  nice_x=x_title
1713  else:
1714  nice_x=MakeTitle(x)
1715 
1716  if y_title!=None:
1717  nice_y=y_title
1718  else:
1719  nice_y=MakeTitle(y)
1720 
1721  if title==None:
1722  title = '%s vs. %s' % (nice_x, nice_y)
1723 
1724  if IsStringLike(colormap):
1725  colormap=getattr(cm, colormap)
1726 
1727  if x_range and (IsScalar(x_range) or len(x_range)!=2):
1728  raise ValueError('parameter x_range must contain exactly two elements')
1729  if y_range and (IsScalar(y_range) or len(y_range)!=2):
1730  raise ValueError('parameter y_range must contain exactly two elements')
1731 
1732  ext = [min(xdata),max(xdata),min(ydata),max(ydata)]
1733 
1734  if x_range:
1735  plt.xlim((x_range[0], x_range[1]))
1736  ext[0]=x_range[0]
1737  ext[1]=x_range[1]
1738  if y_range:
1739  plt.ylim(y_range[0], y_range[1])
1740  ext[2]=y_range[0]
1741  ext[3]=y_range[1]
1742 
1743 
1744  plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)
1745 
1746  plt.title(title, size='x-large', fontweight='bold',
1747  verticalalignment='bottom')
1748 
1749  plt.xlabel(nice_x)
1750  plt.ylabel(nice_y)
1751 
1752  if show_scalebar:
1753  cb=plt.colorbar()
1754  if scalebar_label:
1755  cb.set_label(scalebar_label)
1756 
1757  if save:
1758  plt.savefig(save)
1759 
1760  if show:
1761  plt.show()
1762 
1763  return plt
1764 
1765  def MaxRow(self, col):
1766  """
1767  Returns the row containing the cell with the maximal value in col. If
1768  several rows have the highest value, only the first one is returned.
1769  ''None'' values are ignored.
1770 
1771  :param col: column name
1772  :type col: :class:`str`
1773 
1774  :returns: row with maximal col value or None if the table is empty
1775  """
1776  val, idx = self._Max(col)
1777  if idx!=None:
1778  return self.rows[idx]
1779 
1780  def Max(self, col):
1781  """
1782  Returns the maximum value in col. If several rows have the highest value,
1783  only the first one is returned. ''None'' values are ignored.
1784 
1785  :param col: column name
1786  :type col: :class:`str`
1787  """
1788  val, idx = self._Max(col)
1789  return val
1790 
1791  def MaxIdx(self, col):
1792  """
1793  Returns the row index of the cell with the maximal value in col. If
1794  several rows have the highest value, only the first one is returned.
1795  ''None'' values are ignored.
1796 
1797  :param col: column name
1798  :type col: :class:`str`
1799  """
1800  val, idx = self._Max(col)
1801  return idx
1802 
1803  def _Min(self, col):
1804  if len(self.rows)==0:
1805  return None, None
1806  idx=self.GetColIndex(col)
1807  col_type = self.col_types[idx]
1808  if col_type=='int' or col_type=='float':
1809  min_val=float('inf')
1810  elif col_type=='bool':
1811  min_val=True
1812  elif col_type=='string':
1813  min_val=chr(255)
1814  min_idx=None
1815  for i,row in enumerate(self.rows):
1816  if row[idx]!=None and row[idx]<min_val:
1817  min_val=row[idx]
1818  min_idx=i
1819  return min_val, min_idx
1820 
1821  def Min(self, col):
1822  """
1823  Returns the minimal value in col. If several rows have the lowest value,
1824  only the first one is returned. ''None'' values are ignored.
1825 
1826  :param col: column name
1827  :type col: :class:`str`
1828  """
1829  val, idx = self._Min(col)
1830  return val
1831 
1832  def MinRow(self, col):
1833  """
1834  Returns the row containing the cell with the minimal value in col. If
1835  several rows have the lowest value, only the first one is returned.
1836  ''None'' values are ignored.
1837 
1838  :param col: column name
1839  :type col: :class:`str`
1840 
1841  :returns: row with minimal col value or None if the table is empty
1842  """
1843  val, idx = self._Min(col)
1844  if idx!=None:
1845  return self.rows[idx]
1846 
1847  def MinIdx(self, col):
1848  """
1849  Returns the row index of the cell with the minimal value in col. If
1850  several rows have the lowest value, only the first one is returned.
1851  ''None'' values are ignored.
1852 
1853  :param col: column name
1854  :type col: :class:`str`
1855  """
1856  val, idx = self._Min(col)
1857  return idx
1858 
1859  def Sum(self, col):
1860  """
1861  Returns the sum of the given column. Cells with ''None'' are ignored. Returns
1862  0.0, if the column doesn't contain any elements. Col must be of numeric
1863  column type ('float', 'int') or boolean column type.
1864 
1865  :param col: column name
1866  :type col: :class:`str`
1867 
1868  :raises: :class:`TypeError` if column type is ``string``
1869  """
1870  idx = self.GetColIndex(col)
1871  col_type = self.col_types[idx]
1872  if col_type!='int' and col_type!='float' and col_type!='bool':
1873  raise TypeError("Sum can only be used on numeric column types")
1874  s = 0.0
1875  for r in self.rows:
1876  if r[idx]!=None:
1877  s += r[idx]
1878  return s
1879 
1880  def Mean(self, col):
1881  """
1882  Returns the mean of the given column. Cells with *None* are ignored. Returns
1883  None if the column doesn't contain any elements. Col must be of numeric
1884  ('float', 'int') or boolean column type.
1885 
1886  If the column type is *bool*, the function returns the ratio of the
1887  number of True values to the total number of elements.
1888 
1889  :param col: column name
1890  :type col: :class:`str`
1891 
1892  :raises: :class:`TypeError` if column type is ``string``
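
  A minimal usage sketch, assuming a table ``tab`` with a numeric column
  'score' and a bool column 'passed' (the names are illustrative):

  .. code-block:: python

    avg_score = tab.Mean('score')    # arithmetic mean, None cells ignored
    pass_rate = tab.Mean('passed')   # fraction of True values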
1893  """
1894  idx = self.GetColIndex(col)
1895  col_type = self.col_types[idx]
1896  if col_type!='int' and col_type!='float' and col_type!='bool':
1897  raise TypeError("Mean can only be used on numeric or bool column types")
1898 
1899  vals=[]
1900  for v in self[col]:
1901  if v!=None:
1902  vals.append(v)
1903  try:
1904  return stutil.Mean(vals)
1905  except:
1906  return None
1907 
1908  def RowMean(self, mean_col_name, cols):
1909  """
1910  Adds a new column of type 'float' with a specified name (*mean_col_name*),
1911  containing the mean of all specified columns for each row.
1912 
1913  Cols are specified by their names and must be of numeric column
1914  type ('float', 'int') or boolean column type. Cells with None are ignored.
1915  Adds *None* if the row doesn't contain any values.
1916 
1917  :param mean_col_name: name of new column containing mean values
1918  :type mean_col_name: :class:`str`
1919 
1920  :param cols: name or list of names of columns to include in computation of
1921  mean
1922  :type cols: :class:`str` or :class:`list` of strings
1923 
1924  :raises: :class:`TypeError` if column type of columns in *col* is ``string``
1925 
1926  **Example:**
1927 
1928  Starting with the following table:
1929 
1930  ==== ==== ====
1931  x y u
1932  ==== ==== ====
1933  1 10 100
1934  2 15 None
1935  3 20 400
1936  ==== ==== ====
1937 
1938  the code here adds a column with the name 'mean' to yield the table below:
1939 
1940  .. code-block:: python
1941 
1942  tab.RowMean('mean', ['x', 'u'])
1943 
1944 
1945  ==== ==== ==== =====
1946  x y u mean
1947  ==== ==== ==== =====
1948  1 10 100 50.5
1949  2 15 None 2
1950  3 20 400 201.5
1951  ==== ==== ==== =====
1952 
1953  """
1954 
1955  if IsScalar(cols):
1956  cols = [cols]
1957 
1958  cols_idxs = []
1959  for col in cols:
1960  idx = self.GetColIndex(col)
1961  col_type = self.col_types[idx]
1962  if col_type!='int' and col_type!='float' and col_type!='bool':
1963  raise TypeError("RowMean can only be used on numeric column types")
1964  cols_idxs.append(idx)
1965 
1966  mean_rows = []
1967  for row in self.rows:
1968  vals = []
1969  for idx in cols_idxs:
1970  v = row[idx]
1971  if v!=None:
1972  vals.append(v)
1973  try:
1974  mean = stutil.Mean(vals)
1975  mean_rows.append(mean)
1976  except:
1977  mean_rows.append(None)
1978 
1979  self.AddCol(mean_col_name, 'f', mean_rows)
1980 
1981  def Percentiles(self, col, nths):
1982  """
1983  Returns the percentiles of column *col* given in *nths*.
1984 
1985  The percentiles are calculated as
1986 
1987  .. code-block:: python
1988 
1989  values[min(len(values)-1, int(round(len(values)*p/100.0+0.5)-1))]
1990 
1991  where values are the sorted values of *col* not equal to *None*.

1992  :param nths: list of percentiles to be calculated. Each percentile is a number
1993  between 0 and 100.
1994 
1995  :raises: :class:`TypeError` if column type is ``string``
1996  :returns: List of percentiles in the same order as given in *nths*
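
  A minimal usage sketch, assuming a table ``tab`` with a numeric column
  'score' (the names are illustrative):

  .. code-block:: python

    quartiles = tab.Percentiles('score', [25, 50, 75])
    # list with the lower quartile, the median and the upper quartile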
1997  """
1998  idx = self.GetColIndex(col)
1999  col_type = self.col_types[idx]
2000  if col_type!='int' and col_type!='float' and col_type!='bool':
2001  raise TypeError("Percentiles can only be used on numeric or bool column types")
2002 
2003  for nth in nths:
2004  if nth < 0 or nth > 100:
2005  raise ValueError("percentiles must be between 0 and 100")
2006  vals=[]
2007  for v in self[col]:
2008  if v!=None:
2009  vals.append(v)
2010  vals=sorted(vals)
2011  if len(vals)==0:
2012  return [None]*len(nths)
2013  percentiles=[]
2014 
2015  for nth in nths:
2016  p=vals[min(len(vals)-1, int(round(len(vals)*nth/100.0+0.5)-1))]
2017  percentiles.append(p)
2018  return percentiles
2019 
2020  def Median(self, col):
2021  """
2022  Returns the median of the given column. Cells with *None* are ignored. Returns
2023  *None* if the column doesn't contain any elements. Col must be of numeric
2024  column type ('float', 'int') or boolean column type.
2025 
2026  :param col: column name
2027  :type col: :class:`str`
2028 
2029  :raises: :class:`TypeError` if column type is ``string``
2030  """
2031  idx = self.GetColIndex(col)
2032  col_type = self.col_types[idx]
2033  if col_type!='int' and col_type!='float' and col_type!='bool':
2034  raise TypeError("Median can only be used on numeric column types")
2035 
2036  vals=[]
2037  for v in self[col]:
2038  if v!=None:
2039  vals.append(v)
2041  try:
2042  return stutil.Median(vals)
2043  except:
2044  return None
2045 
2046  def StdDev(self, col):
2047  """
2048  Returns the standard deviation of the given column. Cells with *None* are
2049  ignored. Returns *None* if the column doesn't contain any elements. Col must
2050  be of numeric column type ('float', 'int') or boolean column type.
2051 
2052  :param col: column name
2053  :type col: :class:`str`
2054 
2055  :raises: :class:`TypeError` if column type is ``string``
2056  """
2057  idx = self.GetColIndex(col)
2058  col_type = self.col_types[idx]
2059  if col_type!='int' and col_type!='float' and col_type!='bool':
2060  raise TypeError("StdDev can only be used on numeric column types")
2061 
2062  vals=[]
2063  for v in self[col]:
2064  if v!=None:
2065  vals.append(v)
2066  try:
2067  return stutil.StdDev(vals)
2068  except:
2069  return None
2070 
2071  def Count(self, col, ignore_nan=True):
2072  """
2073  Count the number of cells in the column that are not equal to *None*.
2074 
2075  :param col: column name
2076  :type col: :class:`str`
2077 
2078  :param ignore_nan: ignore all *None* values
2079  :type ignore_nan: :class:`bool`
2080  """
2081  count=0
2082  idx=self.GetColIndex(col)
2083  for r in self.rows:
2084  if ignore_nan:
2085  if r[idx]!=None:
2086  count+=1
2087  else:
2088  count+=1
2089  return count
2090 
2091  def Correl(self, col1, col2):
2092  """
2093  Calculate the Pearson correlation coefficient between *col1* and *col2*, only
2094  taking rows into account where both of the values are not equal to *None*.
2095  If there are not enough data points to calculate a correlation coefficient,
2096  *None* is returned.
2097 
2098  :param col1: column name for first column
2099  :type col1: :class:`str`
2100 
2101  :param col2: column name for second column
2102  :type col2: :class:`str`
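
  A minimal usage sketch (the table ``tab`` and the column names are
  illustrative):

  .. code-block:: python

    r = tab.Correl('predicted', 'measured')  # Pearson r, or None if too few points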
2103  """
2104  if IsStringLike(col1) and IsStringLike(col2):
2105  col1 = self.GetColIndex(col1)
2106  col2 = self.GetColIndex(col2)
2107  vals1, vals2=([],[])
2108  for v1, v2 in zip(self[col1], self[col2]):
2109  if v1!=None and v2!=None:
2110  vals1.append(v1)
2111  vals2.append(v2)
2112  try:
2113  return stutil.Correl(vals1, vals2)
2114  except:
2115  return None
2116 
2117  def SpearmanCorrel(self, col1, col2):
2118  """
2119  Calculate the Spearman correlation coefficient between col1 and col2, only
2120  taking rows into account where both of the values are not equal to None. If
2121  there are not enough data points to calculate a correlation coefficient,
2122  None is returned.
2123 
2124  :warning: The function depends on the following module: *scipy.stats.mstats*
2125 
2126  :param col1: column name for first column
2127  :type col1: :class:`str`
2128 
2129  :param col2: column name for second column
2130  :type col2: :class:`str`
2131  """
2132  try:
2133  import scipy.stats.mstats
2134 
2135  if IsStringLike(col1) and IsStringLike(col2):
2136  col1 = self.GetColIndex(col1)
2137  col2 = self.GetColIndex(col2)
2138  vals1, vals2=([],[])
2139  for v1, v2 in zip(self[col1], self[col2]):
2140  if v1!=None and v2!=None:
2141  vals1.append(v1)
2142  vals2.append(v2)
2143  try:
2144  correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
2145  if scipy.isnan(correl):
2146  return None
2147  return correl
2148  except:
2149  return None
2150 
2151  except ImportError:
2152  LogError("Function needs scipy.stats.mstats, but I could not import it.")
2153  raise
2154 
2155 
2156  def Save(self, stream_or_filename, format='ost', sep=','):
2157  """
2158  Save the table to stream or filename. The following file formats
2159  are supported (for more information on file formats, see :meth:`Load`):
2160 
2161  ============= =======================================
2162  ost ost-specific format (human readable)
2163  csv comma separated values (human readable)
2164  pickle pickled byte stream (binary)
2165  html HTML table
2166  context ConTeXt table
2167  ============= =======================================
2168 
2169  :param stream_or_filename: filename or stream for writing output
2170  :type stream_or_filename: :class:`str` or :class:`file`
2171 
2172  :param format: output format (*ost*, *csv*, *pickle*, *html* or *context*)
2173  :type format: :class:`str`

  :param sep: column separator used for the *csv* format
  :type sep: :class:`str`
2174 
2175  :raises: :class:`ValueError` if format is unknown
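
  A minimal usage sketch (the file names are illustrative):

  .. code-block:: python

    tab.Save('scores.tab')                         # ost format (default)
    tab.Save('scores.csv', format='csv', sep=';')  # csv with ';' as separator
    tab.Save('scores.html', format='html')         # simple HTML table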
2176  """
2177  format=format.lower()
2178  if format=='ost':
2179  return self._SaveOST(stream_or_filename)
2180  if format=='csv':
2181  return self._SaveCSV(stream_or_filename, sep=sep)
2182  if format=='pickle':
2183  return self._SavePickle(stream_or_filename)
2184  if format=='html':
2185  return self._SaveHTML(stream_or_filename)
2186  if format=='context':
2187  return self._SaveContext(stream_or_filename)
2188  raise ValueError('unknown format "%s"' % format)
2189 
2190  def _SavePickle(self, stream):
2191  if not hasattr(stream, 'write'):
2192  stream=open(stream, 'wb')
2193  cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
2194 
2195  def _SaveHTML(self, stream_or_filename):
2196  def _escape(s):
2197  return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
2198 
2199  file_opened = False
2200  if not hasattr(stream_or_filename, 'write'):
2201  stream = open(stream_or_filename, 'w')
2202  file_opened = True
2203  else:
2204  stream = stream_or_filename
2205  stream.write('<table>')
2206  stream.write('<tr>')
2207  for col_name in self.col_names:
2208  stream.write('<th>%s</th>' % _escape(col_name))
2209  stream.write('</tr>')
2210  for row in self.rows:
2211  stream.write('<tr>')
2212  for i, col in enumerate(row):
2213  val = ''
2214  if col != None:
2215  if self.col_types[i] == 'float':
2216  val = '%.3f' % col
2217  elif self.col_types[i] == 'int':
2218  val = '%d' % col
2219  elif self.col_types[i] == 'bool':
2220  val = col and 'true' or 'false'
2221  else:
2222  val = str(col)
2223  stream.write('<td>%s</td>' % _escape(val))
2224  stream.write('</tr>')
2225  stream.write('</table>')
2226  if file_opened:
2227  stream.close()
2228  def _SaveContext(self, stream_or_filename):
2229  file_opened = False
2230  if not hasattr(stream_or_filename, 'write'):
2231  stream = open(stream_or_filename, 'w')
2232  file_opened = True
2233  else:
2234  stream = stream_or_filename
2235  stream.write('\\starttable[')
2236  for col_type in self.col_types:
2237  if col_type =='string':
2238  stream.write('l|')
2239  elif col_type=='int':
2240  stream.write('r|')
2241  elif col_type =='float':
2242  stream.write('i3r|')
2243  else:
2244  stream.write('l|')
2245  stream.write(']\n\\HL\n')
2246  for col_name in self.col_names:
2247  stream.write('\\NC \\bf %s' % col_name)
2248  stream.write(' \\AR\\HL\n')
2249  for row in self.rows:
2250  for i, col in enumerate(row):
2251  val = '---'
2252  if col != None:
2253  if self.col_types[i] == 'float':
2254  val = '%.3f' % col
2255  elif self.col_types[i] == 'int':
2256  val = '%d' % col
2257  elif self.col_types[i] == 'bool':
2258  val = col and 'true' or 'false'
2259  else:
2260  val = str(col)
2261  stream.write('\\NC %s' % val)
2262  stream.write(' \\AR\n')
2263  stream.write('\\HL\n')
2264  stream.write('\\stoptable')
2265  if file_opened:
2266  stream.close()
2267 
2268  def _SaveCSV(self, stream, sep):
2269  if not hasattr(stream, 'write'):
2270  stream=open(stream, 'wb')
2271 
2272  writer=csv.writer(stream, delimiter=sep)
2273  writer.writerow(['%s' % n for n in self.col_names])
2274  for row in self.rows:
2275  row=list(row)
2276  for i, c in enumerate(row):
2277  if c==None:
2278  row[i]='NA'
2279  writer.writerow(row)
2280 
2281  def _SaveOST(self, stream):
2282  if hasattr(stream, 'write'):
2283  writer=csv.writer(stream, delimiter=' ')
2284  else:
2285  stream=open(stream, 'w')
2286  writer=csv.writer(stream, delimiter=' ')
2287  if self.comment:
2288  stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
2289  writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
2290  for row in self.rows:
2291  row=list(row)
2292  for i, c in enumerate(row):
2293  if c==None:
2294  row[i]='NA'
2295  writer.writerow(row)
2296 
2297 
2298  def GetNumpyMatrix(self, *args):
2299  '''
2300  Returns a numpy matrix containing the selected columns from the table as
2301  columns in the matrix.
2302 
2303  Only columns of type *int* or *float* are supported. *NA* values in the
2304  table will be converted to *None* values.
2305 
2306  :param \*args: column names to include in numpy matrix
2307 
2308  :warning: The function depends on *numpy*
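
  A minimal usage sketch, assuming a table ``tab`` with numeric columns 'x'
  and 'y' (the names are illustrative):

  .. code-block:: python

    m = tab.GetNumpyMatrix('x', 'y')
    # m holds one matrix column per selected table column,
    # i.e. its shape is (number of rows, 2)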
2309  '''
2310  try:
2311  import numpy as np
2312 
2313  if len(args)==0:
2314  raise RuntimeError("At least one column must be specified.")
2315 
2316  idxs = []
2317  for arg in args:
2318  idx = self.GetColIndex(arg)
2319  col_type = self.col_types[idx]
2320  if col_type!='int' and col_type!='float':
2321  raise TypeError("Numpy matrix can only be generated from numeric column types")
2322  idxs.append(idx)
2323  m = np.matrix([list(self[i]) for i in idxs])
2324  return m.T
2325 
2326  except ImportError:
2327  LogError("Function needs numpy, but I could not import it.")
2328  raise
2329 
2330 
2331 
2332  def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
2333 
2334  '''
2335  In-place Gaussian smoothing of a column in the table with a given standard deviation.
2336  All *None* values are set to *na_value* before smoothing.
2337 
2338  :param col: column name
2339  :type col: :class:`str`
2340 
2341  :param std: standard deviation for gaussian kernel
2342  :type std: `scalar`
2343 
2344  :param na_value: all NA (*None*) values of the specified column are set to *na_value* before smoothing
2345  :type na_value: `scalar`
2346 
2347  :param padding: padding mode handed to scipy's ``ndimage.gaussian_filter1d`` (see its documentation for details); the default is 'reflect'
2348  :type padding: :class:`str`
2349 
2350  :param c: constant value used for padding if padding mode is constant
2351  :type c: `scalar`
2352 
2353 
2354 
2355  :warning: The function depends on *scipy*
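
  A minimal usage sketch (the table ``tab`` and the column name 'signal' are
  illustrative):

  .. code-block:: python

    # smooth column 'signal' in place with a kernel of std 2.0, treating
    # missing values as 0.0 and padding with the constant value 0.0
    tab.GaussianSmooth('signal', std=2.0, na_value=0.0,
                       padding='constant', c=0.0)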
2356  '''
2357 
2358  try:
2359  from scipy import ndimage
2360  import numpy as np
2361  except ImportError:
2362  LogError("I need scipy.ndimage and numpy, but could not import them")
2363  raise
2364 
2365  idx = self.GetColIndex(col)
2366  col_type = self.col_types[idx]
2367  if col_type!='int' and col_type!='float':
2368  raise TypeError("GaussianSmooth can only be used on numeric column types")
2369 
2370  vals=[]
2371  for v in self[col]:
2372  if v!=None:
2373  vals.append(v)
2374  else:
2375  vals.append(na_value)
2376 
2377 
2378  smoothed_values_ndarray=ndimage.gaussian_filter1d(vals,std, mode=padding, cval=c)
2379 
2380  result=[]
2381 
2382  for v in smoothed_values_ndarray:
2383  result.append(v)
2384 
2385  self[col]=result
2386 
2387 
2388  def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
2389  '''
2390  This returns the optimal prefactor values (i.e. a, b, c, ...) for the
2391  following equation
2392 
2393  .. math::
2394  :label: op1
2395 
2396  a*u + b*v + c*w + ... = z
2397 
2398  where u, v, w and z are vectors. In matrix notation
2399 
2400  .. math::
2401  :label: op2
2402 
2403  A*p = z
2404 
2405  where A contains the data from the table (u,v,w,...), p are the prefactors
2406  to optimize (a,b,c,...) and z is the vector containing the result of
2407  equation :eq:`op1`.
2408 
2409  The parameter *ref_col* corresponds to z in both equations, and \*args are the
2410  columns u, v and w (or A in :eq:`op2`). All columns must be specified by their names.
2411 
2412  **Example:**
2413 
2414  .. code-block:: python
2415 
2416  tab.GetOptimalPrefactors('colC', 'colA', 'colB')
2417 
2418  The function returns a list containing the prefactors a, b, c, ... in
2419  the same order as the columns were specified in \*args.
2420 
2421  Weighting:
2422  If the kwarg weights="colX" is specified, the equations are weighted by
2423  the values in that column. Each row is multiplied by the weight in that row,
2424  which leads to :eq:`op3`:
2425 
2426  .. math::
2427  :label: op3
2428 
2429  weight*a*u + weight*b*v + weight*c*w + ... = weight*z
2430 
2431  Weights must be float or int and can have any value. A value of 0 ignores
2432  this equation, a value of 1 means the same as no weight. If all weights are
2433  the same for each row, the same result will be obtained as with no weights.
2434 
2435  **Example:**
2436 
2437  .. code-block:: python
2438 
2439  tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
2440 
2441  '''
2442  try:
2443  import numpy as np
2444 
2445  if len(args)==0:
2446  raise RuntimeError("At least one column must be specified.")
2447 
2448  b = self.GetNumpyMatrix(ref_col)
2449  a = self.GetNumpyMatrix(*args)
2450 
2451  if len(kwargs)!=0:
2452  if kwargs.has_key('weights'):
2453  w = self.GetNumpyMatrix(kwargs['weights'])
2454  b = np.multiply(b,w)
2455  a = np.multiply(a,w)
2456 
2457  else:
2458  raise RuntimeError("specified unrecognized kwargs, use weights as key")
2459 
2460  k = (a.T*a).I*a.T*b
2461  return list(np.array(k.T).reshape(-1))
2462 
2463  except ImportError:
2464  LogError("Function needs numpy, but I could not import it.")
2465  raise
2466 
2467  def PlotEnrichment(self, score_col, class_col, score_dir='-',
2468  class_dir='-', class_cutoff=2.0,
2469  style='-', title=None, x_title=None, y_title=None,
2470  clear=True, save=None):
2471  '''
2472  Plot an enrichment curve using matplotlib of column *score_col* classified
2473  according to *class_col*.
2474 
2475  For more information about parameters of the enrichment, see
2476  :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.
2477 
2478  :warning: The function depends on *matplotlib*
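
  A minimal usage sketch (the table ``tab``, the column names and the file
  name are illustrative):

  .. code-block:: python

    # 'active' is a bool column marking the positives
    tab.PlotEnrichment('score', 'active', save='enrichment.png')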
2479  '''
2480  try:
2481  import matplotlib.pyplot as plt
2482 
2483  enrx, enry = self.ComputeEnrichment(score_col, class_col, score_dir,
2484  class_dir, class_cutoff)
2485 
2486  if not title:
2487  title = 'Enrichment of %s'%score_col
2488 
2489  if not x_title:
2490  x_title = '% database'
2491 
2492  if not y_title:
2493  y_title = '% positives'
2494 
2495  if clear:
2496  plt.clf()
2497 
2498  plt.plot(enrx, enry, style)
2499 
2500  plt.title(title, size='x-large', fontweight='bold')
2501  plt.ylabel(y_title, size='x-large')
2502  plt.xlabel(x_title, size='x-large')
2503 
2504  if save:
2505  plt.savefig(save)
2506 
2507  return plt
2508  except ImportError:
2509  LogError("Function needs matplotlib, but I could not import it.")
2510  raise
2511 
2512  def ComputeEnrichment(self, score_col, class_col, score_dir='-',
2513  class_dir='-', class_cutoff=2.0):
2514  '''
2515  Computes the enrichment of column *score_col* classified according to
2516  *class_col*.
2517 
2518  For this it is necessary, that the datapoints are classified into positive
2519  and negative points. This can be done in two ways:
2520 
2521  - by using one 'bool' type column (*class_col*) which contains *True* for
2522  positives and *False* for negatives
2523 
2524  - by specifying a classification column (*class_col*), a cutoff value
2525  (*class_cutoff*) and the classification columns direction (*class_dir*).
2526  This will generate the classification on the fly
2527 
2528  * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
2529  * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives
2530 
2531  During the calculation, the table will be sorted according to *score_dir*,
2532  where '-' means smallest values first (i.e. the smaller the value, the
2533  better).
2534 
2535  :warning: If either the value of *class_col* or *score_col* is *None*, the
2536  data in this row is ignored.
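
  A minimal usage sketch (the table ``tab`` and the column names are
  illustrative):

  .. code-block:: python

    # 'active' is a bool column marking the positives; the result may be
    # None if no valid data points or no positives are found
    enrichment = tab.ComputeEnrichment('score', 'active')
    if enrichment:
      frac_db, frac_pos = enrichment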
2537  '''
2538 
2539  ALLOWED_DIR = ['+','-']
2540 
2541  score_idx = self.GetColIndex(score_col)
2542  score_type = self.col_types[score_idx]
2543  if score_type!='int' and score_type!='float':
2544  raise TypeError("Score column must be numeric type")
2545 
2546  class_idx = self.GetColIndex(class_col)
2547  class_type = self.col_types[class_idx]
2548  if class_type!='int' and class_type!='float' and class_type!='bool':
2549  raise TypeError("Classifier column must be numeric or bool type")
2550 
2551  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2552  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2553 
2554  self.Sort(score_col, score_dir)
2555 
2556  x = [0]
2557  y = [0]
2558  enr = 0
2559  old_score_val = None
2560  i = 0
2561 
2562  for row in self.rows:
2563  class_val = row[class_idx]
2564  score_val = row[score_idx]
2565  if class_val==None or score_val==None:
2566  continue
2567  if class_val!=None:
2568  if old_score_val==None:
2569  old_score_val = score_val
2570  if score_val!=old_score_val:
2571  x.append(i)
2572  y.append(enr)
2573  old_score_val = score_val
2574  i+=1
2575  if class_type=='bool':
2576  if class_val==True:
2577  enr += 1
2578  else:
2579  if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2580  enr += 1
2581  x.append(i)
2582  y.append(enr)
2583 
2584  # if no valid data points or no positives are found, return None
2585  if x[-1]==0 or y[-1]==0:
2586  return None
2587 
2588  x = [float(v)/x[-1] for v in x]
2589  y = [float(v)/y[-1] for v in y]
2590  return x,y
2591 
2592  def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
2593  class_dir='-', class_cutoff=2.0):
2594  '''
2595  Computes the area under the curve of the enrichment using the trapezoidal
2596  rule.
2597 
2598  For more information about parameters of the enrichment, see
2599  :meth:`ComputeEnrichment`.
2600 
2601  :warning: The function depends on *numpy*
2602  '''
2603  try:
2604  import numpy as np
2605 
2606  enr = self.ComputeEnrichment(score_col, class_col, score_dir,
2607  class_dir, class_cutoff)
2608 
2609  if enr==None:
2610  return None
2611  return np.trapz(enr[1], enr[0])
2612  except ImportError:
2613  LogError("Function needs numpy, but I could not import it.")
2614  raise
2615 
2616  def ComputeROC(self, score_col, class_col, score_dir='-',
2617  class_dir='-', class_cutoff=2.0):
2618  '''
2619  Computes the receiver operating characteristics (ROC) of column *score_col*
2620  classified according to *class_col*.
2621 
2622  For this it is necessary, that the datapoints are classified into positive
2623  and negative points. This can be done in two ways:
2624 
2625  - by using one 'bool' column (*class_col*) which contains True for positives
2626  and False for negatives
2627  - by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
2628  and the classification columns direction (*class_dir*). This will generate
2629  the classification on the fly
2630 
2631  - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
2632  - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives
2633 
2634  During the calculation, the table will be sorted according to *score_dir*,
2635  where '-' means smallest values first (i.e. the smaller the value, the
2636  better).
2637 
2638  If *class_col* does not contain any positives (i.e. no values are True for
2639  bool columns, or no values pass the *class_dir*/*class_cutoff* test for int
2640  or float columns), the ROC is not defined and the function will return
2641  *None*.
2642 
2643  :warning: If either the value of *class_col* or *score_col* is *None*, the
2644  data in this row is ignored.
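
  A minimal usage sketch (the table ``tab`` and the column names are
  illustrative):

  .. code-block:: python

    # classify on the fly: rows with rmsd <= 2.0 count as positives
    roc = tab.ComputeROC('score', 'rmsd', class_dir='-', class_cutoff=2.0)
    if roc:
      fpr, tpr = roc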
2645  '''
2646 
2647  ALLOWED_DIR = ['+','-']
2648 
2649  score_idx = self.GetColIndex(score_col)
2650  score_type = self.col_types[score_idx]
2651  if score_type!='int' and score_type!='float':
2652  raise TypeError("Score column must be numeric type")
2653 
2654  class_idx = self.GetColIndex(class_col)
2655  class_type = self.col_types[class_idx]
2656  if class_type!='int' and class_type!='float' and class_type!='bool':
2657  raise TypeError("Classifier column must be numeric or bool type")
2658 
2659  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2660  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2661 
2662  self.Sort(score_col, score_dir)
2663 
2664  x = [0]
2665  y = [0]
2666  tp = 0
2667  fp = 0
2668  old_score_val = None
2669 
2670  for i,row in enumerate(self.rows):
2671  class_val = row[class_idx]
2672  score_val = row[score_idx]
2673  if class_val==None or score_val==None:
2674  continue
2675  if class_val!=None:
2676  if old_score_val==None:
2677  old_score_val = score_val
2678  if score_val!=old_score_val:
2679  x.append(fp)
2680  y.append(tp)
2681  old_score_val = score_val
2682  if class_type=='bool':
2683  if class_val==True:
2684  tp += 1
2685  else:
2686  fp += 1
2687  else:
2688  if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2689  tp += 1
2690  else:
2691  fp += 1
2692  x.append(fp)
2693  y.append(tp)
2694 
2695  # if no false positives or no true positives are found, return None
2696  if x[-1]==0 or y[-1]==0:
2697  return None
2698 
2699  x = [float(v)/x[-1] for v in x]
2700  y = [float(v)/y[-1] for v in y]
2701  return x,y
2702 
2703  def ComputeROCAUC(self, score_col, class_col, score_dir='-',
2704  class_dir='-', class_cutoff=2.0):
2705  '''
2706  Computes the area under the curve of the receiver operating characteristics
2707  using the trapezoidal rule.
2708 
2709  For more information about parameters of the ROC, see
2710  :meth:`ComputeROC`.
2711 
2712  :warning: The function depends on *numpy*
2713  '''
2714  try:
2715  import numpy as np
2716 
2717  roc = self.ComputeROC(score_col, class_col, score_dir,
2718  class_dir, class_cutoff)
2719 
2720  if not roc:
2721  return None
2722  return np.trapz(roc[1], roc[0])
2723  except ImportError:
2724  LogError("Function needs numpy, but I could not import it.")
2725  raise
2726 
2727  def ComputeLogROCAUC(self, score_col, class_col, score_dir='-',
2728  class_dir='-', class_cutoff=2.0):
2729  '''
2730  Computes the area under the curve of the log receiver operating
2731  characteristics (logROC) where the x-axis is semilogarithmic
2732  using the trapezoidal rule.
2733 
2734  The logROC is computed with a lambda of 0.001 according to
2735  Rapid Context-Dependent Ligand Desolvation in Molecular Docking,
2736  Mysinger M. and Shoichet B., Journal of Chemical Information and Modeling
2737  2010, 50 (9), 1561-1573.
2738 
2739  For more information about parameters of the ROC, see
2740  :meth:`ComputeROC`.
2741 
2742  :warning: The function depends on *numpy*
2743  '''
2744  try:
2745  import numpy as np
2746 
2747  roc = self.ComputeROC(score_col, class_col, score_dir,
2748  class_dir, class_cutoff)
2749 
2750  if not roc:
2751  return None
2752 
2753  rocxt, rocyt = roc
2754  rocx=[]
2755  rocy=[]
2756 
2757  # define lambda
2758  l=0.001
2759 
2760  # clamp zero x-values to lambda, then remove all duplicate x-values
2761  rocxt = [x if x>0 else l for x in rocxt]
2762  for i in range(len(rocxt)-1):
2763  if rocxt[i]==rocxt[i+1]:
2764  continue
2765  rocx.append(rocxt[i])
2766  rocy.append(rocyt[i])
2767  rocx.append(1.0)
2768  rocy.append(1.0)
2769 
2770  # compute logauc
2771  value = 0
2772  for i in range(len(rocx)-1):
2773  x = rocx[i]
2774  if rocx[i]==rocx[i+1]:
2775  continue
2776  b = rocy[i+1]-rocx[i+1]*((rocy[i+1]-rocy[i])/(rocx[i+1]-rocx[i]))
2777  value += ((rocy[i+1]-rocy[i])/math.log(10))+b*(math.log10(rocx[i+1])-math.log10(rocx[i]))
2778  return value/math.log10(1.0/l)
2779 
2780  except ImportError:
2781  LogError("Function needs numpy, but I could not import it.")
2782  raise
2783 
2784  def PlotROC(self, score_col, class_col, score_dir='-',
2785  class_dir='-', class_cutoff=2.0,
2786  style='-', title=None, x_title=None, y_title=None,
2787  clear=True, save=None):
2788  '''
2789  Plot an ROC curve using matplotlib.
2790 
2791  For more information about parameters of the ROC, see
2792  :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2793 
2794  :warning: The function depends on *matplotlib*
2795  '''
2796 
2797  try:
2798  import matplotlib.pyplot as plt
2799 
2800  roc = self.ComputeROC(score_col, class_col, score_dir,
2801  class_dir, class_cutoff)
2802 
2803  if not roc:
2804  return None
2805 
2806  enrx, enry = roc
2807 
2808  if not title:
2809  title = 'ROC of %s'%score_col
2810 
2811  if not x_title:
2812  x_title = 'false positive rate'
2813 
2814  if not y_title:
2815  y_title = 'true positive rate'
2816 
2817  if clear:
2818  plt.clf()
2819 
2820  plt.plot(enrx, enry, style)
2821 
2822  plt.title(title, size='x-large', fontweight='bold')
2823  plt.ylabel(y_title, size='x-large')
2824  plt.xlabel(x_title, size='x-large')
2825 
2826  if save:
2827  plt.savefig(save)
2828 
2829  return plt
2830  except ImportError:
2831  LogError("Function needs matplotlib, but I could not import it.")
2832  raise
2833 
2834  def PlotLogROC(self, score_col, class_col, score_dir='-',
2835  class_dir='-', class_cutoff=2.0,
2836  style='-', title=None, x_title=None, y_title=None,
2837  clear=True, save=None):
2838  '''
2839  Plot a logROC curve with a semilogarithmic x-axis using matplotlib.
2840 
2841  For more information about parameters of the ROC, see
2842  :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2843 
2844  :warning: The function depends on *matplotlib*
2845  '''
2846 
2847  try:
2848  import matplotlib.pyplot as plt
2849 
2850  roc = self.ComputeROC(score_col, class_col, score_dir,
2851  class_dir, class_cutoff)
2852 
2853  if not roc:
2854  return None
2855 
2856  rocx, rocy = roc
2857 
2858  if not title:
2859  title = 'logROC of %s'%score_col
2860 
2861  if not x_title:
2862  x_title = 'false positive rate'
2863 
2864  if not y_title:
2865  y_title = 'true positive rate'
2866 
2867  if clear:
2868  plt.clf()
2869 
2870  rocx = [x if x>0 else 0.001 for x in rocx]
2871 
2872 
2873  plt.plot(rocx, rocy, style)
2874 
2875  plt.title(title, size='x-large', fontweight='bold')
2876  plt.ylabel(y_title, size='x-large')
2877  plt.xlabel(x_title, size='x-large')
2878 
2879  plt.xscale('log', basex=10)
2880  plt.xlim(0.001, 1.0)
2881 
2882 
2883  if save:
2884  plt.savefig(save)
2885 
2886  return plt
2887  except ImportError:
2888  LogError("Function needs matplotlib, but I could not import it.")
2889  raise
2890 
2891  def ComputeMCC(self, score_col, class_col, score_dir='-',
2892  class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
2893  '''
2894  Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
2895  with the points classified into true positives, false positives, true
2896  negatives and false negatives according to a specified classification
2897  column (*class_col*).
2898 
2899  The datapoints in *score_col* and *class_col* are classified into
2900  positive and negative points. This can be done in two ways:
2901 
2902  - by using 'bool' columns which contain True for positives and False
2903  for negatives
2904 
2905  - by using 'float' or 'int' columns and specifying a cutoff value and the
2906  columns direction. This will generate the classification on the fly
2907 
2908  * if ``class_dir``/``score_dir=='-'``: values in the corresponding column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
2909  * if ``class_dir``/``score_dir=='+'``: values in the corresponding column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
2910 
2911  The two possibilities can be used together, i.e. 'bool' type for one column
2912  and 'float'/'int' type and cutoff/direction for the other column.
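
  A minimal usage sketch (the table ``tab`` and the column names are
  illustrative):

  .. code-block:: python

    # 'active' is a bool column; scores <= 2.0 are predicted positives
    mcc = tab.ComputeMCC('score', 'active', score_dir='-', score_cutoff=2.0)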
2913  '''
2914  ALLOWED_DIR = ['+','-']
2915 
2916  score_idx = self.GetColIndex(score_col)
2917  score_type = self.col_types[score_idx]
2918  if score_type!='int' and score_type!='float' and score_type!='bool':
2919  raise TypeError("Score column must be numeric or bool type")
2920 
2921  class_idx = self.GetColIndex(class_col)
2922  class_type = self.col_types[class_idx]
2923  if class_type!='int' and class_type!='float' and class_type!='bool':
2924  raise TypeError("Classifier column must be numeric or bool type")
2925 
2926  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2927  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2928 
2929  tp = 0
2930  fp = 0
2931  fn = 0
2932  tn = 0
2933 
2934  for i,row in enumerate(self.rows):
2935  class_val = row[class_idx]
2936  score_val = row[score_idx]
2937  if class_val!=None:
2938  if (class_type=='bool' and class_val==True) or (class_type!='bool' and ((class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff))):
2939  if (score_type=='bool' and score_val==True) or (score_type!='bool' and ((score_dir=='-' and score_val<=score_cutoff) or (score_dir=='+' and score_val>=score_cutoff))):
2940  tp += 1
2941  else:
2942  fn += 1
2943  else:
2944  if (score_type=='bool' and score_val==False) or (score_type!='bool' and ((score_dir=='-' and score_val>score_cutoff) or (score_dir=='+' and score_val<score_cutoff))):
2945  tn += 1
2946  else:
2947  fp += 1
2948 
2949  mcc = None
2950  msg = None
2951  if (tp+fn)==0:
2952  msg = 'factor (tp + fn) is zero'
2953  elif (tp+fp)==0:
2954  msg = 'factor (tp + fp) is zero'
2955  elif (tn+fn)==0:
2956  msg = 'factor (tn + fn) is zero'
2957  elif (tn+fp)==0:
2958  msg = 'factor (tn + fp) is zero'
2959 
2960  if msg:
2961  LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
2962  else:
2963  mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
2964  return mcc
2965 
2966 
2967  def IsEmpty(self, col_name=None, ignore_nan=True):
2968  '''
2969  Checks if a table is empty.
2970 
2971  If no column name is specified, the whole table is checked for being empty,
2972  whereas if a column name is specified, only this column is checked.
2973 
2974  By default, all NAN (or None) values are ignored, and thus a table
2975  containing only NAN values is considered empty. By specifying the
2976  option ignore_nan=False, NAN values are counted as 'normal' values.
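
  A minimal usage sketch (the table ``tab`` and the column name are
  illustrative):

  .. code-block:: python

    tab.IsEmpty()                           # whole table, None cells ignored
    tab.IsEmpty('score')                    # only column 'score'
    tab.IsEmpty('score', ignore_nan=False)  # None cells count as values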
2977  '''
2978 
2979  # table with no columns and no rows
2980  if len(self.col_names)==0:
2981  if col_name:
2982  raise ValueError('Table has no column named "%s"' % col_name)
2983  return True
2984 
2985  # column name specified
2986  if col_name:
2987  if self.Count(col_name, ignore_nan=ignore_nan)==0:
2988  return True
2989  else:
2990  return False
2991 
2992  # no column name specified -> test whole table
2993  else:
2994  for row in self.rows:
2995  for cell in row:
2996  if ignore_nan:
2997  if cell!=None:
2998  return False
2999  else:
3000  return False
3001  return True
3002 
3003 
3004  def Extend(self, tab, overwrite=None):
3005  """
3006  Append each row of *tab* to the current table. The data is appended based
3007  on the column names, thus the order of the table columns is *not* relevant,
3008  only the header names.
3009 
3010  If there is a column in *tab* that is not present in the current table,
3011  it is added to the current table and filled with *None* for all the rows
3012  present in the current table.
3013 
3014  If the type of any column in *tab* is not the same as in the current table
3015  a *TypeError* is raised.
3016 
3017  If *overwrite* is not None and set to an existing column name, the specified
3018  column in the table is searched for the first occurrence of a value matching
3019  the value of the same column in the row to be added. If a matching value is
3020  found, that row is overwritten with the new row. If no matching row is
3021  found, a new row is appended to the table.
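
  A minimal usage sketch (the tables and the column name 'id' are
  illustrative):

  .. code-block:: python

    # append all rows of tab2; rows whose 'id' value already exists in
    # tab1 overwrite the matching row instead of being appended
    tab1.Extend(tab2, overwrite='id')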
3022  """
3023  # add column to current table if it doesn't exist
3024  for name,typ in zip(tab.col_names, tab.col_types):
3025  if not name in self.col_names:
3026  self.AddCol(name, typ)
3027 
3028  # check that column types are the same in current and new table
3029  for name in self.col_names:
3030  if name in tab.col_names:
3031  curr_type = self.col_types[self.GetColIndex(name)]
3032  new_type = tab.col_types[tab.GetColIndex(name)]
3033  if curr_type!=new_type:
3034  raise TypeError('cannot extend table, column %s in new '%name +\
3035  'table has different type (%s) than in '%new_type +\
3036  'current table (%s)'%curr_type)
3037 
3038  num_rows = len(tab.rows)
3039  for i in range(0,num_rows):
3040  row = tab.rows[i]
3041  data = dict(zip(tab.col_names,row))
3042  self.AddRow(data, overwrite)
3043 
3044 
3045 def Merge(table1, table2, by, only_matching=False):
3046  """
3047  Returns a new table containing the data from both tables. The rows are
3048  combined based on the common values in the column(s) given in *by*. The
3049  option *by* can be a list of column names; in that case, merging is based
3050  on multiple columns.
3051  For example, the two tables below
3052 
3053  ==== ====
3054  x y
3055  ==== ====
3056  1 10
3057  2 15
3058  3 20
3059  ==== ====
3060 
3061  ==== ====
3062  x u
3063  ==== ====
3064  1 100
3065  3 200
3066  4 400
3067  ==== ====
3068 
3069  when merged by column x, produce the following output:
3070 
3071  ===== ===== =====
3072  x y u
3073  ===== ===== =====
3074  1 10 100
3075  2 15 None
3076  3 20 200
3077  4 None 400
3078  ===== ===== =====
3079 
3080 
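  A minimal usage sketch reproducing the example above (the table names are
  illustrative):

  .. code-block:: python

    merged = Merge(tab1, tab2, by='x')
    # with only_matching=True, only rows present in both tables are kept
    inner = Merge(tab1, tab2, by='x', only_matching=True)
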
3081  """
3082  def _key(row, indices):
3083  return tuple([row[i] for i in indices])
3084  def _keep(indices, cn, ct, ni):
3085  ncn, nct, nni=([],[],[])
3086  for i in range(len(cn)):
3087  if i not in indices:
3088  ncn.append(cn[i])
3089  nct.append(ct[i])
3090  nni.append(ni[i])
3091  return ncn, nct, nni
3092  col_names=list(table2.col_names)
3093  col_types=list(table2.col_types)
3094  new_index=[i for i in range(len(col_names))]
3095  if isinstance(by, str):
3096  common2_indices=[col_names.index(by)]
3097  else:
3098  common2_indices=[col_names.index(b) for b in by]
3099  col_names, col_types, new_index=_keep(common2_indices, col_names,
3100  col_types, new_index)
3101 
3102  for i, name in enumerate(col_names):
3103  try_name=name
3104  counter=1
3105  while try_name in table1.col_names:
3106  counter+=1
3107  try_name='%s_%d' % (name, counter)
3108  col_names[i]=try_name
3109  common1={}
3110  if isinstance(by, str):
3111  common1_indices=[table1.col_names.index(by)]
3112  else:
3113  common1_indices=[table1.col_names.index(b) for b in by]
3114  for row in table1.rows:
3115  key=_key(row, common1_indices)
3116  if key in common1:
3117  raise ValueError('duplicate key "%s" in first table' % (str(key)))
3118  common1[key]=row
3119  common2={}
3120  for row in table2.rows:
3121  key=_key(row, common2_indices)
3122  if key in common2:
3123  raise ValueError('duplicate key "%s" in second table' % (str(key)))
3124  common2[key]=row
3125  new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)
3126  for k, v in common1.iteritems():
3127  row=v+[None for i in range(len(table2.col_names)-len(common2_indices))]
3128  matched=False
3129  if k in common2:
3130  matched=True
3131  row2=common2[k]
3132  for i, index in enumerate(new_index):
3133  row[len(table1.col_names)+i]=row2[index]
3134  if only_matching and not matched:
3135  continue
3136  new_tab.AddRow(row)
3137  if only_matching:
3138  return new_tab
3139  for k, v in common2.iteritems():
3140  if not k in common1:
3141  v2=[v[i] for i in new_index]
3142  row=[None for i in range(len(table1.col_names))]+v2
3143  for common1_index, common2_index in zip(common1_indices, common2_indices):
3144  row[common1_index]=v[common2_index]
3145  new_tab.AddRow(row)
3146  return new_tab
3147