OpenStructure
table.py
1 import csv
2 import re
3 import math
4 from ost import stutil
5 import itertools
6 import operator
7 import cPickle
8 import weakref
9 from ost import LogError, LogWarning, LogInfo, LogVerbose
10 
11 def MakeTitle(col_name):
12  return col_name.replace('_', ' ')
13 
14 def IsStringLike(value):
15  if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
16  return False
17  try:
18  value+''
19  return True
20  except:
21  return False
22 
23 def IsNullString(value):
24  value=value.strip().upper()
25  return value in ('', 'NULL', 'NONE', 'NA')
26 
27 def IsScalar(value):
28  if IsStringLike(value):
29  return True
30  try:
31  if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
32  return False
33  iter(value)
34  return False
35  except:
36  return True
37 
38 def GuessColumnType(iterator):
39  empty=True
40  possibilities=set(['bool', 'int', 'float'])
41  for ele in iterator:
42  str_ele=str(ele).upper()
43  if IsNullString(str_ele):
44  continue
45  empty=False
46  if 'int' in possibilities:
47  try:
48  int(str_ele)
49  except ValueError:
50  possibilities.remove('int')
51 
52  if 'float' in possibilities:
53  try:
54  float(str_ele)
55  except ValueError:
56  possibilities.remove('float')
57  if 'bool' in possibilities:
58  if str_ele not in set(['YES', 'NO', 'TRUE', 'FALSE']):
59  possibilities.remove('bool')
60 
61  if len(possibilities)==0:
62  return 'string'
63  if len(possibilities)==2:
64  return 'int' # only 'int' and 'float' can remain together; prefer int
65  if empty:
66  return 'string'
67  # return the only remaining possibility
68  return possibilities.pop()
69 
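# Illustrative examples of the guessing behaviour above (inputs given as
# strings, as they would come from a CSV file; the values are hypothetical):
#   GuessColumnType(['1', '2', 'NA'])  -> 'int'
#   GuessColumnType(['1.5', '2'])      -> 'float'
#   GuessColumnType(['yes', 'NO'])     -> 'bool'
#   GuessColumnType(['1', 'abc'])      -> 'string'
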
70 class BinaryColExpr:
71  def __init__(self, op, lhs, rhs):
72  self.op=op
73  self.lhs=lhs
74  self.rhs=rhs
75  if IsScalar(lhs):
76  self.lhs=itertools.cycle([self.lhs])
77  if IsScalar(rhs):
78  self.rhs=itertools.cycle([self.rhs])
79  def __iter__(self):
80  for l, r in zip(self.lhs, self.rhs):
81  if l!=None and r!=None:
82  yield self.op(l, r)
83  else:
84  yield None
85  def __add__(self, rhs):
86  return BinaryColExpr(operator.add, self, rhs)
87 
88  def __sub__(self, rhs):
89  return BinaryColExpr(operator.sub, self, rhs)
90 
91  def __mul__(self, rhs):
92  return BinaryColExpr(operator.mul, self, rhs)
93 
94  def __div__(self, rhs):
95  return BinaryColExpr(operator.div, self, rhs)
96 
97 class TableCol:
98  def __init__(self, table, col):
99  self._table=table
100  if type(col)==str:
101  self.col_index=self._table.GetColIndex(col)
102  else:
103  self.col_index=col
104 
105  def __iter__(self):
106  for row in self._table.rows:
107  yield row[self.col_index]
108 
109  def __len__(self):
110  return len(self._table.rows)
111 
112  def __getitem__(self, index):
113  return self._table.rows[index][self.col_index]
114 
115  def __setitem__(self, index, value):
116  self._table.rows[index][self.col_index]=value
117 
118  def __add__(self, rhs):
119  return BinaryColExpr(operator.add, self, rhs)
120 
121  def __sub__(self, rhs):
122  return BinaryColExpr(operator.sub, self, rhs)
123 
124  def __mul__(self, rhs):
125  return BinaryColExpr(operator.mul, self, rhs)
126 
127  def __div__(self, rhs):
128  return BinaryColExpr(operator.div, self, rhs)
129 
130 class TableRow:
131  """
132  Essentially a named tuple, but allows column names that are not valid
133  python variable names.
134  """
135  def __init__(self, row_data, tab):
136  self.__dict__['tab'] = weakref.proxy(tab)
137  self.__dict__['row_data'] = row_data
138 
139  def __getitem__(self, col_name):
140  if type(col_name)==int:
141  return self.row_data[col_name]
142  return self.row_data[self.tab.GetColIndex(col_name)]
143 
144  def __str__(self):
145  s = []
146  for k, v in zip(self.__dict__['tab'].col_names, self.__dict__['row_data']):
147  s.append('%s=%s' % (k, str(v)))
148  return ', '.join(s)
149 
150 
151  def __len__(self):
152  return len(self.row_data)
153 
154  def __setitem__(self, col_name, val):
155  if type(col_name)==int:
156  self.row_data[col_name] = val
157  else:
158  self.row_data[self.tab.GetColIndex(col_name)] = val
159 
160  def __getattr__(self, col_name):
161  if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
162  raise AttributeError(col_name)
163  return self.row_data[self.tab.GetColIndex(col_name)]
164 
165  def __setattr__(self, col_name, val):
166  if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
167  raise AttributeError(col_name)
168  self.row_data[self.tab.GetColIndex(col_name)] = val
169 
170 class Table(object):
171  """
172 
173  The table class provides convenient access to data in tabular form. An empty
174  table can be easily constructed as follows
175 
176  .. code-block:: python
177 
178  tab = Table()
179 
180  If you want to add columns directly when creating the table, column names
181  and *column types* can be specified as follows
182 
183  .. code-block:: python
184 
185  tab = Table(['nameX','nameY','nameZ'], 'sfb')
186 
187  This will create three columns called nameX, nameY and nameZ of type string,
188  float and bool, respectively. There will be no data in the table and thus,
189  the table will not contain any rows.
190 
191  The following *column types* are supported:
192 
193  ======= ========
194  name    abbrev
195  ======= ========
196  string  s
197  float   f
198  int     i
199  bool    b
200  ======= ========
201 
202  If you want to add data to the table in addition, use the following:
203 
204  .. code-block:: python
205 
206  tab=Table(['nameX','nameY','nameZ'],
207  'sfb',
208  nameX = ['a','b','c'],
209  nameY = [0.1, 1.2, 3.414],
210  nameZ = [True, False, False])
211 
212  If values for one column are left out, they will be filled with NA, but if
213  values are specified, all values must be specified (i.e. the same number of
214  values per column).
215 
216  """
217 
218  SUPPORTED_TYPES=('int', 'float', 'bool', 'string',)
219 
220 
221  def __init__(self, col_names=[], col_types=None, **kwargs):
222 
223  self.col_names=list(col_names)
224  self.comment=''
225  self.name=''
226 
227  self.col_types = self._ParseColTypes(col_types)
228  self.rows=[]
229  if len(kwargs)>=0:
230  if not col_names:
231  self.col_names=[v for v in kwargs.keys()]
232  if not self.col_types:
233  self.col_types=['string' for u in range(len(self.col_names))]
234  if len(kwargs)>0:
235  self._AddRowsFromDict(kwargs)
236 
237  def __getattr__(self, col_name):
238  # pickling doesn't call the standard __init__ defined above and thus
239  # col_names might not be defined. This leads to infinite recursions.
240  # Protect against it by checking that col_names is contained in
241  # __dict__
242  if 'col_names' not in self.__dict__ or col_name not in self.col_names:
243  raise AttributeError(col_name)
244  return TableCol(self, col_name)
245 
246  @staticmethod
247  def _ParseColTypes(types, exp_num=None):
248  if types==None:
249  return None
250 
251  short2long = {'s' : 'string', 'i': 'int', 'b' : 'bool', 'f' : 'float'}
252  allowed_short = short2long.keys()
253  allowed_long = short2long.values()
254 
255  type_list = []
256 
257  # string type
258  if IsScalar(types):
259  if type(types)==str:
260  types = types.lower()
261 
262  # single value
263  if types in allowed_long:
264  type_list.append(types)
265  elif types in allowed_short:
266  type_list.append(short2long[types])
267 
268  # comma separated list of long or short types
269  elif types.find(',')!=-1:
270  for t in types.split(','):
271  if t in allowed_long:
272  type_list.append(t)
273  elif t in allowed_short:
274  type_list.append(short2long[t])
275  else:
276  raise ValueError('Unknown type %s in types %s'%(t,types))
277 
278  # string of short types
279  else:
280  for t in types:
281  if t in allowed_short:
282  type_list.append(short2long[t])
283  else:
284  raise ValueError('Unknown type %s in types %s'%(t,types))
285 
286  # non-string type
287  else:
288  raise ValueError('Col type %s must be string or list'%types)
289 
290  # list type
291  else:
292  for t in types:
293  # must be string type
294  if type(t)==str:
295  t = t.lower()
296  if t in allowed_long:
297  type_list.append(t)
298  elif t in allowed_short:
299  type_list.append(short2long[t])
300  else:
301  raise ValueError('Unknown type %s in types %s'%(t,types))
302 
303  # non-string type
304  else:
305  raise ValueError('Col type %s must be string or list'%types)
306 
307  if exp_num:
308  if len(type_list)!=exp_num:
309  raise ValueError(('Parsed number of col types (%i) differs from ' + \
310  'expected (%i) in types %s')%(len(type_list),exp_num,types))
311 
312  return type_list
313 
314  def SetName(self, name):
315  '''
316  Set name of the table
317 
318  :param name: name
319  :type name: :class:`str`
320  '''
321  self.name = name
322 
323  def GetName(self):
324  '''
325  Get name of table
326  '''
327  return self.name
328 
329  def RenameCol(self, old_name, new_name):
330  """
331  Rename column *old_name* to *new_name*.
332 
333  :param old_name: Name of the old column
334  :param new_name: Name of the new column
335  :raises: :exc:`ValueError` when *old_name* is not a valid column
336  """
337  if old_name==new_name:
338  return
339  self.AddCol(new_name, self.col_types[self.GetColIndex(old_name)],
340  self[old_name])
341  self.RemoveCol(old_name)
342  def _Coerce(self, value, ty):
343  '''
344  Try to convert values (e.g. from :class:`str` type) to the specified type
345 
346  :param value: the value
347  :type value: any type
348 
349  :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
350  *bool*)
351  :type ty: :class:`str`
352  '''
353  if value=='NA' or value==None:
354  return None
355  if ty=='int':
356  return int(value)
357  if ty=='float':
358  return float(value)
359  if ty=='string':
360  return str(value)
361  if ty=='bool':
362  if isinstance(value, str) or isinstance(value, unicode):
363  if value.upper() in ('FALSE', 'NO',):
364  return False
365  return True
366  return bool(value)
367  raise ValueError('Unknown type %s' % ty)
368 
369  def GetColIndex(self, col):
370  '''
371  Returns the column index for the column with the given name.
372 
373  :raises: ValueError if no column with the name is found.
374  '''
375  if col not in self.col_names:
376  raise ValueError('Table has no column named "%s"' % col)
377  return self.col_names.index(col)
378 
379  def GetColNames(self):
380  '''
381  Returns a list containing all column names.
382  '''
383  return self.col_names
384 
385  def SearchColNames(self, regex):
386  '''
387  Returns a list of column names matching the regex.
388 
389  :param regex: regex pattern
390  :type regex: :class:`str`
391 
392  :returns: :class:`list` of column names (:class:`str`)
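
 For example (the column-name prefix is illustrative):

 .. code-block:: python

  # find all columns whose names start with 'score'
  score_cols = tab.SearchColNames('^score')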
393  '''
394  matching_names = []
395  for name in self.col_names:
396  matches = re.search(regex, name)
397  if matches:
398  matching_names.append(name)
399  return matching_names
400 
401  def HasCol(self, col):
402  '''
403  Checks if the column with a given name is present in the table.
404  '''
405  return col in self.col_names
406 
407  def __getitem__(self, k):
408  if type(k)==int:
409  return TableCol(self, self.col_names[k])
410  else:
411  return TableCol(self, k)
412 
413  def __setitem__(self, k, value):
414  col_index=k
415  if type(k)!=int:
416  col_index=self.GetColIndex(k)
417  if IsScalar(value):
418  value=itertools.cycle([value])
419  for r, v in zip(self.rows, value):
420  r[col_index]=v
421 
422  def ToString(self, float_format='%.3f', int_format='%d', rows=None):
423  '''
424  Convert the table into a string representation.
425 
426  The output format can be modified for int and float type columns by
427  specifying a formatting string for the parameters *float_format* and
428  *int_format*.
429 
430  The option *rows* specifies the range of rows to be printed. The parameter
431  must be a type that supports indexing (e.g. a :class:`list`) containing the
432  start and end row *index*, e.g. [start_row_idx, end_row_idx].
433 
434  :param float_format: formatting string for float columns
435  :type float_format: :class:`str`
436 
437  :param int_format: formatting string for int columns
438  :type int_format: :class:`str`
439 
440  :param rows: iterable containing start and end row *index*
441  :type rows: iterable containing :class:`ints <int>`
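
 For example (illustrative):

 .. code-block:: python

  # print only the first three rows, floats with two decimals
  print tab.ToString(float_format='%.2f', rows=[0, 3])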
442  '''
443  widths=[len(cn) for cn in self.col_names]
444  sel_rows=self.rows
445  if rows:
446  sel_rows=self.rows[rows[0]:rows[1]]
447  for row in sel_rows:
448  for i, (ty, col) in enumerate(zip(self.col_types, row)):
449  if col==None:
450  widths[i]=max(widths[i], len('NA'))
451  elif ty=='float':
452  widths[i]=max(widths[i], len(float_format % col))
453  elif ty=='int':
454  widths[i]=max(widths[i], len(int_format % col))
455  else:
456  widths[i]=max(widths[i], len(str(col)))
457  s=''
458  if self.comment:
459  s+=''.join(['# %s\n' % l for l in self.comment.split('\n')])
460  total_width=sum(widths)+2*len(widths)
461  for width, col_name in zip(widths, self.col_names):
462  s+=col_name.center(width+2)
463  s+='\n%s\n' % ('-'*total_width)
464  for row in sel_rows:
465  for width, ty, col in zip(widths, self.col_types, row):
466  cs=''
467  if col==None:
468  cs='NA'.center(width+2)
469  elif ty=='float':
470  cs=(float_format % col).rjust(width+2)
471  elif ty=='int':
472  cs=(int_format % col).rjust(width+2)
473  else:
474  cs=' '+str(col).ljust(width+1)
475  s+=cs
476  s+='\n'
477  return s
478 
479  def __str__(self):
480  return self.ToString()
481 
482  def Stats(self, col):
483  idx = self.GetColIndex(col)
484  text ='''
485 Statistics for column %(col)s
486 
487  Number of Rows : %(num)d
488  Number of Rows Not None: %(num_non_null)d
489  Mean : %(mean)f
490  Median : %(median)f
491  Standard Deviation : %(stddev)f
492  Min : %(min)f
493  Max : %(max)f
494 '''
495  data = {
496  'col' : col,
497  'num' : len(self.rows),
498  'num_non_null' : self.Count(col),
499  'median' : self.Median(col),
500  'mean' : self.Mean(col),
501  'stddev' : self.StdDev(col),
502  'min' : self.Min(col),
503  'max' : self.Max(col),
504  }
505  return text % data
506 
507  def _AddRowsFromDict(self, d, overwrite=None):
508  '''
509  Add one or more rows from a :class:`dictionary <dict>`.
510 
511  If *overwrite* is not None and set to an existing column name, the specified
512  column in the table is searched for the first occurrence of a value matching
513  the value of the column with the same name in the dictionary. If a matching
514  value is found, the row is overwritten with the dictionary. If no matching
515  row is found, a new row is appended to the table.
516 
517  :param d: dictionary containing the data
518  :type d: :class:`dict`
519 
520  :param overwrite: column name to overwrite existing row if value in
521  column *overwrite* matches
522  :type overwrite: :class:`str`
523 
524  :raises: :class:`ValueError` if multiple rows are added but the number of
525  data items is different for different columns.
526  '''
527  # get column indices
528  idxs = [self.GetColIndex(k) for k in d.keys()]
529 
530  # convert scalar values to list
531  old_len = None
532  for k,v in d.iteritems():
533  if IsScalar(v):
534  v = [v]
535  d[k] = v
536  if not old_len:
537  old_len = len(v)
538  elif old_len!=len(v):
539  raise ValueError("Cannot add rows: length of data must be equal " + \
540  "for all columns in %s"%str(d))
541 
542  # convert column based dict to row based dict and create row and add data
543  for i,data in enumerate(zip(*d.values())):
544  new_row = [None for a in range(len(self.col_names))]
545  for idx,v in zip(idxs,data):
546  new_row[idx] = self._Coerce(v, self.col_types[idx])
547 
548  # partially overwrite existing row with new data
549  if overwrite:
550  overwrite_idx = self.GetColIndex(overwrite)
551  added = False
552  for i,r in enumerate(self.rows):
553  if r[overwrite_idx]==new_row[overwrite_idx]:
554  for j,e in enumerate(self.rows[i]):
555  if new_row[j]==None:
556  new_row[j] = e
557  self.rows[i] = new_row
558  added = True
559  break
560 
561  # if not overwrite or overwrite did not find appropriate row
562  if not overwrite or not added:
563  self.rows.append(new_row)
564 
565  def PairedTTest(self, col_a, col_b):
566  """
567  Two-sided test for the null-hypothesis that two related samples
568  have the same average (expected values).
569 
570  :param col_a: First column
571  :type col_a: :class:`str`
572  :param col_b: Second column
573  :type col_b: :class:`str`
574 
575  :returns: P-value between 0 and 1 for the null hypothesis that the two
576  columns have the same average. The smaller the value, the stronger the
577  evidence that the averages differ.
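
 For example (the column names are illustrative):

 .. code-block:: python

  p_value = tab.PairedTTest('score_a', 'score_b')
  if p_value < 0.05:
   print 'the two columns most likely have different averages'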
578  """
579  from scipy.stats import ttest_rel
580  xs = []
581  ys = []
582  for x, y in self.Zip(col_a, col_b):
583  if x!=None and y!=None:
584  xs.append(x)
585  ys.append(y)
586  result = ttest_rel(xs, ys)
587  return result[1]
588 
589  def AddRow(self, data, overwrite=None):
590  """
591  Add a row to the table.
592 
593  *data* may either be a dictionary or a list-like object:
594 
595  - If *data* is a dictionary, the keys in the dictionary must match the
596  column names. Columns not found in the dict will be initialized to None.
597  If the dict contains list-like objects, multiple rows will be added, if
598  the number of items in all list-like objects is the same, otherwise a
599  :class:`ValueError` is raised.
600 
601  - If *data* is a list-like object, the row is initialized from the values
602  in *data*. The number of items in *data* must match the number of
603  columns in the table. A :class:`ValueError` is raised otherwise. The
604  values are added in the order specified in the list, thus, the order of
605  the data must match the columns.
606 
607  If *overwrite* is not None and set to an existing column name, the specified
608  column in the table is searched for the first occurrence of a value matching
609  the value of the column with the same name in the dictionary. If a matching
610  value is found, the row is overwritten with the dictionary. If no matching
611  row is found, a new row is appended to the table.
612 
613  :param data: data to add
614  :type data: :class:`dict` or *list-like* object
615 
616  :param overwrite: column name to overwrite existing row if value in
617  column *overwrite* matches
618  :type overwrite: :class:`str`
619 
620  :raises: :class:`ValueError` if *list-like* object is used and number of
621  items does *not* match number of columns in table.
622 
623  :raises: :class:`ValueError` if *dict* is used and multiple rows are added
624  but the number of data items is different for different columns.
625 
626  **Example:** add multiple data rows to a subset of columns using a dictionary
627 
628  .. code-block:: python
629 
630  # create table with three float columns
631  tab = Table(['x','y','z'], 'fff')
632 
633  # add rows from dict
634  data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
635  tab.AddRow(data)
636  print tab
637 
638  '''
639  will produce the table
640 
641  ==== ==== ====
642  x y z
643  ==== ==== ====
644  1.20 NA 1.60
645  1.60 NA 5.30
646  ==== ==== ====
647  '''
648 
649  # overwrite the row with x=1.2 and add row with x=1.9
650  data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
651  tab.AddRow(data, overwrite='x')
652  print tab
653 
654  '''
655  will produce the table
656 
657  ==== ==== ====
658  x y z
659  ==== ==== ====
660  1.20 NA 7.90
661  1.60 NA 5.30
662  1.90 NA 3.50
663  ==== ==== ====
664  '''
665  """
666  if type(data)==dict:
667  self._AddRowsFromDict(data, overwrite)
668  else:
669  if len(data)!=len(self.col_names):
670  msg='data array must have %d elements, not %d'
671  raise ValueError(msg % (len(self.col_names), len(data)))
672  new_row = [self._Coerce(v, t) for v, t in zip(data, self.col_types)]
673 
674  # fully overwrite existing row with new data
675  if overwrite:
676  overwrite_idx = self.GetColIndex(overwrite)
677  added = False
678  for i,r in enumerate(self.rows):
679  if r[overwrite_idx]==new_row[overwrite_idx]:
680  self.rows[i] = new_row
681  added = True
682  break
683 
684  # if not overwrite or overwrite did not find appropriate row
685  if not overwrite or not added:
686  self.rows.append(new_row)
687 
688  def RemoveCol(self, col):
689  """
690  Remove column with the given name from the table.
691 
692  :param col: name of column to remove
693  :type col: :class:`str`
694  """
695  idx = self.GetColIndex(col)
696  del self.col_names[idx]
697  del self.col_types[idx]
698  for row in self.rows:
699  del row[idx]
700 
701  def AddCol(self, col_name, col_type, data=None):
702  """
703  Add a column to the right of the table.
704 
705  :param col_name: name of new column
706  :type col_name: :class:`str`
707 
708  :param col_type: type of new column (long versions: *int*, *float*, *bool*,
709  *string* or short versions: *i*, *f*, *b*, *s*)
710  :type col_type: :class:`str`
711 
712  :param data: data to add to new column
713  :type data: scalar or iterable
714 
715  **Example:**
716 
717  .. code-block:: python
718 
719  tab = Table(['x'], 'f', x=range(5))
720  tab.AddCol('even', 'bool', itertools.cycle([True, False]))
721  print tab
722 
723  '''
724  will produce the table
725 
726  ==== ====
727  x even
728  ==== ====
729  0 True
730  1 False
731  2 True
732  3 False
733  4 True
734  ==== ====
735  '''
736 
737  If data is a constant instead of an iterable object, its value
738  will be written into each row:
739 
740  .. code-block:: python
741 
742  tab = Table(['x'], 'f', x=range(5))
743  tab.AddCol('num', 'i', 1)
744  print tab
745 
746  '''
747  will produce the table
748 
749  ==== ====
750  x num
751  ==== ====
752  0 1
753  1 1
754  2 1
755  3 1
756  4 1
757  ==== ====
758  '''
759 
760  As a special case, if there are no previous rows, and data is not
761  None, rows are added for every item in data.
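
 For example:

 .. code-block:: python

  # adding a column to an empty table creates one row per data item
  tab = Table()
  tab.AddCol('x', 'i', range(3))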
762  """
763 
764  if col_name in self.col_names:
765  raise ValueError('Column with name %s already exists'%col_name)
766 
767  col_type = self._ParseColTypes(col_type, exp_num=1)[0]
768  self.col_names.append(col_name)
769  self.col_types.append(col_type)
770 
771  if len(self.rows)>0:
772  if IsScalar(data):
773  for row in self.rows:
774  row.append(data)
775  else:
776  if hasattr(data, '__len__') and len(data)!=len(self.rows):
777  self.col_names.pop()
778  self.col_types.pop()
779  raise ValueError('Length of data (%i) must correspond to number of '%len(data) +\
780  'existing rows (%i)'%len(self.rows))
781  for row, d in zip(self.rows, data):
782  row.append(d)
783 
784  elif data!=None and len(self.col_names)==1:
785  if IsScalar(data):
786  self.AddRow({col_name : data})
787  else:
788  for v in data:
789  self.AddRow({col_name : v})
790 
791  def Filter(self, *args, **kwargs):
792  """
793  Returns a filtered table only containing rows matching all the predicates
794  in kwargs and args. For example,
795 
796  .. code-block:: python
797 
798  tab.Filter(town='Basel')
799 
800  will return all the rows where the value of the column "town" is equal to
801  "Basel". Several predicates may be combined, i.e.
802 
803  .. code-block:: python
804 
805  tab.Filter(town='Basel', male=True)
806 
807  will return the rows with "town" equal to "Basel" and "male" equal to true.
808  *args* are unary callables returning True if the row should be included in the
809  result and False if not.
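
 For example, a callable predicate can be used as follows (the predicate
 receives the raw row list; the threshold is illustrative):

 .. code-block:: python

  # keep only rows whose first column is greater than 5
  filt_tab = tab.Filter(lambda row: row[0] > 5)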
810  """
811  filt_tab=Table(list(self.col_names), list(self.col_types))
812  for row in self.rows:
813  matches=True
814  for func in args:
815  if not func(row):
816  matches=False
817  break
818  for key, val in kwargs.iteritems():
819  if row[self.GetColIndex(key)]!=val:
820  matches=False
821  break
822  if matches:
823  filt_tab.AddRow(row)
824  return filt_tab
825 
826 
827  def Select(self, query):
828 
829  """
830  Returns a new table object containing all rows matching a logical query
831  expression.
832 
833  *query* is a string containing the logical expression, that will be
834  evaluated for every row.
835 
836  Operands have to be the name of a column or an expression that can be
837  parsed to float, int, bool or string.
838  Valid operators are: and, or, !=, !, <=, >=, ==, =, <, >, +, -, \*, /
839 
840  .. code-block:: python
841 
842  subtab = tab.Select('col_a>0.5 and (col_b=5 or col_c=5)')
843 
844  The selection query should be self-explanatory. Allowed parentheses are:
845  (), [], {}; parenthesis mismatches are detected. Expressions like
846  '3<=col_a>=col_b' throw an error, due to problems in figuring out the
847  evaluation order.
848 
849  There are two special expressions:
850 
851  .. code-block:: python
852 
853  #selects rows, where 1.0<=col_a<=1.5
854  subtab = tab.Select('col_a=1.0:1.5')
855 
856  #selects rows, where col_a=1 or col_a=2 or col_a=3
857  subtab = tab.Select('col_a=1,2,3')
858 
859  Only consistent types can be compared. If col_a is of type string and col_b
860  is of type int, following expression would throw an error: 'col_a<col_b'
861  """
862 
863  try:
864  from table_selector import TableSelector
865  except:
866  raise ImportError("Tried to import from the file table_selector.py, but could not find it!")
867 
868  selector=TableSelector(self.col_types, self.col_names, query)
869 
870  selected_tab=Table(list(self.col_names), list(self.col_types))
871 
872  for row in self.rows:
873  if selector.EvaluateRow(row):
874  selected_tab.AddRow(row)
875 
876  return selected_tab
877 
878 
879  @staticmethod
880  def _LoadOST(stream_or_filename):
881  fieldname_pattern=re.compile(r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
882  values_pattern=re.compile("([^\" ]+|\"[^\"]*\")+")
883  if not hasattr(stream_or_filename, 'read'):
884  stream=open(stream_or_filename, 'r')
885  else:
886  stream=stream_or_filename
887  header=False
888  num_lines=0
889  for line in stream:
890  line=line.strip()
891  if line.startswith('#'):
892  continue
893  if len(line)==0:
894  continue
895  num_lines+=1
896  if not header:
897  fieldnames=[]
898  fieldtypes=[]
899  for col in line.split():
900  match=fieldname_pattern.match(col)
901  if match:
902  if match.group('type'):
903  fieldtypes.append(match.group('type'))
904  else:
905  fieldtypes.append('string')
906  fieldnames.append(match.group('name'))
907  tab=Table(fieldnames, fieldtypes)
908  header=True
909  continue
910  tab.AddRow([x.strip('"') for x in values_pattern.findall(line)])
911  if num_lines==0:
912  raise IOError("Cannot read table from empty stream")
913  return tab
914 
915  def _GuessColumnTypes(self):
916  for col_idx in range(len(self.col_names)):
917  self.col_types[col_idx]=GuessColumnType(self[self.col_names[col_idx]])
918  for row in self.rows:
919  for idx in range(len(row)):
920  row[idx]=self._Coerce(row[idx], self.col_types[idx])
921 
922  @staticmethod
923  def _LoadCSV(stream_or_filename, sep):
924  if not hasattr(stream_or_filename, 'read'):
925  stream=open(stream_or_filename, 'r')
926  else:
927  stream=stream_or_filename
928  reader=csv.reader(stream, delimiter=sep)
929  first=True
930  for row in reader:
931  if first:
932  header=row
933  types='s'*len(row)
934  tab=Table(header, types)
935  first=False
936  else:
937  tab.AddRow(row)
938  if first:
939  raise IOError('trying to load table from empty CSV stream/file')
940 
941  tab._GuessColumnTypes()
942  return tab
943 
944  @staticmethod
945  def _LoadPickle(stream_or_filename):
946  if not hasattr(stream_or_filename, 'read'):
947  stream=open(stream_or_filename, 'rb')
948  else:
949  stream=stream_or_filename
950  return cPickle.load(stream)
951 
952  @staticmethod
953  def _GuessFormat(filename):
954  try:
955  filename = filename.name
956  except AttributeError, e:
957  pass
958  if filename.endswith('.csv'):
959  return 'csv'
960  elif filename.endswith('.pickle'):
961  return 'pickle'
962  else:
963  return 'ost'
964 
965 
966  @staticmethod
967  def Load(stream_or_filename, format='auto', sep=','):
968  """
969  Load table from stream or file with given name.
970 
971  By default, the file format is set to *auto*, which tries to guess the file
972  format from the file extension. The following file extensions are
973  recognized:
974 
975  ============ ======================
976  extension recognized format
977  ============ ======================
978  .csv comma separated values
979  .pickle pickled byte stream
980  <all others> ost-specific format
981  ============ ======================
982 
983  Thus, *format* must be specified for reading files with different filename
984  extensions.
985 
986  The following file formats are understood:
987 
988  - ost
989 
990  This is an ost-specific, but still human readable file format. The file
991  (stream) must start with header line of the form
992 
993  col_name1[type1] <col_name2[type2]>...
994 
995  The types given in brackets must be one of the data types the
996  :class:`Table` class understands. Each following line in the file then must
997  contain exactly the same number of data items as listed in the header. The
998  data items are automatically converted to the column format. Lines starting
999  with a '#' and empty lines are ignored.
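
 A minimal example file and how to load it (the file contents are
 illustrative):

 .. code-block:: python

  # contents of 'scores.ost':
  #   name[string] count[int] score[float]
  #   alpha 2 0.5
  #   beta 3 1.25
  tab = Table.Load('scores.ost')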
1000 
1001  - pickle
1002 
1003  Deserializes the table from a pickled byte stream.
1004 
1005  - csv
1006 
1007  Reads the table from comma separated values stream. Since there is no
1008  explicit type information in the csv file, the column types are guessed,
1009  using the following simple rules:
1010 
1011  * if all values are either NA/NULL/NONE the type is set to string.
1012  * if all non-null values are convertible to float/int the type is set to
1013  float/int.
1014  * if all non-null values are true/false/yes/no, the type is set to bool.
1015  * for all other cases, the column type is set to string.
1016 
1017  :returns: A new :class:`Table` instance
1018  """
1019  format=format.lower()
1020  if format=='auto':
1021  format = Table._GuessFormat(stream_or_filename)
1022 
1023  if format=='ost':
1024  return Table._LoadOST(stream_or_filename)
1025  if format=='csv':
1026  return Table._LoadCSV(stream_or_filename, sep=sep)
1027  if format=='pickle':
1028  return Table._LoadPickle(stream_or_filename)
1029  raise ValueError('unknown format "%s"' % format)
1030 
1031  def Sort(self, by, order='+'):
1032  """
1033  Performs an in-place sort of the table, based on column *by*.
1034 
1035  :param by: column name by which to sort
1036  :type by: :class:`str`
1037 
1038  :param order: ascending (``-``) or descending (``+``) order
1039  :type order: :class:`str` (i.e. *+*, *-*)
1040  """
1041  sign=-1
1042  if order=='-':
1043  sign=1
1044  key_index=self.GetColIndex(by)
1045  def _key_cmp(lhs, rhs):
1046  return sign*cmp(lhs[key_index], rhs[key_index])
1047  self.rows=sorted(self.rows, _key_cmp)
1048 
1049  def GetUnique(self, col, ignore_nan=True):
1050  """
1051  Extract a list of all unique values from one column.
1052 
1053  :param col: column name
1054  :type col: :class:`str`
1055 
1056  :param ignore_nan: ignore all *None* values
1057  :type ignore_nan: :class:`bool`
1058  """
1059  idx = self.GetColIndex(col)
1060  seen = {}
1061  result = []
1062  for row in self.rows:
1063  item = row[idx]
1064  if item!=None or ignore_nan==False:
1065  if item in seen: continue
1066  seen[item] = 1
1067  result.append(item)
1068  return result
1069 
1070  def Zip(self, *args):
1071  """
1072  Allows convenient iteration over a selection of columns, e.g.
1073 
1074  .. code-block:: python
1075 
1076  tab = Table.Load('...')
1077  for col1, col2 in tab.Zip('col1', 'col2'):
1078  print col1, col2
1079 
1080  is a shortcut for
1081 
1082  .. code-block:: python
1083 
1084  tab = Table.Load('...')
1085  for col1, col2 in zip(tab['col1'], tab['col2']):
1086  print col1, col2
1087  """
1088  return zip(*[self[arg] for arg in args])
1089 
1090  def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
1091  z_title=None, x_range=None, y_range=None, z_range=None,
1092  color=None, plot_if=None, legend=None,
1093  num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False,
1094  labels=None, max_num_labels=None, title=None, clear=True, save=False,
1095  **kwargs):
1096  """
1097  Function to plot values from your table in 1, 2 or 3 dimensions using
1098  `Matplotlib <http://matplotlib.sourceforge.net>`__
1099 
1100  :param x: column name for first dimension
1101  :type x: :class:`str`
1102 
1103  :param y: column name for second dimension
1104  :type y: :class:`str`
1105 
1106  :param z: column name for third dimension
1107  :type z: :class:`str`
1108 
1109  :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\**). For a
1110  complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1111  :type style: :class:`str`
1112 
1113  :param x_title: title for first dimension, if not specified it is
1114  automatically derived from column name
1115  :type x_title: :class:`str`
1116 
1117  :param y_title: title for second dimension, if not specified it is
1118  automatically derived from column name
1119  :type y_title: :class:`str`
1120 
1121  :param z_title: title for third dimension, if not specified it is
1122  automatically derived from column name
1123  :type z_title: :class:`str`
1124 
1125  :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1126  :type x_range: :class:`list` of length two
1127 
1128  :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1129  :type y_range: :class:`list` of length two
1130 
1131  :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
1132  :type z_range: :class:`list` of length two
1133 
1134  :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
1135  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1136  :type color: :class:`str`
1137 
1138  :param plot_if: callable which returns *True* if the row should be plotted. Is
1139  invoked like ``plot_if(self, row)``
1140  :type plot_if: callable
1141 
1142  :param legend: legend label for data series
1143  :type legend: :class:`str`
1144 
1145  :param num_z_levels: number of levels for third dimension
1146  :type num_z_levels: :class:`int`
1147 
1148  :param diag_line: draw diagonal line
1149  :type diag_line: :class:`bool`
1150 
1151  :param labels: column name containing labels to put on x-axis for one
1152  dimensional plot
1153  :type labels: :class:`str`
1154 
1155  :param max_num_labels: limit maximum number of labels
1156  :type max_num_labels: :class:`int`
1157 
1158  :param title: plot title, if not specified it is automatically derived from
1159  plotted column names
1160  :type title: :class:`str`
1161 
1162  :param clear: clear old data from plot
1163  :type clear: :class:`bool`
1164 
1165  :param save: filename for saving plot
1166  :type save: :class:`str`
1167 
1168  :param z_contour: draw contour lines
1169  :type z_contour: :class:`bool`
1170 
1171  :param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
1172  'linear')
1173  :type z_interpol: :class:`str`
1174 
1175  :param \*\*kwargs: additional arguments passed to matplotlib
1176 
1177  :returns: the ``matplotlib.pyplot`` module
1178 
1179  **Examples:** simple plotting functions
1180 
1181  .. code-block:: python
1182 
1183  tab = Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
1184  b=[x/2.0 for x in range(1,6)],
1185  c=[math.cos(x) for x in range(0,5)],
1186  d=range(3,8))
1187 
1188  # one dimensional plot of column 'd' vs. index
1189  plt = tab.Plot('d')
1190  plt.show()
1191 
1192  # two dimensional plot of 'a' vs. 'c'
1193  plt = tab.Plot('a', y='c', style='o-')
1194  plt.show()
1195 
1196  # three dimensional plot of 'a' vs. 'c' with values 'b'
1197  plt = tab.Plot('a', y='c', z='b')
1198  # manually save plot to file
1199  plt.savefig("plot.png")
1200  """
1201  try:
1202  import matplotlib.pyplot as plt
1203  import matplotlib.mlab as mlab
1204  import numpy as np
1205  idx1 = self.GetColIndex(x)
1206  xs = []
1207  ys = []
1208  zs = []
1209 
1210  if clear:
1211  plt.figure(figsize=[8, 6])
1212 
1213  if x_title!=None:
1214  nice_x=x_title
1215  else:
1216  nice_x=MakeTitle(x)
1217 
1218  if y_title!=None:
1219  nice_y=y_title
1220  else:
1221  if y:
1222  nice_y=MakeTitle(y)
1223  else:
1224  nice_y=None
1225 
1226  if z_title!=None:
1227  nice_z = z_title
1228  else:
1229  if z:
1230  nice_z = MakeTitle(z)
1231  else:
1232  nice_z = None
1233 
1234  if x_range and (IsScalar(x_range) or len(x_range)!=2):
1235  raise ValueError('parameter x_range must contain exactly two elements')
1236  if y_range and (IsScalar(y_range) or len(y_range)!=2):
1237  raise ValueError('parameter y_range must contain exactly two elements')
1238  if z_range and (IsScalar(z_range) or len(z_range)!=2):
1239  raise ValueError('parameter z_range must contain exactly two elements')
1240 
1241  if color:
1242  kwargs['color']=color
1243  if legend:
1244  kwargs['label']=legend
1245  if y and z:
1246  idx3 = self.GetColIndex(z)
1247  idx2 = self.GetColIndex(y)
1248  for row in self.rows:
1249  if row[idx1]!=None and row[idx2]!=None and row[idx3]!=None:
1250  if plot_if and not plot_if(self, row):
1251  continue
1252  xs.append(row[idx1])
1253  ys.append(row[idx2])
1254  zs.append(row[idx3])
1255  levels = []
1256  if z_range:
1257  z_spacing = (z_range[1] - z_range[0]) / num_z_levels
1258  l = z_range[0]
1259  else:
1260  l = self.Min(z)
1261  z_spacing = (self.Max(z) - l) / num_z_levels
1262 
1263  for i in range(0,num_z_levels+1):
1264  levels.append(l)
1265  l += z_spacing
1266 
1267  xi = np.linspace(min(xs),max(xs),len(xs)*10)
1268  yi = np.linspace(min(ys),max(ys),len(ys)*10)
1269  zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)
1270 
1271  if z_contour:
1272  plt.contour(xi,yi,zi,levels,linewidths=0.5,colors='k')
1273 
1274  plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
1275  plt.colorbar(ticks=levels)
1276 
1277  elif y:
1278  idx2=self.GetColIndex(y)
1279  for row in self.rows:
1280  if row[idx1]!=None and row[idx2]!=None:
1281  if plot_if and not plot_if(self, row):
1282  continue
1283  xs.append(row[idx1])
1284  ys.append(row[idx2])
1285  plt.plot(xs, ys, style, **kwargs)
1286 
1287  else:
1288  label_vals=[]
1289 
1290  if labels:
1291  label_idx=self.GetColIndex(labels)
1292  for row in self.rows:
1293  if row[idx1]!=None:
1294  if plot_if and not plot_if(self, row):
1295  continue
1296  xs.append(row[idx1])
1297  if labels:
1298  label_vals.append(row[label_idx])
1299  plt.plot(xs, style, **kwargs)
1300  if labels:
1301  interval = 1
1302  if max_num_labels:
1303  if len(label_vals)>max_num_labels:
1304  interval = int(math.ceil(float(len(label_vals))/max_num_labels))
1305  label_vals = label_vals[::interval]
1306  plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
1307  size='x-small')
1308 
1309  if title==None:
1310  if nice_z:
1311  title = '%s of %s vs. %s' % (nice_z, nice_x, nice_y)
1312  elif nice_y:
1313  title = '%s vs. %s' % (nice_x, nice_y)
1314  else:
1315  title = nice_x
1316 
1317  plt.title(title, size='x-large', fontweight='bold',
1318  verticalalignment='bottom')
1319 
1320  if legend:
1321  plt.legend(loc=0)
1322 
1323  if x and y:
1324  plt.xlabel(nice_x, size='x-large')
1325  if x_range:
1326  plt.xlim(x_range[0], x_range[1])
1327  if y_range:
1328  plt.ylim(y_range[0], y_range[1])
1329  if diag_line:
1330  plt.plot(x_range, y_range, '-', color='black')
1331 
1332  plt.ylabel(nice_y, size='x-large')
1333  else:
1334  if y_range:
1335  plt.ylim(y_range[0], y_range[1])
1336  if x_title:
1337  plt.xlabel(x_title, size='x-large')
1338  plt.ylabel(nice_y, size='x-large')
1339  if save:
1340  plt.savefig(save)
1341  return plt
1342  except ImportError:
1343  LogError("Function needs numpy and matplotlib, but I could not import it.")
1344  raise
1345 
1346  def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
1347  histtype='stepfilled', align='mid', x_title=None,
1348  y_title=None, title=None, clear=True, save=False,
1349  color=None, y_range=None):
1350  """
1351  Create a histogram of the data in col for the range *x_range*, split into
1352  *num_bins* bins and plot it using Matplotlib.
1353 
1354  :param col: column name with data
1355  :type col: :class:`str`
1356 
1357  :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1358  :type x_range: :class:`list` of length two
1359 
1360  :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1361  :type y_range: :class:`list` of length two
1362 
1363  :param num_bins: number of bins in range
1364  :type num_bins: :class:`int`
1365 
1366  :param color: Color to be used for the histogram. If not set, color will be
1367  determined by matplotlib
1368  :type color: :class:`str`
1369 
1370  :param normed: normalize histogram
1371  :type normed: :class:`bool`
1372 
1373  :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
1374  *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1375  :type histtype: :class:`str`
1376 
1377  :param align: style of histogram (*left*, *mid*, *right*). See
1378  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1379  :type align: :class:`str`
1380 
1381  :param x_title: title for first dimension, if not specified it is
1382  automatically derived from column name
1383  :type x_title: :class:`str`
1384 
1385  :param y_title: title for second dimension, if not specified it is
1386  automatically derived from column name
1387  :type y_title: :class:`str`
1388 
1389  :param title: plot title, if not specified it is automatically derived from
1390  plotted column names
1391  :type title: :class:`str`
1392 
1393  :param clear: clear old data from plot
1394  :type clear: :class:`bool`
1395 
1396  :param save: filename for saving plot
1397  :type save: :class:`str`
1398 
1399  **Examples:** simple plotting functions
1400 
1401  .. code-block:: python
1402 
1403  tab = Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])
1404 
1405  # one dimensional plot of column 'd' vs. index
1406  plt = tab.PlotHistogram('a')
1407  plt.show()
1408 
1409  """
1410  try:
1411  import matplotlib.pyplot as plt
1412  import numpy as np
1413 
1414  if len(self.rows)==0:
1415  return None
1416  kwargs={}
1417  if color:
1418  kwargs['color']=color
1419  idx = self.GetColIndex(col)
1420  data = []
1421  for r in self.rows:
1422  if r[idx]!=None:
1423  data.append(r[idx])
1424 
1425  if clear:
1426  plt.clf()
1427 
1428  n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
1429  normed=normed, histtype=histtype, align=align,
1430  **kwargs)
1431 
1432  if x_title!=None:
1433  nice_x=x_title
1434  else:
1435  nice_x=MakeTitle(col)
1436  plt.xlabel(nice_x, size='x-large')
1437  if y_range:
1438  plt.ylim(y_range)
1439  if y_title!=None:
1440  nice_y=y_title
1441  else:
1442  nice_y="bin count"
1443  plt.ylabel(nice_y, size='x-large')
1444 
1445  if title!=None:
1446  nice_title=title
1447  else:
1448  nice_title="Histogram of %s"%nice_x
1449  plt.title(nice_title, size='x-large', fontweight='bold')
1450 
1451  if save:
1452  plt.savefig(save)
1453  return plt
1454  except ImportError:
1455  LogError("Function needs numpy and matplotlib, but I could not import it.")
1456  raise
1457 
1458  def _Max(self, col):
1459  if len(self.rows)==0:
1460  return None, None
1461  idx = self.GetColIndex(col)
1462  col_type = self.col_types[idx]
1463  if col_type=='int' or col_type=='float':
1464  max_val = -float('inf')
1465  elif col_type=='bool':
1466  max_val = False
1467  elif col_type=='string':
1468  max_val = chr(0)
1469  max_idx = None
1470  for i in range(0, len(self.rows)):
1471  if self.rows[i][idx]>max_val:
1472  max_val = self.rows[i][idx]
1473  max_idx = i
1474  return max_val, max_idx
1475 
1476  def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True, xlabels_rotation='horizontal', y_title=None, title=None,
1477  colors=None, width=0.8, bottom=0, legend=False, legend_names=None, show=False, save=False):
1478 
1479  """
1480  Create a barplot of the data in cols. Every column will be represented
1481  at one position. If there are several rows, the values from each column
1482  are grouped together.
1483 
1484  :param cols: List of column names. Every column will be represented as a
1485  single bar. If cols is None, every column of the table gets
1486  plotted.
1487  :type cols: :class:`list`
1488 
1489  :param rows: List of row indices. Values from given rows will be plotted
1490  in parallel at one column position. If set to None, all rows
1491  of the table will be plotted. Note, that the maximum number
1492  of rows is 7.
1493  :type rows: :class:`list`
1494 
1495  :param xlabels: Label for every col on x-axis. If set to None, the column
1496  names are used. The xlabel plotting can be suppressed by
1497  the parameter set_xlabels.
1498  :type xlabels: :class:`list`
1499 
1500  :param set_xlabels: Controls whether xlabels are plotted or not.
1501  :type set_xlabels: :class:`bool`
1502 
1503  :param xlabels_rotation: Can either be 'horizontal', 'vertical' or an
1504  integer, that describes the rotation in degrees.
1505 
1506  :param y_title: Y-axis description
1507  :type y_title: :class:`str`
1508 
1509  :param title: Title of the plot. No title appears if set to None
1510  :type title: :class:`str`
1511 
1512  :param colors: Colors of the different bars in each group. Must be a list
1513  of valid colors in matplotlib. Length of color and rows must
1514  be consistent.
1515  :type colors: :class:`list`
1516 
1517  :param width: The available space for the groups on the x-axis is divided
1518  by the exact number of groups. The parameter width is the
1519  fraction of what is actually used. If it would be 1.0 the
1520  bars of the different groups would touch each other.
1521  Value must be between [0;1]
1522  :type width: :class:`float`
1523 
1524  :param bottom: y-coordinate at which the bars start; passed on to matplotlib's bar function
1525  :type bottom: :class:`float`
1526 
1527  :param legend: Legend for color explanation, the corresponding row
1528  respectively. If set to True, legend_names must be provided.
1529  :type legend: :class:`bool`
1530 
1531  :param legend_names: List of names, that describe the differently colored
1532  bars. Length must be consistent with number of rows.
1533 
1534  :param show: If set to True, the plot is directly displayed.
1535 
1536  :param save: If set, a png image with name save in the current working
1537  directory will be saved.
1538  :type save: :class:`str`
1539 
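 **Example:** grouped bars for a small table (the values are illustrative)

 .. code-block:: python

  tab = Table(['a','b','c'], 'fff', a=[1,2], b=[3,4], c=[5,6])
  plt = tab.PlotBar(cols=['a','b','c'], legend=True, legend_names=['first row','second row'])
  plt.show()
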
1540  """
1541  try:
1542  import numpy as np
1543  import matplotlib.pyplot as plt
1544  except:
1545  raise ImportError('PlotBar relies on numpy and matplotlib, but I could ' \
1546  'not import it!')
1547 
1548  standard_colors=['b','g','y','c','m','r','k']
1549  data=[]
1550 
1551  if cols==None:
1552  cols=self.col_names
1553 
1554  if width<=0 or width>1:
1555  raise ValueError('Width must be in [0;1]')
1556 
1557  if rows==None:
1558  if len(self.rows)>7:
1559  raise ValueError('Table contains too many rows to represent them at one '\
1560  'bar position in parallel. You can Select a Subtable or '\
1561  'specify the parameter rows with a list of row indices '\
1562  '(max 7)')
1563  else:
1564  rows=range(len(self.rows))
1565  else:
1566  if not isinstance(rows,list):
1567  rows=[rows]
1568  if len(rows)>7:
1569  raise ValueError('Too many rows to represent (max 7). Please note, that '\
1570  'data from multiple rows from one column gets '\
1571  'represented at one position in parallel.')
1572 
1573  for r_idx in rows:
1574  row=self.rows[r_idx]
1575  temp=list()
1576  for c in cols:
1577  try:
1578  c_idx=self.GetColIndex(c)
1579  except:
1580  raise ValueError('Cannot find column with name '+str(c))
1581  temp.append(row[c_idx])
1582  data.append(temp)
1583 
1584  if colors==None:
1585  colors=standard_colors[:len(rows)]
1586 
1587  if len(rows)!=len(colors):
1588  raise ValueError("Number of rows and number of colors must be consistent!")
1589 
1590  ind=np.arange(len(data[0]))
1591  single_bar_width=float(width)/len(data)
1592 
1593  fig=plt.figure()
1594  ax=fig.add_subplot(111)
1595  legend_data=[]
1596 
1597  for i in range(len(data)):
1598  legend_data.append(ax.bar(ind+i*single_bar_width+(1-width)/2,data[i],single_bar_width,bottom=bottom,color=colors[i])[0])
1599 
1600  if title!=None:
1601  ax.set_title(title, size='x-large', fontweight='bold')
1602 
1603  if y_title!=None:
1604  nice_y=y_title
1605  else:
1606  nice_y="value"
1607  ax.set_ylabel(nice_y)
1608 
1609  if xlabels:
1610  if len(data[0])!=len(xlabels):
1611  raise ValueError('Number of xlabels is not consistent with number of cols!')
1612  else:
1613  xlabels=cols
1614 
1615  if set_xlabels:
1616  ax.set_xticks(ind+0.5)
1617  ax.set_xticklabels(xlabels, rotation = xlabels_rotation)
1618  else:
1619  ax.set_xticks([])
1620 
1621  if legend == True:
1622  if legend_names==None:
1623  raise ValueError('You must provide legend names! e.g. names for the rows, '\
1624  'that are printed in parallel.')
1625  if len(legend_names)!=len(data):
1626  raise ValueError('length of legend_names must be consistent with number '\
1627  'of plotted rows!')
1628  ax.legend(legend_data, legend_names)
1629 
1630  if save:
1631  plt.savefig(save)
1632 
1633  if show:
1634  plt.show()
1635 
1636  return plt
1637 
1638  def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log',
1639  colormap='jet', show_scalebar=False, scalebar_label=None, clear=True, save=False, show=False):
1640 
1641  """
1642  Create a heatplot of the data in col x vs the data in col y using matplotlib
1643 
1644  :param x: column name with x data
1645  :type x: :class:`str`
1646 
1647  :param y: column name with y data
1648  :type y: :class:`str`
1649 
1650  :param title: title of the plot, will be generated automatically if set to None
1651  :type title: :class:`str`
1652 
1653  :param x_title: label of x-axis, will be generated automatically if set to None
1654  :type x_title: :class:`str`
1655 
1656  :param y_title: label of y-axis, will be generated automatically if set to None
1657  :type y_title: :class:`str`
1658 
1659  :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1660  :type x_range: :class:`list` of length two
1661 
1662  :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1663  :type y_range: :class:`list` of length two
1664 
1665  :param binning: type of binning. If set to None, the value of a hexbin will
1666  correspond to the number of datapoints falling into it. If
1667  set to 'log', the value will be the log with base 10 of the above
1668  value (log(i+1)). If an integer is provided, the number of a
1669  hexbin is equal to the number of datapoints falling into it divided
1670  by the integer. If a list of values is provided, these values
1671  will be the lower bounds of the bins.
1672 
1673  :param colormap: colormap that will be used. The value can be any colormap defined
1674  in matplotlib or a user-defined colormap. You can either pass a
1675  string with the name of the matplotlib colormap or a colormap
1676  object.
1677 
1678  :param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
1679  :type show_scalebar: :class:`bool`
1680 
1681  :param scalebar_label: Label of the scalebar
1682  :type scalebar_label: :class:`str`
1683 
1684  :param clear: clear old data from plot
1685  :type clear: :class:`bool`
1686 
1687  :param save: filename for saving plot
1688  :type save: :class:`str`
1689 
1690  :param show: directly show plot
1691  :type show: :class:`bool`
1692 
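 **Example:** (the data values are illustrative)

 .. code-block:: python

  xs = [math.cos(i*0.1) for i in range(500)]
  ys = [math.sin(i*0.13) for i in range(500)]
  tab = Table(['x','y'], 'ff', x=xs, y=ys)
  plt = tab.PlotHexbin('x', 'y', binning='log', show_scalebar=True)
  plt.show()
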
1693  """
1694 
1695  try:
1696  import matplotlib.pyplot as plt
1697  import matplotlib.cm as cm
1698  except:
1699  raise ImportError('PlotHexbin relies on matplotlib, but I could not import it')
1700 
1701  idx=self.GetColIndex(x)
1702  idy=self.GetColIndex(y)
1703  xdata=[]
1704  ydata=[]
1705 
1706  for r in self.rows:
1707  if r[idx]!=None and r[idy]!=None:
1708  xdata.append(r[idx])
1709  ydata.append(r[idy])
1710 
1711  if clear:
1712  plt.clf()
1713 
1714  if x_title!=None:
1715  nice_x=x_title
1716  else:
1717  nice_x=MakeTitle(x)
1718 
1719  if y_title!=None:
1720  nice_y=y_title
1721  else:
1722  nice_y=MakeTitle(y)
1723 
1724  if title==None:
1725  title = '%s vs. %s' % (nice_x, nice_y)
1726 
1727  if IsStringLike(colormap):
1728  colormap=getattr(cm, colormap)
1729 
1730  if x_range and (IsScalar(x_range) or len(x_range)!=2):
1731  raise ValueError('parameter x_range must contain exactly two elements')
1732  if y_range and (IsScalar(y_range) or len(y_range)!=2):
1733  raise ValueError('parameter y_range must contain exactly two elements')
1734 
1735  ext = [min(xdata),max(xdata),min(ydata),max(ydata)]
1736 
1737  if x_range:
1738  plt.xlim((x_range[0], x_range[1]))
1739  ext[0]=x_range[0]
1740  ext[1]=x_range[1]
1741  if y_range:
1742  plt.ylim(y_range[0], y_range[1])
1743  ext[2]=y_range[0]
1744  ext[3]=y_range[1]
1745 
1746 
1747  plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)
1748 
1749  plt.title(title, size='x-large', fontweight='bold',
1750  verticalalignment='bottom')
1751 
1752  plt.xlabel(nice_x)
1753  plt.ylabel(nice_y)
1754 
1755  if show_scalebar:
1756  cb=plt.colorbar()
1757  if scalebar_label:
1758  cb.set_label(scalebar_label)
1759 
1760  if save:
1761  plt.savefig(save)
1762 
1763  if show:
1764  plt.show()
1765 
1766  return plt
1767 
1768  def MaxRow(self, col):
1769  """
1770  Returns the row containing the cell with the maximal value in col. If
1771  several rows have the highest value, only the first one is returned.
1772  ''None'' values are ignored.
1773 
1774  :param col: column name
1775  :type col: :class:`str`
1776 
1777  :returns: row with maximal col value or None if the table is empty
1778  """
1779  val, idx = self._Max(col)
1780  if idx!=None:
1781  return self.rows[idx]
1782 
1783  def Max(self, col):
1784  """
1785  Returns the maximum value in col. If several rows have the highest value,
1786  only the first one is returned. ''None'' values are ignored.
1787 
1788  :param col: column name
1789  :type col: :class:`str`
1790  """
1791  val, idx = self._Max(col)
1792  return val
1793 
1794  def MaxIdx(self, col):
1795  """
1796  Returns the row index of the cell with the maximal value in col. If
1797  several rows have the highest value, only the first one is returned.
1798  ''None'' values are ignored.
1799 
1800  :param col: column name
1801  :type col: :class:`str`
1802  """
1803  val, idx = self._Max(col)
1804  return idx
1805 
1806  def _Min(self, col):
1807  if len(self.rows)==0:
1808  return None, None
1809  idx=self.GetColIndex(col)
1810  col_type = self.col_types[idx]
1811  if col_type=='int' or col_type=='float':
1812  min_val=float('inf')
1813  elif col_type=='bool':
1814  min_val=True
1815  elif col_type=='string':
1816  min_val=chr(255)
1817  min_idx=None
1818  for i,row in enumerate(self.rows):
1819  if row[idx]!=None and row[idx]<min_val:
1820  min_val=row[idx]
1821  min_idx=i
1822  return min_val, min_idx
1823 
1824  def Min(self, col):
1825  """
1826  Returns the minimal value in col. If several rows have the lowest value,
1827  only the first one is returned. ''None'' values are ignored.
1828 
1829  :param col: column name
1830  :type col: :class:`str`
1831  """
1832  val, idx = self._Min(col)
1833  return val
1834 
1835  def MinRow(self, col):
1836  """
1837  Returns the row containing the cell with the minimal value in col. If
1838  several rows have the lowest value, only the first one is returned.
1839  ''None'' values are ignored.
1840 
1841  :param col: column name
1842  :type col: :class:`str`
1843 
1844  :returns: row with minimal col value or None if the table is empty
1845  """
1846  val, idx = self._Min(col)
1847  if idx!=None:
1848  return self.rows[idx]
1849 
1850  def MinIdx(self, col):
1851  """
1852  Returns the row index of the cell with the minimal value in col. If
1853  several rows have the lowest value, only the first one is returned.
1854  ''None'' values are ignored.
1855 
1856  :param col: column name
1857  :type col: :class:`str`
1858  """
1859  val, idx = self._Min(col)
1860  return idx
1861 
1862  def Sum(self, col):
1863  """
1864  Returns the sum of the given column. Cells with ''None'' are ignored. Returns
1865  0.0, if the column doesn't contain any elements. Col must be of numeric
1866  column type ('float', 'int') or boolean column type.
1867 
1868  :param col: column name
1869  :type col: :class:`str`
1870 
1871  :raises: :class:`TypeError` if column type is ``string``
1872  """
1873  idx = self.GetColIndex(col)
1874  col_type = self.col_types[idx]
1875  if col_type!='int' and col_type!='float' and col_type!='bool':
1876  raise TypeError("Sum can only be used on numeric column types")
1877  s = 0.0
1878  for r in self.rows:
1879  if r[idx]!=None:
1880  s += r[idx]
1881  return s
1882 
1883  def Mean(self, col):
1884  """
1885  Returns the mean of the given column. Cells with ''None'' are ignored. Returns
1886  None, if the column doesn't contain any elements. Col must be of numeric
1887  ('float', 'int') or boolean column type.
1888 
1889  If the column type is *bool*, the function returns the ratio of the
1890  number of 'True' values to the total number of elements.
1891 
1892  :param col: column name
1893  :type col: :class:`str`
1894 
1895  :raises: :class:`TypeError` if column type is ``string``
1896  """
1897  idx = self.GetColIndex(col)
1898  col_type = self.col_types[idx]
1899  if col_type!='int' and col_type!='float' and col_type!='bool':
1900  raise TypeError("Mean can only be used on numeric or bool column types")
1901 
1902  vals=[]
1903  for v in self[col]:
1904  if v!=None:
1905  vals.append(v)
1906  try:
1907  return stutil.Mean(vals)
1908  except:
1909  return None
1910 
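# Usage sketch for Sum() and Mean() (illustrative, hypothetical data):
#
#   tab = Table(['x'], ['float'])
#   for v in (1.0, 2.0, 3.0):
#     tab.AddRow([v])
#   tab.Sum('x')   # -> 6.0
#   tab.Mean('x')  # -> 2.0 (for a bool column: the fraction of True values)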
1911  def RowMean(self, mean_col_name, cols):
1912  """
1913  Adds a new column of type 'float' with a specified name (*mean_col_name*),
1914  containing the mean of all specified columns for each row.
1915 
1916  Cols are specified by their names and must be of numeric column
1917  type ('float', 'int') or boolean column type. Cells with None are ignored.
1918  Adds ''None'' if the row doesn't contain any values.
1919 
1920  :param mean_col_name: name of new column containing mean values
1921  :type mean_col_name: :class:`str`
1922 
1923  :param cols: name or list of names of columns to include in computation of
1924  mean
1925  :type cols: :class:`str` or :class:`list` of strings
1926 
1927  :raises: :class:`TypeError` if the column type of any column in *cols* is ``string``
1928 
1929  **Example:**
1930 
1931  Starting with the following table:
1932 
1933  ==== ==== ====
1934  x y u
1935  ==== ==== ====
1936  1 10 100
1937  2 15 None
1938  3 20 400
1939  ==== ==== ====
1940 
1941  the code here adds a column with the name 'mean' to yield the table below:
1942 
1943  .. code-block:: python
1944 
1945  tab.RowMean('mean', ['x', 'u'])
1946 
1947 
1948  ==== ==== ==== =====
1949  x y u mean
1950  ==== ==== ==== =====
1951  1 10 100 50.5
1952  2 15 None 2
1953  3 20 400 201.5
1954  ==== ==== ==== =====
1955 
1956  """
1957 
1958  if IsScalar(cols):
1959  cols = [cols]
1960 
1961  cols_idxs = []
1962  for col in cols:
1963  idx = self.GetColIndex(col)
1964  col_type = self.col_types[idx]
1965  if col_type!='int' and col_type!='float' and col_type!='bool':
1966  raise TypeError("RowMean can only be used on numeric column types")
1967  cols_idxs.append(idx)
1968 
1969  mean_rows = []
1970  for row in self.rows:
1971  vals = []
1972  for idx in cols_idxs:
1973  v = row[idx]
1974  if v!=None:
1975  vals.append(v)
1976  try:
1977  mean = stutil.Mean(vals)
1978  mean_rows.append(mean)
1979  except:
1980  mean_rows.append(None)
1981 
1982  self.AddCol(mean_col_name, 'f', mean_rows)
1983 
1984  def Percentiles(self, col, nths):
1985  """
1986  Returns the percentiles of column *col* given in *nths*.
1987 
1988  The percentiles are calculated as
1989 
1990  .. code-block:: python
1991 
1992  values[min(len(values)-1, int(round(len(values)*nth/100.0+0.5)-1))]
1993 
1994  where values are the sorted values of *col* not equal to ''None''
1995 
1996  :param col: column name
1997  :type col: :class:`str`
1998  :param nths: list of percentiles to be calculated. Each percentile is a
1999  number between 0 and 100.
2000  :type nths: :class:`list` of numbers
2001 
2002  :raises: :class:`TypeError` if column type is ``string``
2003  :returns: List of percentiles in the same order as given in *nths*
2004  """
2005  idx = self.GetColIndex(col)
2006  col_type = self.col_types[idx]
2007  if col_type!='int' and col_type!='float' and col_type!='bool':
2008  raise TypeError("Median can only be used on numeric column types")
2009 
2010  for nth in nths:
2011  if nth < 0 or nth > 100:
2012  raise ValueError("percentiles must be between 0 and 100")
2013  vals=[]
2014  for v in self[col]:
2015  if v!=None:
2016  vals.append(v)
2017  vals=sorted(vals)
2018  if len(vals)==0:
2019  return [None]*len(nths)
2020  percentiles=[]
2021 
2022  for nth in nths:
2023  p=vals[min(len(vals)-1, int(round(len(vals)*nth/100.0+0.5)-1))]
2024  percentiles.append(p)
2025  return percentiles
2026 
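# Usage sketch for Percentiles() (illustrative, hypothetical data); the
# expected results follow the index formula given in the docstring above:
#
#   tab = Table(['x'], ['int'])
#   for v in (1, 2, 3, 4, 5):
#     tab.AddRow([v])
#   tab.Percentiles('x', [0, 50, 100])  # -> [1, 3, 5]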
2027  def Median(self, col):
2028  """
2029  Returns the median of the given column. Cells with ''None'' are ignored. Returns
2030  ''None'', if the column doesn't contain any elements. Col must be of numeric
2031  column type ('float', 'int') or boolean column type.
2032 
2033  :param col: column name
2034  :type col: :class:`str`
2035 
2036  :raises: :class:`TypeError` if column type is ``string``
2037  """
2038  idx = self.GetColIndex(col)
2039  col_type = self.col_types[idx]
2040  if col_type!='int' and col_type!='float' and col_type!='bool':
2041  raise TypeError("Median can only be used on numeric column types")
2042 
2043  vals=[]
2044  for v in self[col]:
2045  if v!=None:
2046  vals.append(v)
2047 
2048  try:
2049  return stutil.Median(vals)
2050  except:
2051  return None
2052 
2053  def StdDev(self, col):
2054  """
2055  Returns the standard deviation of the given column. Cells with ''None'' are
2056  ignored. Returns ''None'', if the column doesn't contain any elements. Col must
2057  be of numeric column type ('float', 'int') or boolean column type.
2058 
2059  :param col: column name
2060  :type col: :class:`str`
2061 
2062  :raises: :class:`TypeError` if column type is ``string``
2063  """
2064  idx = self.GetColIndex(col)
2065  col_type = self.col_types[idx]
2066  if col_type!='int' and col_type!='float' and col_type!='bool':
2067  raise TypeError("StdDev can only be used on numeric column types")
2068 
2069  vals=[]
2070  for v in self[col]:
2071  if v!=None:
2072  vals.append(v)
2073  try:
2074  return stutil.StdDev(vals)
2075  except:
2076  return None
2077 
2078  def Count(self, col, ignore_nan=True):
2079  """
2080  Count the number of cells in column that are not equal to ''None''.
2081 
2082  :param col: column name
2083  :type col: :class:`str`
2084 
2085  :param ignore_nan: ignore all *None* values
2086  :type ignore_nan: :class:`bool`
2087  """
2088  count=0
2089  idx=self.GetColIndex(col)
2090  for r in self.rows:
2091  if ignore_nan:
2092  if r[idx]!=None:
2093  count+=1
2094  else:
2095  count+=1
2096  return count
2097 
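# Usage sketch for Median(), StdDev() and Count() (illustrative, hypothetical
# data; the standard deviation is whatever stutil.StdDev computes):
#
#   tab = Table(['x'], ['float'])
#   for v in (1.0, 2.0, 3.0):
#     tab.AddRow([v])
#   tab.Median('x')  # -> 2.0
#   tab.StdDev('x')  # -> standard deviation of the non-None values
#   tab.Count('x')   # -> 3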
2098  def Correl(self, col1, col2):
2099  """
2100  Calculate the Pearson correlation coefficient between *col1* and *col2*, only
2101  taking rows into account where both of the values are not equal to *None*.
2102  If there are not enough data points to calculate a correlation coefficient,
2103  *None* is returned.
2104 
2105  :param col1: column name for first column
2106  :type col1: :class:`str`
2107 
2108  :param col2: column name for second column
2109  :type col2: :class:`str`
2110  """
2111  if IsStringLike(col1) and IsStringLike(col2):
2112  col1 = self.GetColIndex(col1)
2113  col2 = self.GetColIndex(col2)
2114  vals1, vals2=([],[])
2115  for v1, v2 in zip(self[col1], self[col2]):
2116  if v1!=None and v2!=None:
2117  vals1.append(v1)
2118  vals2.append(v2)
2119  try:
2120  return stutil.Correl(vals1, vals2)
2121  except:
2122  return None
2123 
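# Usage sketch for Correl() and SpearmanCorrel() (illustrative, hypothetical
# data; SpearmanCorrel additionally requires scipy.stats.mstats):
#
#   tab = Table(['a', 'b'], ['float', 'float'])
#   for a, b in ((1.0, 2.0), (2.0, 4.0), (3.0, 6.0)):
#     tab.AddRow([a, b])
#   tab.Correl('a', 'b')          # -> 1.0 for perfectly correlated data
#   tab.SpearmanCorrel('a', 'b')  # -> 1.0 as well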
2124  def SpearmanCorrel(self, col1, col2):
2125  """
2126  Calculate the Spearman correlation coefficient between col1 and col2, only
2127  taking rows into account where both of the values are not equal to None. If
2128  there are not enough data points to calculate a correlation coefficient,
2129  None is returned.
2130 
2131  :warning: The function depends on the following module: *scipy.stats.mstats*
2132 
2133  :param col1: column name for first column
2134  :type col1: :class:`str`
2135 
2136  :param col2: column name for second column
2137  :type col2: :class:`str`
2138  """
2139  try:
2140  import scipy.stats.mstats
2141 
2142  if IsStringLike(col1) and IsStringLike(col2):
2143  col1 = self.GetColIndex(col1)
2144  col2 = self.GetColIndex(col2)
2145  vals1, vals2=([],[])
2146  for v1, v2 in zip(self[col1], self[col2]):
2147  if v1!=None and v2!=None:
2148  vals1.append(v1)
2149  vals2.append(v2)
2150  try:
2151  correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
2152  if scipy.isnan(correl):
2153  return None
2154  return correl
2155  except:
2156  return None
2157 
2158  except ImportError:
2159  LogError("Function needs scipy.stats.mstats, but I could not import it.")
2160  raise
2161 
2162 
2163  def Save(self, stream_or_filename, format='ost', sep=','):
2164  """
2165  Save the table to stream or filename. The following file formats
2166  are supported (for more information on file formats, see :meth:`Load`):
2167 
2168  ============= =======================================
2169  ost ost-specific format (human readable)
2170  csv comma separated values (human readable)
2171  pickle pickled byte stream (binary)
2172  html HTML table
2173  context ConTeXt table
2174  ============= =======================================
2175 
2176  :param stream_or_filename: filename or stream for writing output
2177  :type stream_or_filename: :class:`str` or :class:`file`
2178 
2179  :param format: output format (one of *ost*, *csv*, *pickle*, *html* or *context*)
2180  :type format: :class:`str`
2181 
2182  :raises: :class:`ValueError` if format is unknown
2183  """
2184  format=format.lower()
2185  if format=='ost':
2186  return self._SaveOST(stream_or_filename)
2187  if format=='csv':
2188  return self._SaveCSV(stream_or_filename, sep=sep)
2189  if format=='pickle':
2190  return self._SavePickle(stream_or_filename)
2191  if format=='html':
2192  return self._SaveHTML(stream_or_filename)
2193  if format=='context':
2194  return self._SaveContext(stream_or_filename)
2195  raise ValueError('unknown format "%s"' % format)
2196 
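# Usage sketch for Save() (illustrative; 'scores.ost', 'scores.csv' and
# 'scores.html' are hypothetical file names):
#
#   tab.Save('scores.ost')                         # ost format (default)
#   tab.Save('scores.csv', format='csv', sep=';')  # csv with custom separator
#   tab.Save(open('scores.html', 'w'), format='html')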
2197  def _SavePickle(self, stream):
2198  if not hasattr(stream, 'write'):
2199  stream=open(stream, 'wb')
2200  cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
2201 
2202  def _SaveHTML(self, stream_or_filename):
2203  def _escape(s):
2204  return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
2205 
2206  file_opened = False
2207  if not hasattr(stream_or_filename, 'write'):
2208  stream = open(stream_or_filename, 'w')
2209  file_opened = True
2210  else:
2211  stream = stream_or_filename
2212  stream.write('<table>')
2213  stream.write('<tr>')
2214  for col_name in self.col_names:
2215  stream.write('<th>%s</th>' % _escape(col_name))
2216  stream.write('</tr>')
2217  for row in self.rows:
2218  stream.write('<tr>')
2219  for i, col in enumerate(row):
2220  val = ''
2221  if col != None:
2222  if self.col_types[i] == 'float':
2223  val = '%.3f' % col
2224  elif self.col_types[i] == 'int':
2225  val = '%d' % col
2226  elif self.col_types[i] == 'bool':
2227  val = col and 'true' or 'false'
2228  else:
2229  val = str(col)
2230  stream.write('<td>%s</td>' % _escape(val))
2231  stream.write('</tr>')
2232  stream.write('</table>')
2233  if file_opened:
2234  stream.close()
2235  def _SaveContext(self, stream_or_filename):
2236  file_opened = False
2237  if not hasattr(stream_or_filename, 'write'):
2238  stream = open(stream_or_filename, 'w')
2239  file_opened = True
2240  else:
2241  stream = stream_or_filename
2242  stream.write('\\starttable[')
2243  for col_type in self.col_types:
2244  if col_type =='string':
2245  stream.write('l|')
2246  elif col_type=='int':
2247  stream.write('r|')
2248  elif col_type =='float':
2249  stream.write('i3r|')
2250  else:
2251  stream.write('l|')
2252  stream.write(']\n\\HL\n')
2253  for col_name in self.col_names:
2254  stream.write('\\NC \\bf %s' % col_name)
2255  stream.write(' \\AR\\HL\n')
2256  for row in self.rows:
2257  for i, col in enumerate(row):
2258  val = '---'
2259  if col != None:
2260  if self.col_types[i] == 'float':
2261  val = '%.3f' % col
2262  elif self.col_types[i] == 'int':
2263  val = '%d' % col
2264  elif self.col_types[i] == 'bool':
2265  val = col and 'true' or 'false'
2266  else:
2267  val = str(col)
2268  stream.write('\\NC %s' % val)
2269  stream.write(' \\AR\n')
2270  stream.write('\\HL\n')
2271  stream.write('\\stoptable')
2272  if file_opened:
2273  stream.close()
2274 
2275  def _SaveCSV(self, stream, sep):
2276  if not hasattr(stream, 'write'):
2277  stream=open(stream, 'wb')
2278 
2279  writer=csv.writer(stream, delimiter=sep)
2280  writer.writerow(['%s' % n for n in self.col_names])
2281  for row in self.rows:
2282  row=list(row)
2283  for i, c in enumerate(row):
2284  if c==None:
2285  row[i]='NA'
2286  writer.writerow(row)
2287 
2288  def _SaveOST(self, stream):
2289  if hasattr(stream, 'write'):
2290  writer=csv.writer(stream, delimiter=' ')
2291  else:
2292  stream=open(stream, 'w')
2293  writer=csv.writer(stream, delimiter=' ')
2294  if self.comment:
2295  stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
2296  writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
2297  for row in self.rows:
2298  row=list(row)
2299  for i, c in enumerate(row):
2300  if c==None:
2301  row[i]='NA'
2302  writer.writerow(row)
2303 
2304 
2305  def GetNumpyMatrix(self, *args):
2306  '''
2307  Returns a numpy matrix containing the selected columns from the table as
2308  columns in the matrix.
2309 
2310  Only columns of type *int* or *float* are supported. *NA* values in the
2311  table will be converted to *None* values.
2312 
2313  :param \*args: column names to include in numpy matrix
2314 
2315  :warning: The function depends on *numpy*
2316  '''
2317  try:
2318  import numpy as np
2319 
2320  if len(args)==0:
2321  raise RuntimeError("At least one column must be specified.")
2322 
2323  idxs = []
2324  for arg in args:
2325  idx = self.GetColIndex(arg)
2326  col_type = self.col_types[idx]
2327  if col_type!='int' and col_type!='float':
2328  raise TypeError("Numpy matrix can only be generated from numeric column types")
2329  idxs.append(idx)
2330  m = np.matrix([list(self[i]) for i in idxs])
2331  return m.T
2332 
2333  except ImportError:
2334  LogError("Function needs numpy, but I could not import it.")
2335  raise
2336 
2337 
2338 
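# Usage sketch for GetNumpyMatrix() (illustrative; assumes numpy is available
# and 'x'/'y' are numeric columns of a hypothetical table tab):
#
#   m = tab.GetNumpyMatrix('x', 'y')
#   # m is an N x 2 numpy matrix; m[:,0] holds column 'x', m[:,1] column 'y'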
2339  def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
2340 
2341  '''
2342  In-place Gaussian smoothing of a column in the table with a given standard
2343  deviation. All *None* values are set to *na_value* before smoothing.
2344 
2345  :param col: column name
2346  :type col: :class:`str`
2347 
2348  :param std: standard deviation for gaussian kernel
2349  :type std: `scalar`
2350 
2351  :param na_value: all NA (None) values of the specified column are set to na_value before smoothing
2352  :type na_value: `scalar`
2353 
2354  :param padding: padding mode; see the scipy ndimage.gaussian_filter1d documentation for more information. Default is 'reflect'
2355  :type padding: :class:`str`
2356 
2357  :param c: constant value used for padding if padding mode is constant
2358  :type c: `scalar`
2359 
2360 
2361 
2362  :warning: The function depends on *scipy*
2363  '''
2364 
2365  try:
2366  from scipy import ndimage
2367  import numpy as np
2368  except ImportError:
2369  LogError("I need scipy.ndimage and numpy, but could not import it")
2370  raise
2371 
2372  idx = self.GetColIndex(col)
2373  col_type = self.col_types[idx]
2374  if col_type!='int' and col_type!='float':
2375  raise TypeError("GaussianSmooth can only be used on numeric column types")
2376 
2377  vals=[]
2378  for v in self[col]:
2379  if v!=None:
2380  vals.append(v)
2381  else:
2382  vals.append(na_value)
2383 
2384 
2385  smoothed_values_ndarray=ndimage.gaussian_filter1d(vals,std, mode=padding, cval=c)
2386 
2387  result=[]
2388 
2389  for v in smoothed_values_ndarray:
2390  result.append(v)
2391 
2392  self[col]=result
2393 
2394 
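# Usage sketch for GaussianSmooth() (illustrative; smooths the hypothetical
# numeric column 'signal' in place and requires scipy and numpy):
#
#   tab.GaussianSmooth('signal', std=2.0, na_value=0.0, padding='constant', c=0.0)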
2395  def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
2396  '''
2397  This returns the optimal prefactor values (i.e. a, b, c, ...) for the
2398  following equation
2399 
2400  .. math::
2401  :label: op1
2402 
2403  a*u + b*v + c*w + ... = z
2404 
2405  where u, v, w and z are vectors. In matrix notation
2406 
2407  .. math::
2408  :label: op2
2409 
2410  A*p = z
2411 
2412  where A contains the data from the table (u,v,w,...), p are the prefactors
2413  to optimize (a,b,c,...) and z is the vector containing the result of
2414  equation :eq:`op1`.
2415 
2416  The parameter ref_col corresponds to z in both equations, and \*args are columns
2417  u, v and w (or A in :eq:`op2`). All columns must be specified by their names.
2418 
2419  **Example:**
2420 
2421  .. code-block:: python
2422 
2423  tab.GetOptimalPrefactors('colC', 'colA', 'colB')
2424 
2425  The function returns a list containing the prefactors a, b, c, ... in
2426  the correct order (i.e. same as columns were specified in \*args).
2427 
2428  Weighting:
2429  If the kwarg weights="columX" is specified, the equations are weighted by
2430  the values in that column. Each row is multiplied by the weight in that row,
2431  which leads to :eq:`op3`:
2432 
2433  .. math::
2434  :label: op3
2435 
2436  weight*a*u + weight*b*v + weight*c*w + ... = weight*z
2437 
2438  Weights must be float or int and can have any value. A value of 0 ignores
2439  this equation, a value of 1 means the same as no weight. If all weights are
2440  the same for each row, the same result will be obtained as with no weights.
2441 
2442  **Example:**
2443 
2444  .. code-block:: python
2445 
2446  tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
2447 
2448  '''
2449  try:
2450  import numpy as np
2451 
2452  if len(args)==0:
2453  raise RuntimeError("At least one column must be specified.")
2454 
2455  b = self.GetNumpyMatrix(ref_col)
2456  a = self.GetNumpyMatrix(*args)
2457 
2458  if len(kwargs)!=0:
2459  if 'weights' in kwargs:
2460  w = self.GetNumpyMatrix(kwargs['weights'])
2461  b = np.multiply(b,w)
2462  a = np.multiply(a,w)
2463 
2464  else:
2465  raise RuntimeError("unrecognized kwargs specified, the only supported key is 'weights'")
2466 
2467  k = (a.T*a).I*a.T*b
2468  return list(np.array(k.T).reshape(-1))
2469 
2470  except ImportError:
2471  LogError("Function needs numpy, but I could not import it.")
2472  raise
2473 
2474  def PlotEnrichment(self, score_col, class_col, score_dir='-',
2475  class_dir='-', class_cutoff=2.0,
2476  style='-', title=None, x_title=None, y_title=None,
2477  clear=True, save=None):
2478  '''
2479  Plot an enrichment curve of column *score_col*, classified according to
2480  *class_col*, using matplotlib.
2481 
2482  For more information about parameters of the enrichment, see
2483  :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.
2484 
2485  :warning: The function depends on *matplotlib*
2486  '''
2487  try:
2488  import matplotlib.pyplot as plt
2489 
2490  enrx, enry = self.ComputeEnrichment(score_col, class_col, score_dir,
2491  class_dir, class_cutoff)
2492 
2493  if not title:
2494  title = 'Enrichment of %s'%score_col
2495 
2496  if not x_title:
2497  x_title = '% database'
2498 
2499  if not y_title:
2500  y_title = '% positives'
2501 
2502  if clear:
2503  plt.clf()
2504 
2505  plt.plot(enrx, enry, style)
2506 
2507  plt.title(title, size='x-large', fontweight='bold')
2508  plt.ylabel(y_title, size='x-large')
2509  plt.xlabel(x_title, size='x-large')
2510 
2511  if save:
2512  plt.savefig(save)
2513 
2514  return plt
2515  except ImportError:
2516  LogError("Function needs matplotlib, but I could not import it.")
2517  raise
2518 
2519  def ComputeEnrichment(self, score_col, class_col, score_dir='-',
2520  class_dir='-', class_cutoff=2.0):
2521  '''
2522  Computes the enrichment of column *score_col* classified according to
2523  *class_col*.
2524 
2525  For this it is necessary that the datapoints are classified into positive
2526  and negative points. This can be done in two ways:
2527 
2528  - by using one 'bool' type column (*class_col*) which contains *True* for
2529  positives and *False* for negatives
2530 
2531  - by specifying a classification column (*class_col*), a cutoff value
2532  (*class_cutoff*) and the classification columns direction (*class_dir*).
2533  This will generate the classification on the fly
2534 
2535  * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
2536  * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives
2537 
2538  During the calculation, the table will be sorted according to *score_dir*,
2539  where a '-' value means smallest values first and therefore, the smaller
2540  the value, the better.
2541 
2542  :warning: If either the value of *class_col* or *score_col* is *None*, the
2543  data in this row is ignored.
2544  '''
2545 
2546  ALLOWED_DIR = ['+','-']
2547 
2548  score_idx = self.GetColIndex(score_col)
2549  score_type = self.col_types[score_idx]
2550  if score_type!='int' and score_type!='float':
2551  raise TypeError("Score column must be numeric type")
2552 
2553  class_idx = self.GetColIndex(class_col)
2554  class_type = self.col_types[class_idx]
2555  if class_type!='int' and class_type!='float' and class_type!='bool':
2556  raise TypeError("Classifier column must be numeric or bool type")
2557 
2558  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2559  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2560 
2561  self.Sort(score_col, score_dir)
2562 
2563  x = [0]
2564  y = [0]
2565  enr = 0
2566  old_score_val = None
2567  i = 0
2568 
2569  for row in self.rows:
2570  class_val = row[class_idx]
2571  score_val = row[score_idx]
2572  if class_val==None or score_val==None:
2573  continue
2574  if class_val!=None:
2575  if old_score_val==None:
2576  old_score_val = score_val
2577  if score_val!=old_score_val:
2578  x.append(i)
2579  y.append(enr)
2580  old_score_val = score_val
2581  i+=1
2582  if class_type=='bool':
2583  if class_val==True:
2584  enr += 1
2585  else:
2586  if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2587  enr += 1
2588  x.append(i)
2589  y.append(enr)
2590 
2591  # if no data points or no positives are found, return None
2592  if x[-1]==0 or y[-1]==0:
2593  return None
2594 
2595  x = [float(v)/x[-1] for v in x]
2596  y = [float(v)/y[-1] for v in y]
2597  return x,y
2598 
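# Usage sketch for ComputeEnrichment() (illustrative, hypothetical data):
# 'score' is a numeric score (smaller is better, score_dir='-') and 'active'
# is a bool classification column. Note that the table is sorted in place
# according to score_dir.
#
#   tab = Table(['score', 'active'], ['float', 'bool'])
#   for s, a in ((0.2, True), (0.5, False), (0.9, True), (1.5, False)):
#     tab.AddRow([s, a])
#   enr = tab.ComputeEnrichment('score', 'active', score_dir='-')
#   # enr is a tuple (x, y) of fractions suitable for plotting, or None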
2599  def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
2600  class_dir='-', class_cutoff=2.0):
2601  '''
2602  Computes the area under the curve of the enrichment using the trapezoidal
2603  rule.
2604 
2605  For more information about parameters of the enrichment, see
2606  :meth:`ComputeEnrichment`.
2607 
2608  :warning: The function depends on *numpy*
2609  '''
2610  try:
2611  import numpy as np
2612 
2613  enr = self.ComputeEnrichment(score_col, class_col, score_dir,
2614  class_dir, class_cutoff)
2615 
2616  if enr==None:
2617  return None
2618  return np.trapz(enr[1], enr[0])
2619  except ImportError:
2620  LogError("Function needs numpy, but I could not import it.")
2621  raise
2622 
2623  def ComputeROC(self, score_col, class_col, score_dir='-',
2624  class_dir='-', class_cutoff=2.0):
2625  '''
2626  Computes the receiver operating characteristics (ROC) of column *score_col*
2627  classified according to *class_col*.
2628 
2629  For this it is necessary that the datapoints are classified into positive
2630  and negative points. This can be done in two ways:
2631 
2632  - by using one 'bool' column (*class_col*) which contains True for positives
2633  and False for negatives
2634  - by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
2635  and the classification columns direction (*class_dir*). This will generate
2636  the classification on the fly
2637 
2638  - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
2639  - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives
2640 
2641  During the calculation, the table will be sorted according to *score_dir*,
2642  where a '-' value means smallest values first and therefore, the smaller
2643  the value, the better.
2644 
2645  If *class_col* does not contain any positives (i.e. no value is True for a
2646  bool column, or no value is classified as positive for an int or float
2647  column, depending on *class_dir* and *class_cutoff*), the ROC is not
2648  defined and the function will return *None*.
2649 
2650  :warning: If either the value of *class_col* or *score_col* is *None*, the
2651  data in this row is ignored.
2652  '''
2653 
2654  ALLOWED_DIR = ['+','-']
2655 
2656  score_idx = self.GetColIndex(score_col)
2657  score_type = self.col_types[score_idx]
2658  if score_type!='int' and score_type!='float':
2659  raise TypeError("Score column must be numeric type")
2660 
2661  class_idx = self.GetColIndex(class_col)
2662  class_type = self.col_types[class_idx]
2663  if class_type!='int' and class_type!='float' and class_type!='bool':
2664  raise TypeError("Classifier column must be numeric or bool type")
2665 
2666  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2667  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2668 
2669  self.Sort(score_col, score_dir)
2670 
2671  x = [0]
2672  y = [0]
2673  tp = 0
2674  fp = 0
2675  old_score_val = None
2676 
2677  for i,row in enumerate(self.rows):
2678  class_val = row[class_idx]
2679  score_val = row[score_idx]
2680  if class_val==None or score_val==None:
2681  continue
2682  if class_val!=None:
2683  if old_score_val==None:
2684  old_score_val = score_val
2685  if score_val!=old_score_val:
2686  x.append(fp)
2687  y.append(tp)
2688  old_score_val = score_val
2689  if class_type=='bool':
2690  if class_val==True:
2691  tp += 1
2692  else:
2693  fp += 1
2694  else:
2695  if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2696  tp += 1
2697  else:
2698  fp += 1
2699  x.append(fp)
2700  y.append(tp)
2701 
2702  # if no false positives or no true positives are found, return None
2703  if x[-1]==0 or y[-1]==0:
2704  return None
2705 
2706  x = [float(v)/x[-1] for v in x]
2707  y = [float(v)/y[-1] for v in y]
2708  return x,y
2709 
2710  def ComputeROCAUC(self, score_col, class_col, score_dir='-',
2711  class_dir='-', class_cutoff=2.0):
2712  '''
2713  Computes the area under the curve of the receiver operating characteristics
2714  using the trapezoidal rule.
2715 
2716  For more information about parameters of the ROC, see
2717  :meth:`ComputeROC`.
2718 
2719  :warning: The function depends on *numpy*
2720  '''
2721  try:
2722  import numpy as np
2723 
2724  roc = self.ComputeROC(score_col, class_col, score_dir,
2725  class_dir, class_cutoff)
2726 
2727  if not roc:
2728  return None
2729  return np.trapz(roc[1], roc[0])
2730  except ImportError:
2731  LogError("Function needs numpy, but I could not import it.")
2732  raise
2733 
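# Usage sketch for ComputeROC() and ComputeROCAUC() (illustrative; reuses the
# hypothetical 'score'/'active' table from the ComputeEnrichment sketch above):
#
#   roc = tab.ComputeROC('score', 'active', score_dir='-')
#   if roc:
#     fpr, tpr = roc                            # false/true positive rates
#   auc = tab.ComputeROCAUC('score', 'active')  # needs numpy; None if undefined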
2734  def ComputeLogROCAUC(self, score_col, class_col, score_dir='-',
2735  class_dir='-', class_cutoff=2.0):
2736  '''
2737  Computes the area under the curve of the log receiver operating
2738  characteristics (logROC) where the x-axis is semilogarithmic
2739  using the trapezoidal rule.
2740 
2741  The logROC is computed with a lambda of 0.001 according to
2742  Rapid Context-Dependent Ligand Desolvation in Molecular Docking
2743  Mysinger M. and Shoichet B., Journal of Chemical Information and Modeling
2744  2010 50 (9), 1561-1573
2745 
2746  For more information about parameters of the ROC, see
2747  :meth:`ComputeROC`.
2748 
2749  :warning: The function depends on *numpy*
2750  '''
2751  try:
2752  import numpy as np
2753 
2754  roc = self.ComputeROC(score_col, class_col, score_dir,
2755  class_dir, class_cutoff)
2756 
2757  if not roc:
2758  return None
2759 
2760  rocxt, rocyt = roc
2761  rocx=[]
2762  rocy=[]
2763 
2764  # define lambda
2765  l=0.001
2766 
2767  # set x-values of 0 to lambda, then remove duplicate x-values below
2768  rocxt = [x if x>0 else l for x in rocxt]
2769  for i in range(len(rocxt)-1):
2770  if rocxt[i]==rocxt[i+1]:
2771  continue
2772  rocx.append(rocxt[i])
2773  rocy.append(rocyt[i])
2774  rocx.append(1.0)
2775  rocy.append(1.0)
2776 
2777  # compute logauc
2778  value = 0
2779  for i in range(len(rocx)-1):
2780  x = rocx[i]
2781  if rocx[i]==rocx[i+1]:
2782  continue
2783  b = rocy[i+1]-rocx[i+1]*((rocy[i+1]-rocy[i])/(rocx[i+1]-rocx[i]))
2784  value += ((rocy[i+1]-rocy[i])/math.log(10))+b*(math.log10(rocx[i+1])-math.log10(rocx[i]))
2785  return value/math.log10(1.0/l)
2786 
2787  except ImportError:
2788  LogError("Function needs numpy, but I could not import it.")
2789  raise
2790 
2791  def PlotROC(self, score_col, class_col, score_dir='-',
2792  class_dir='-', class_cutoff=2.0,
2793  style='-', title=None, x_title=None, y_title=None,
2794  clear=True, save=None):
2795  '''
2796  Plot an ROC curve using matplotlib.
2797 
2798  For more information about parameters of the ROC, see
2799  :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2800 
2801  :warning: The function depends on *matplotlib*
2802  '''
2803 
2804  try:
2805  import matplotlib.pyplot as plt
2806 
2807  roc = self.ComputeROC(score_col, class_col, score_dir,
2808  class_dir, class_cutoff)
2809 
2810  if not roc:
2811  return None
2812 
2813  enrx, enry = roc
2814 
2815  if not title:
2816  title = 'ROC of %s'%score_col
2817 
2818  if not x_title:
2819  x_title = 'false positive rate'
2820 
2821  if not y_title:
2822  y_title = 'true positive rate'
2823 
2824  if clear:
2825  plt.clf()
2826 
2827  plt.plot(enrx, enry, style)
2828 
2829  plt.title(title, size='x-large', fontweight='bold')
2830  plt.ylabel(y_title, size='x-large')
2831  plt.xlabel(x_title, size='x-large')
2832 
2833  if save:
2834  plt.savefig(save)
2835 
2836  return plt
2837  except ImportError:
2838  LogError("Function needs matplotlib, but I could not import it.")
2839  raise
2840 
2841  def PlotLogROC(self, score_col, class_col, score_dir='-',
2842  class_dir='-', class_cutoff=2.0,
2843  style='-', title=None, x_title=None, y_title=None,
2844  clear=True, save=None):
2845  '''
2846  Plot a logROC curve with a semilogarithmic x-axis using matplotlib.
2847 
2848  For more information about parameters of the ROC, see
2849  :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2850 
2851  :warning: The function depends on *matplotlib*
2852  '''
2853 
2854  try:
2855  import matplotlib.pyplot as plt
2856 
2857  roc = self.ComputeROC(score_col, class_col, score_dir,
2858  class_dir, class_cutoff)
2859 
2860  if not roc:
2861  return None
2862 
2863  rocx, rocy = roc
2864 
2865  if not title:
2866  title = 'logROC of %s'%score_col
2867 
2868  if not x_title:
2869  x_title = 'false positive rate'
2870 
2871  if not y_title:
2872  y_title = 'true positive rate'
2873 
2874  if clear:
2875  plt.clf()
2876 
2877  rocx = [x if x>0 else 0.001 for x in rocx]
2878 
2879 
2880  plt.plot(rocx, rocy, style)
2881 
2882  plt.title(title, size='x-large', fontweight='bold')
2883  plt.ylabel(y_title, size='x-large')
2884  plt.xlabel(x_title, size='x-large')
2885 
2886  plt.xscale('log', basex=10)
2887  plt.xlim(0.001, 1.0)
2888 
2889 
2890  if save:
2891  plt.savefig(save)
2892 
2893  return plt
2894  except ImportError:
2895  LogError("Function needs matplotlib, but I could not import it.")
2896  raise
2897 
2898  def ComputeMCC(self, score_col, class_col, score_dir='-',
2899  class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
2900  '''
2901  Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
2902  with the points classified into true positives, false positives, true
2903  negatives and false negatives according to a specified classification
2904  column (*class_col*).
2905 
2906  The datapoints in *score_col* and *class_col* are classified into
2907  positive and negative points. This can be done in two ways:
2908 
2909  - by using 'bool' columns which contain True for positives and False
2910  for negatives
2911 
2912  - by using 'float' or 'int' columns and specifying a cutoff value and the
2913  columns direction. This will generate the classification on the fly
2914 
2915  * if ``class_dir``/``score_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
2916  * if ``class_dir``/``score_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
2917 
2918  The two possibilities can be used together, i.e. 'bool' type for one column
2919  and 'float'/'int' type and cutoff/direction for the other column.
2920  '''
2921  ALLOWED_DIR = ['+','-']
2922 
2923  score_idx = self.GetColIndex(score_col)
2924  score_type = self.col_types[score_idx]
2925  if score_type!='int' and score_type!='float' and score_type!='bool':
2926  raise TypeError("Score column must be numeric or bool type")
2927 
2928  class_idx = self.GetColIndex(class_col)
2929  class_type = self.col_types[class_idx]
2930  if class_type!='int' and class_type!='float' and class_type!='bool':
2931  raise TypeError("Classifier column must be numeric or bool type")
2932 
2933  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2934  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2935 
2936  tp = 0
2937  fp = 0
2938  fn = 0
2939  tn = 0
2940 
2941  for i,row in enumerate(self.rows):
2942  class_val = row[class_idx]
2943  score_val = row[score_idx]
2944  if class_val!=None:
2945  if (class_type=='bool' and class_val==True) or (class_type!='bool' and ((class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff))):
2946  if (score_type=='bool' and score_val==True) or (score_type!='bool' and ((score_dir=='-' and score_val<=score_cutoff) or (score_dir=='+' and score_val>=score_cutoff))):
2947  tp += 1
2948  else:
2949  fn += 1
2950  else:
2951  if (score_type=='bool' and score_val==False) or (score_type!='bool' and ((score_dir=='-' and score_val>score_cutoff) or (score_dir=='+' and score_val<score_cutoff))):
2952  tn += 1
2953  else:
2954  fp += 1
2955 
2956  mcc = None
2957  msg = None
2958  if (tp+fn)==0:
2959  msg = 'factor (tp + fn) is zero'
2960  elif (tp+fp)==0:
2961  msg = 'factor (tp + fp) is zero'
2962  elif (tn+fn)==0:
2963  msg = 'factor (tn + fn) is zero'
2964  elif (tn+fp)==0:
2965  msg = 'factor (tn + fp) is zero'
2966 
2967  if msg:
2968  LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
2969  else:
2970  mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
2971  return mcc
2972 
2973 
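# Usage sketch for ComputeMCC() (illustrative; 'score' is a hypothetical
# numeric column counted as positive at or below a cutoff of 1.0, 'active'
# a bool classification column):
#
#   mcc = tab.ComputeMCC('score', 'active', score_dir='-', score_cutoff=1.0)
#   # mcc lies between -1 and 1, or is None if it is not defined for the data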
2974  def IsEmpty(self, col_name=None, ignore_nan=True):
2975  '''
2976  Checks if a table is empty.
2977 
2978  If no column name is specified, the whole table is checked for being empty,
2979  whereas if a column name is specified, only this column is checked.
2980 
2981  By default, all NAN (or None) values are ignored, and thus a table
2982  containing only NAN values is considered empty. By specifying the
2983  option ignore_nan=False, NAN values are counted as 'normal' values.
2984  '''
2985 
2986  # table with no columns and no rows
2987  if len(self.col_names)==0:
2988  if col_name:
2989  raise ValueError('Table has no column named "%s"' % col_name)
2990  return True
2991 
2992  # column name specified
2993  if col_name:
2994  if self.Count(col_name, ignore_nan=ignore_nan)==0:
2995  return True
2996  else:
2997  return False
2998 
2999  # no column name specified -> test whole table
3000  else:
3001  for row in self.rows:
3002  for cell in row:
3003  if ignore_nan:
3004  if cell!=None:
3005  return False
3006  else:
3007  return False
3008  return True
3009 
3010 
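# Usage sketch for IsEmpty() (illustrative):
#
#   tab = Table(['x'], ['float'])
#   tab.IsEmpty()     # -> True, no rows yet
#   tab.AddRow([1.0])
#   tab.IsEmpty()     # -> False
#   tab.IsEmpty('x')  # -> False, column 'x' holds a non-None value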
3011  def Extend(self, tab, overwrite=None):
3012  """
3013  Append each row of *tab* to the current table. The data is appended based
3014  on the column names, thus the order of the table columns is *not* relevant,
3015  only the header names.
3016 
3017  If there is a column in *tab* that is not present in the current table,
3018  it is added to the current table and filled with *None* for all the rows
3019  present in the current table.
3020 
3021  If the type of any column in *tab* is not the same as in the current table
3022  a *TypeError* is raised.
3023 
3024  If *overwrite* is not None and set to an existing column name, the specified
3025  column in the current table is searched for the first occurrence of a value
3026  matching the value of the column with the same name in the row to be added.
3027  If a matching value is found, that row is overwritten with the new row. If
3028  no matching row is found, a new row is appended to the table.
3029  """
3030  # add column to current table if it doesn't exist
3031  for name,typ in zip(tab.col_names, tab.col_types):
3032  if not name in self.col_names:
3033  self.AddCol(name, typ)
3034 
3035  # check that column types are the same in current and new table
3036  for name in self.col_names:
3037  if name in tab.col_names:
3038  curr_type = self.col_types[self.GetColIndex(name)]
3039  new_type = tab.col_types[tab.GetColIndex(name)]
3040  if curr_type!=new_type:
3041  raise TypeError('cannot extend table, column %s in new '%name +\
3042  'table has a different type (%s) than in '%new_type +\
3043  'the current table (%s)'%curr_type)
3044 
3045  num_rows = len(tab.rows)
3046  for i in range(0,num_rows):
3047  row = tab.rows[i]
3048  data = dict(zip(tab.col_names,row))
3049  self.AddRow(data, overwrite)
3050 
3051 
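# Usage sketch for Extend() (illustrative, hypothetical tables): rows of tab2
# are appended to tab1 by column name; the column 'z', missing in tab1, is
# added and filled with None for tab1's pre-existing rows.
#
#   tab1 = Table(['x', 'y'], ['int', 'float'])
#   tab1.AddRow([1, 10.0])
#   tab2 = Table(['x', 'z'], ['int', 'string'])
#   tab2.AddRow([2, 'foo'])
#   tab1.Extend(tab2)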
3052 def Merge(table1, table2, by, only_matching=False):
3053  """
3054  Returns a new table containing the data from both tables. The rows are
3055  combined based on the common values in the column(s) by. The option 'by' can
3056  be a list of column names. When this is the case, merging is based on
3057  multiple columns.
3058  For example, the two tables below
3059 
3060  ==== ====
3061  x y
3062  ==== ====
3063  1 10
3064  2 15
3065  3 20
3066  ==== ====
3067 
3068  ==== ====
3069  x u
3070  ==== ====
3071  1 100
3072  3 200
3073  4 400
3074  ==== ====
3075 
3076  when merged by column x, produce the following output:
3077 
3078  ===== ===== =====
3079  x y u
3080  ===== ===== =====
3081  1 10 100
3082  2 15 None
3083  3 20 200
3084  4 None 400
3085  ===== ===== =====
3086 
3087 
3088  """
3089  def _key(row, indices):
3090  return tuple([row[i] for i in indices])
3091  def _keep(indices, cn, ct, ni):
3092  ncn, nct, nni=([],[],[])
3093  for i in range(len(cn)):
3094  if i not in indices:
3095  ncn.append(cn[i])
3096  nct.append(ct[i])
3097  nni.append(ni[i])
3098  return ncn, nct, nni
3099  col_names=list(table2.col_names)
3100  col_types=list(table2.col_types)
3101  new_index=[i for i in range(len(col_names))]
3102  if isinstance(by, str):
3103  common2_indices=[col_names.index(by)]
3104  else:
3105  common2_indices=[col_names.index(b) for b in by]
3106  col_names, col_types, new_index=_keep(common2_indices, col_names,
3107  col_types, new_index)
3108 
3109  for i, name in enumerate(col_names):
3110  try_name=name
3111  counter=1
3112  while try_name in table1.col_names:
3113  counter+=1
3114  try_name='%s_%d' % (name, counter)
3115  col_names[i]=try_name
3116  common1={}
3117  if isinstance(by, str):
3118  common1_indices=[table1.col_names.index(by)]
3119  else:
3120  common1_indices=[table1.col_names.index(b) for b in by]
3121  for row in table1.rows:
3122  key=_key(row, common1_indices)
3123  if key in common1:
3124  raise ValueError('duplicate key "%s" in first table' % (str(key)))
3125  common1[key]=row
3126  common2={}
3127  for row in table2.rows:
3128  key=_key(row, common2_indices)
3129  if key in common2:
3130  raise ValueError('duplicate key "%s" in second table' % (str(key)))
3131  common2[key]=row
3132  new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)
3133  for k, v in common1.iteritems():
3134  row=v+[None for i in range(len(table2.col_names)-len(common2_indices))]
3135  matched=False
3136  if k in common2:
3137  matched=True
3138  row2=common2[k]
3139  for i, index in enumerate(new_index):
3140  row[len(table1.col_names)+i]=row2[index]
3141  if only_matching and not matched:
3142  continue
3143  new_tab.AddRow(row)
3144  if only_matching:
3145  return new_tab
3146  for k, v in common2.iteritems():
3147  if not k in common1:
3148  v2=[v[i] for i in new_index]
3149  row=[None for i in range(len(table1.col_names))]+v2
3150  for common1_index, common2_index in zip(common1_indices, common2_indices):
3151  row[common1_index]=v[common2_index]
3152  new_tab.AddRow(row)
3153  return new_tab
3154 
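# Usage sketch for Merge() (illustrative; reproduces the example from the
# docstring above with hypothetical tables tab1 and tab2):
#
#   tab1 = Table(['x', 'y'], ['int', 'int'])
#   for x, y in ((1, 10), (2, 15), (3, 20)):
#     tab1.AddRow([x, y])
#   tab2 = Table(['x', 'u'], ['int', 'int'])
#   for x, u in ((1, 100), (3, 200), (4, 400)):
#     tab2.AddRow([x, u])
#   merged = Merge(tab1, tab2, by='x')
#   # merged has columns x, y, u; unmatched cells are set to None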