OpenStructure
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
table.py
Go to the documentation of this file.
1 import csv
2 import re
3 import math
4 from ost import stutil
5 import itertools
6 import operator
7 import cPickle
8 from ost import LogError, LogWarning, LogInfo, LogVerbose
9 
def MakeTitle(col_name):
    """
    Turn a column name into a human-readable title.

    Every underscore in *col_name* is replaced by a single blank.

    :param col_name: name of a column
    :type col_name: :class:`str`

    :returns: *col_name* with underscores replaced by blanks
    """
    words = col_name.split('_')
    return ' '.join(words)
12 
def IsStringLike(value):
    """
    Tell whether *value* behaves like a string.

    The check is duck-typed: a value is string-like if it can be concatenated
    with the empty string.

    :param value: object to inspect
    :returns: True if ``value + ''`` succeeds, False otherwise. Always False
              for :class:`TableCol` and :class:`BinaryColExpr` instances.
    """
    # Column objects overload ``+`` to build lazy expressions, so the probe
    # below would succeed for them; rule them out first.
    if isinstance(value, (TableCol, BinaryColExpr)):
        return False
    try:
        value + ''
    except Exception:
        # Any failure of the probe means "not a string". Unlike a bare
        # ``except:`` this lets KeyboardInterrupt and SystemExit propagate.
        return False
    return True
21 
def IsNullString(value):
    """
    Tell whether the string *value* denotes a missing entry.

    Surrounding whitespace and letter case are ignored.

    :param value: string to inspect
    :type value: :class:`str`

    :returns: True if *value* is empty or one of NULL, NONE, NA
    """
    null_tokens = ('', 'NULL', 'NONE', 'NA')
    normalized = value.strip().upper()
    return normalized in null_tokens
25 
def IsScalar(value):
    """
    Tell whether *value* should be treated as a single value rather than as a
    sequence of values.

    Strings count as scalars although they are iterable. Column objects
    (:class:`TableCol`, :class:`BinaryColExpr`) never count as scalars.

    :param value: object to inspect
    :returns: True for string-like and non-iterable objects, False otherwise
    """
    if IsStringLike(value):
        return True
    if isinstance(value, (TableCol, BinaryColExpr)):
        return False
    try:
        iter(value)
    except Exception:
        # Not iterable, hence a scalar. Unlike a bare ``except:`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return True
    return False
36 
def GuessColumnType(iterator):
    """
    Guess the column type from the string representation of its values.

    Values recognized by :func:`IsNullString` are ignored. Each remaining
    value rules out the candidate types it is not compatible with.

    :param iterator: iterable yielding the values of one column

    :returns: one of 'int', 'float', 'bool' or 'string'. 'string' is returned
              when no candidate survives or when there are only null values.
    """
    bool_tokens = set(['YES', 'NO', 'TRUE', 'FALSE'])
    numeric_parsers = (('int', int), ('float', float))
    candidates = set(['bool', 'int', 'float'])
    saw_value = False

    for element in iterator:
        token = str(element).upper()
        if IsNullString(token):
            continue
        saw_value = True
        for type_name, parse in numeric_parsers:
            if type_name not in candidates:
                continue
            try:
                parse(token)
            except ValueError:
                candidates.remove(type_name)
        if 'bool' in candidates and token not in bool_tokens:
            candidates.remove('bool')

    remaining = len(candidates)
    if remaining == 0:
        return 'string'
    if remaining == 2:
        # two survivors: prefer the more specific 'int'
        return 'int'
    if not saw_value:
        return 'string'
    # exactly one candidate is left
    return candidates.pop()
68 
def __init__(self, op, lhs, rhs):
    """
    Build a lazily evaluated, element-wise expression ``op(lhs, rhs)``.

    :param op: binary callable applied to each pair of elements
    :param lhs: left operand, either an iterable or a scalar
    :param rhs: right operand, either an iterable or a scalar

    Scalar operands (see :func:`IsScalar`) are turned into an endless
    repetition of themselves so that they pair up with every element of the
    other operand.
    """
    self.op = op
    # The original spelled this ``itertools.cyle``, which raised an
    # AttributeError for every expression with a scalar on the left.
    self.lhs = itertools.cycle([lhs]) if IsScalar(lhs) else lhs
    self.rhs = itertools.cycle([rhs]) if IsScalar(rhs) else rhs
78  def __iter__(self):
79  for l, r in zip(self.lhs, self.rhs):
80  if l!=None and r!=None:
81  yield self.op(l, r)
82  else:
83  yield None
def __add__(self, rhs):
    """Return a lazy expression for the element-wise sum ``self + rhs``."""
    expr = BinaryColExpr(operator.add, self, rhs)
    return expr
86 
def __sub__(self, rhs):
    """Return a lazy expression for the element-wise difference ``self - rhs``."""
    expr = BinaryColExpr(operator.sub, self, rhs)
    return expr
89 
def __mul__(self, rhs):
    """Return a lazy expression for the element-wise product ``self * rhs``."""
    expr = BinaryColExpr(operator.mul, self, rhs)
    return expr
92 
def __div__(self, rhs):
    """
    Return a lazy expression for the element-wise classic division
    ``self / rhs``.
    """
    return BinaryColExpr(operator.div, self, rhs)

def __truediv__(self, rhs):
    """
    Return a lazy expression for the element-wise true division
    ``self / rhs``.

    The ``/`` operator dispatches to this method instead of ``__div__`` when
    true division is in effect (``from __future__ import division``). Without
    it the operator is unsupported in that case.
    """
    return BinaryColExpr(operator.truediv, self, rhs)
95 
class TableCol:
    """
    Live view on a single column of a table.

    The view stores the table and the column index. Every access goes through
    ``table.rows``, so changes to the table rows are visible through the view
    and assignments through the view change the table.

    Arithmetic operators do not compute anything immediately; they return a
    :class:`BinaryColExpr` that evaluates element-wise when iterated.
    """

    def __init__(self, table, col):
        """
        :param table: table the column belongs to; must provide ``rows`` and,
                      when *col* is a name, ``GetColIndex``
        :param col: column name or column index
        """
        self._table = table
        if isinstance(col, str):
            self.col_index = self._table.GetColIndex(col)
        else:
            self.col_index = col

    def __iter__(self):
        """Yield the value of this column for every row of the table."""
        for row in self._table.rows:
            yield row[self.col_index]

    def __len__(self):
        """Return the number of rows of the table."""
        return len(self._table.rows)

    def __getitem__(self, index):
        """Return the value of this column in row *index*."""
        return self._table.rows[index][self.col_index]

    def __setitem__(self, index, value):
        """Store *value* in this column of row *index*."""
        self._table.rows[index][self.col_index] = value

    def __add__(self, rhs):
        """Return a lazy expression for the element-wise sum."""
        return BinaryColExpr(operator.add, self, rhs)

    def __sub__(self, rhs):
        """Return a lazy expression for the element-wise difference."""
        return BinaryColExpr(operator.sub, self, rhs)

    def __mul__(self, rhs):
        """Return a lazy expression for the element-wise product."""
        return BinaryColExpr(operator.mul, self, rhs)

    def __div__(self, rhs):
        """Return a lazy expression for the element-wise classic division."""
        return BinaryColExpr(operator.div, self, rhs)

    def __truediv__(self, rhs):
        """
        Return a lazy expression for the element-wise true division.

        The ``/`` operator dispatches here instead of ``__div__`` when true
        division is in effect (``from __future__ import division``).
        """
        return BinaryColExpr(operator.truediv, self, rhs)
128 
129 
130 class Table(object):
131  """
132 
133  The table class provides convenient access to data in tabular form. An empty
134  table can be easily constructed as follows
135 
136  .. code-block:: python
137 
138  tab=Table()
139 
140  If you want to add columns directly when creating the table, column names
141  and *column types* can be specified as follows
142 
143  .. code-block:: python
144 
145  tab=Table(['nameX','nameY','nameZ'], 'sfb')
146 
147  this will create three columns called nameX, nameY and nameZ of type string,
148  float and bool, respectively. There will be no data in the table and thus,
149  the table will not contain any rows.
150 
151  The following *column types* are supported:
152 
153  ======= ========
154  name abbrev
155  ======= ========
156  string s
157  float f
158  int i
159  bool b
160  ======= ========
161 
162  If you want to add data to the table in addition, use the following:
163 
164  .. code-block:: python
165 
166  tab=Table(['nameX','nameY','nameZ'],
167  'sfb',
168  nameX=['a','b','c'],
169  nameY=[0.1, 1.2, 3.414],
170  nameZ=[True, False, False])
171 
172  If the values for a column are left out, that column is filled with NA. If
173  values are given for several columns, every one of them must contain the
174  same number of values.
175 
176  """
177 
178  SUPPORTED_TYPES=('int', 'float', 'bool', 'string',)
179 
180 
def __init__(self, col_names=None, col_types=None, **kwargs):
    """
    Create a table, optionally with columns and initial data.

    :param col_names: names of the columns. If empty or None, the keyword
                      argument names are used as column names.
    :type col_names: :class:`list` of :class:`str`

    :param col_types: column types in any form understood by
                      :meth:`_ParseColTypes`. If empty or None, every column
                      is of type string.

    :param kwargs: initial data, one keyword argument per column. See
                   :meth:`_AddRowsFromDict` for the accepted values.
    """
    self.comment = ''
    self.name = ''
    self.rows = []

    # Keep a private copy of the names: AddCol/RemoveCol modify the list in
    # place and must not change the list object owned by the caller.
    if col_names:
        self.col_names = list(col_names)
    else:
        self.col_names = list(kwargs.keys())

    self.col_types = self._ParseColTypes(col_types)
    if not self.col_types:
        self.col_types = ['string'] * len(self.col_names)

    if kwargs:
        self._AddRowsFromDict(kwargs)
195 
def __getattr__(self, col_name):
    """
    Give attribute-style access to columns, e.g. ``tab.x`` for column "x".

    :returns: :class:`TableCol` for the column named *col_name*
    :raises: :exc:`AttributeError` if there is no column with that name
    """
    # Unpickling creates the object without running __init__, so col_names
    # may not exist yet. Reading self.col_names would then re-enter
    # __getattr__ and recurse forever, hence the look into __dict__ first.
    if 'col_names' not in self.__dict__:
        raise AttributeError(col_name)
    if col_name not in self.col_names:
        raise AttributeError(col_name)
    return TableCol(self, col_name)
204 
@staticmethod
def _ParseColTypes(types, exp_num=None):
    """
    Convert a column type specification into a list of long type names.

    Accepted forms of *types* (case-insensitive):

    - a single long or short type name, e.g. ``'float'`` or ``'f'``
    - a comma separated string of long or short names, e.g. ``'int,f'``
    - a string of short names, e.g. ``'sfb'``
    - an iterable of long or short names, e.g. ``['string', 'f']``

    :param types: type specification, or None
    :param exp_num: expected number of types; not checked if None or 0
    :type exp_num: :class:`int`

    :returns: :class:`list` of long type names, or None if *types* is None

    :raises: :exc:`ValueError` for unknown type names, for specifications
             that are neither strings nor iterables of strings, and if the
             number of parsed types differs from *exp_num*
    """
    if types is None:
        return None

    short2long = {'s': 'string', 'i': 'int', 'b': 'bool', 'f': 'float'}
    allowed_long = short2long.values()

    def _LongName(type_name):
        # map one long or short name to its long form
        if type_name in allowed_long:
            return type_name
        if type_name in short2long:
            return short2long[type_name]
        raise ValueError('Unknown type %s in types %s' % (type_name, types))

    if IsScalar(types):
        if not isinstance(types, str):
            # (types,) keeps the formatting correct for any operand type
            raise ValueError('Col type %s must be string or list' % (types,))
        types = types.lower()
        if types in allowed_long or types in short2long:
            # single value
            type_list = [_LongName(types)]
        elif ',' in types:
            # comma separated list; blanks around the names are tolerated
            type_list = [_LongName(t.strip()) for t in types.split(',')]
        else:
            # string of short types, one character per column
            type_list = [_LongName(t) for t in types]
    else:
        type_list = []
        for t in types:
            if not isinstance(t, str):
                # A bare ``% types`` would unpack a tuple into the format
                # string and fail with a TypeError.
                raise ValueError('Col type %s must be string or list'
                                 % (types,))
            type_list.append(_LongName(t.lower()))

    if exp_num and len(type_list) != exp_num:
        # The two fragments are joined before formatting. Previously ``%``
        # was applied to the second fragment only, so a TypeError was raised
        # instead of this ValueError.
        raise ValueError('Parsed number of col types (%i) differs from '
                         'expected (%i) in types %s'
                         % (len(type_list), exp_num, types))

    return type_list
272 
def SetName(self, name):
    """
    Assign a name to the table.

    :param name: new name of the table
    :type name: :class:`str`
    """
    self.name = name
280 
def GetName(self):
    """
    Return the name of the table.

    :returns: the value stored by :meth:`SetName`
    """
    table_name = self.name
    return table_name
286 
def RenameCol(self, old_name, new_name):
    """
    Rename column *old_name* to *new_name*.

    The column is re-added under the new name with :meth:`AddCol`, which
    appends it to the right of the table, and the old column is removed
    afterwards.

    :param old_name: Name of the old column
    :param new_name: Name of the new column
    :raises: :exc:`ValueError` when *old_name* is not a valid column
    """
    if old_name == new_name:
        return
    old_type = self.col_types[self.GetColIndex(old_name)]
    old_values = self[old_name]
    self.AddCol(new_name, old_type, old_values)
    self.RemoveCol(old_name)
300  def _Coerce(self, value, ty):
301  '''
302  Try to convert values (e.g. from :class:`str` type) to the specified type
303 
304  :param value: the value
305  :type value: any type
306 
307  :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
308  *bool*)
309  :type ty: :class:`str`
310  '''
311  if value=='NA' or value==None:
312  return None
313  if ty=='int':
314  return int(value)
315  if ty=='float':
316  return float(value)
317  if ty=='string':
318  return str(value)
319  if ty=='bool':
320  if isinstance(value, str) or isinstance(value, unicode):
321  if value.upper() in ('FALSE', 'NO',):
322  return False
323  return True
324  return bool(value)
325  raise ValueError('Unknown type %s' % ty)
326 
def GetColIndex(self, col):
    """
    Return the position of the column named *col*.

    :param col: column name
    :type col: :class:`str`

    :raises: :exc:`ValueError` if no column with the name is found
    """
    try:
        return self.col_names.index(col)
    except ValueError:
        raise ValueError('Table has no column named "%s"' % col)
336 
def GetColNames(self):
    """
    Return the list of column names.

    The list is the table's own list, not a copy.
    """
    names = self.col_names
    return names
342 
def SearchColNames(self, regex):
    """
    Return the names of all columns matching a regular expression.

    The pattern is applied with :func:`re.search`, so it may match anywhere
    in the column name.

    :param regex: regex pattern
    :type regex: :class:`str`

    :returns: :class:`list` of column names (:class:`str`)
    """
    return [name for name in self.col_names if re.search(regex, name)]
358 
def HasCol(self, col):
    """
    Tell whether the table has a column named *col*.

    :param col: column name
    :returns: True if the column exists, False otherwise
    """
    present = col in self.col_names
    return present
364 
def __getitem__(self, k):
    """
    Return the column selected by *k* as :class:`TableCol`.

    :param k: column name, or column index if *k* is an :class:`int`
    """
    col_name = self.col_names[k] if type(k) == int else k
    return TableCol(self, col_name)
370 
def __setitem__(self, k, value):
    """
    Overwrite the values of an existing column.

    :param k: column name, or column index if *k* is an :class:`int`
    :param value: scalar written into every row, or an iterable providing one
                  value per row. Rows beyond the end of a shorter iterable
                  are left untouched.
    """
    col_index = k if type(k) == int else self.GetColIndex(k)
    values = itertools.cycle([value]) if IsScalar(value) else value
    for row, cell in zip(self.rows, values):
        row[col_index] = cell
379 
380  def ToString(self, float_format='%.3f', int_format='%d', rows=None):
381  '''
382  Convert the table into a string representation.
383 
384  The output format can be modified for int and float type columns by
385  specifying a formatting string for the parameters 'float_format' and
386  'int_format'.
387 
388  The option 'rows' specify the range of rows to be printed. The parameter
389  must be a type that supports indexing (e.g. a :class:`list`) containing the
390  start and end row *index*, e.g. [start_row_idx, end_row_idx].
391 
392  :param float_format: formatting string for float columns
393  :type float_format: :class:`str`
394 
395  :param int_format: formatting string for int columns
396  :type int_format: :class:`str`
397 
398  :param rows: iterable containing start and end row *index*
399  :type rows: iterable containing :class:`ints <int>`
400  '''
401  widths=[len(cn) for cn in self.col_names]
402  sel_rows=self.rows
403  if rows:
404  sel_rows=self.rows[rows[0]:rows[1]]
405  for row in sel_rows:
406  for i, (ty, col) in enumerate(zip(self.col_types, row)):
407  if col==None:
408  widths[i]=max(widths[i], len('NA'))
409  elif ty=='float':
410  widths[i]=max(widths[i], len(float_format % col))
411  elif ty=='int':
412  widths[i]=max(widths[i], len(int_format % col))
413  else:
414  widths[i]=max(widths[i], len(str(col)))
415  s=''
416  if self.comment:
417  s+=''.join(['# %s\n' % l for l in self.comment.split('\n')])
418  total_width=sum(widths)+2*len(widths)
419  for width, col_name in zip(widths, self.col_names):
420  s+=col_name.center(width+2)
421  s+='\n%s\n' % ('-'*total_width)
422  for row in sel_rows:
423  for width, ty, col in zip(widths, self.col_types, row):
424  cs=''
425  if col==None:
426  cs='NA'.center(width+2)
427  elif ty=='float':
428  cs=(float_format % col).rjust(width+2)
429  elif ty=='int':
430  cs=(int_format % col).rjust(width+2)
431  else:
432  cs=' '+str(col).ljust(width+1)
433  s+=cs
434  s+='\n'
435  return s
436 
437  def __str__(self):
438  return self.ToString()
439 
def Stats(self, col):
    """
    Return a multi-line, human-readable summary of column *col*.

    The summary lists the number of rows, the number of non-None rows, mean,
    median, standard deviation, minimum and maximum, as returned by the
    table's ``Count``, ``Mean``, ``Median``, ``StdDev``, ``Min`` and ``Max``
    methods.

    :param col: column name
    :type col: :class:`str`

    :raises: :exc:`ValueError` if *col* is not a column of the table
    """
    # called for validation only: raises ValueError for unknown columns
    self.GetColIndex(col)
    template = '''
Statistics for column %(col)s

 Number of Rows : %(num)d
 Number of Rows Not None: %(num_non_null)d
 Mean : %(mean)f
 Median : %(median)f
 Standard Deviation : %(stddev)f
 Min : %(min)f
 Max : %(max)f
'''
    values = {
        'col': col,
        'num': len(self.rows),
        'num_non_null': self.Count(col),
        'median': self.Median(col),
        'mean': self.Mean(col),
        'stddev': self.StdDev(col),
        'min': self.Min(col),
        'max': self.Max(col),
    }
    return template % values
464 
def _AddRowsFromDict(self, d, overwrite=None):
    """
    Add one or more rows from a :class:`dictionary <dict>`.

    The keys of *d* are column names. Each value is either a scalar (one row
    is added) or a list-like object (one row per item is added). Columns not
    mentioned in *d* are set to None. The dictionary passed in is not
    modified.

    If *overwrite* is not None and set to an existing column name, the
    specified column in the table is searched for the first occurrence of a
    value matching the value of the column with the same name in the
    dictionary. If a matching value is found, the row is overwritten with the
    dictionary; cells not provided by the dictionary keep their old value. If
    no matching row is found, a new row is appended to the table.

    :param d: dictionary containing the data
    :type d: :class:`dict`

    :param overwrite: column name to overwrite existing row if value in
                      column *overwrite* matches
    :type overwrite: :class:`str`

    :raises: :class:`ValueError` if a key is not a column name, or if
             multiple rows are added but the number of data items is
             different for different columns.
    """
    keys = list(d.keys())
    # get column indices (raises ValueError for unknown columns)
    idxs = [self.GetColIndex(k) for k in keys]

    # Collect the data column-wise in local lists, wrapping scalars into
    # one-element lists. The caller's dict stays untouched.
    columns = []
    num_items = None
    for k in keys:
        v = d[k]
        if IsScalar(v):
            v = [v]
        # ``is None`` rather than a truth test: a first column of length 0
        # must still be compared against the following columns
        if num_items is None:
            num_items = len(v)
        elif num_items != len(v):
            raise ValueError("Cannot add rows: length of data must be equal "
                             "for all columns in %s" % str(d))
        columns.append(v)

    # go from column-wise to row-wise data and add row by row
    for data in zip(*columns):
        new_row = [None] * len(self.col_names)
        for idx, v in zip(idxs, data):
            new_row[idx] = self._Coerce(v, self.col_types[idx])

        replaced = False
        if overwrite:
            overwrite_idx = self.GetColIndex(overwrite)
            for i, existing in enumerate(self.rows):
                if existing[overwrite_idx] == new_row[overwrite_idx]:
                    # partial overwrite: keep the old value wherever the
                    # dictionary provided none
                    for j, old_value in enumerate(existing):
                        if new_row[j] is None:
                            new_row[j] = old_value
                    self.rows[i] = new_row
                    replaced = True
                    break

        # no overwrite requested, or no matching row found
        if not replaced:
            self.rows.append(new_row)
522 
def PairedTTest(self, col_a, col_b):
    """
    Two-sided test for the null-hypothesis that two related samples
    have the same average (expected values).

    Rows in which either of the two values is None are left out. The test
    itself is done by ``scipy.stats.ttest_rel``.

    :param col_a: First column
    :param col_b: Second column

    :returns: P-value between 0 and 1 that the two columns have the
              same average. The smaller the value, the less related the two
              columns are.
    """
    from scipy.stats import ttest_rel
    complete_pairs = [(a, b) for a, b in self.Zip(col_a, col_b)
                      if a != None and b != None]
    xs = [pair[0] for pair in complete_pairs]
    ys = [pair[1] for pair in complete_pairs]
    # second item of the result is the p-value
    return ttest_rel(xs, ys)[1]
544 
545  def AddRow(self, data, overwrite=None):
546  """
547  Add a row to the table.
548 
549  *data* may either be a dictionary or a list-like object:
550 
551  - If *data* is a dictionary the keys in the dictionary must match the
552  column names. Columns not found in the dict will be initialized to None.
553  If the dict contains list-like objects, multiple rows will be added, if
554  the number of items in all list-like objects is the same, otherwise a
555  :class:`ValueError` is raised.
556 
557  - If *data* is a list-like object, the row is initialized from the values
558  in *data*. The number of items in *data* must match the number of
559  columns in the table. A :class:`ValuerError` is raised otherwise. The
560  values are added in the order specified in the list, thus, the order of
561  the data must match the columns.
562 
563  If *overwrite* is not None and set to an existing column name, the specified
564  column in the table is searched for the first occurrence of a value matching
565  the value of the column with the same name in the dictionary. If a matching
566  value is found, the row is overwritten with the dictionary. If no matching
567  row is found, a new row is appended to the table.
568 
569  :param data: data to add
570  :type data: :class:`dict` or *list-like* object
571 
572  :param overwrite: column name to overwrite existing row if value in
573  column *overwrite* matches
574  :type overwrite: :class:`str`
575 
576  :raises: :class:`ValueError` if *list-like* object is used and number of
577  items does *not* match number of columns in table.
578 
579  :raises: :class:`ValueError` if *dict* is used and multiple rows are added
580  but the number of data items is different for different columns.
581 
582  **Example:** add multiple data rows to a subset of columns using a dictionary
583 
584  .. code-block:: python
585 
586  # create table with three float columns
587  tab = Table(['x','y','z'], 'fff')
588 
589  # add rows from dict
590  data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
591  tab.AddRow(data)
592  print tab
593 
594  '''
595  will produce the table
596 
597  ==== ==== ====
598  x y z
599  ==== ==== ====
600  1.20 NA 1.60
601  1.60 NA 5.30
602  ==== ==== ====
603  '''
604 
605  # overwrite the row with x=1.2 and add row with x=1.9
606  data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
607  tab.AddRow(data, overwrite='x')
608  print tab
609 
610  '''
611  will produce the table
612 
613  ==== ==== ====
614  x y z
615  ==== ==== ====
616  1.20 NA 7.90
617  1.60 NA 5.30
618  1.90 NA 3.50
619  ==== ==== ====
620  '''
621  """
622  if type(data)==dict:
623  self._AddRowsFromDict(data, overwrite)
624  else:
625  if len(data)!=len(self.col_names):
626  msg='data array must have %d elements, not %d'
627  raise ValueError(msg % (len(self.col_names), len(data)))
628  new_row = [self._Coerce(v, t) for v, t in zip(data, self.col_types)]
629 
630  # fully overwrite existing row with new data
631  if overwrite:
632  overwrite_idx = self.GetColIndex(overwrite)
633  added = False
634  for i,r in enumerate(self.rows):
635  if r[overwrite_idx]==new_row[overwrite_idx]:
636  self.rows[i] = new_row
637  added = True
638  break
639 
640  # if not overwrite or overwrite did not find appropriate row
641  if not overwrite or not added:
642  self.rows.append(new_row)
643 
644  def RemoveCol(self, col):
645  """
646  Remove column with the given name from the table
647 
648  :param col: name of column to remove
649  :type col: :class:`str`
650  """
651  idx = self.GetColIndex(col)
652  del self.col_names[idx]
653  del self.col_types[idx]
654  for row in self.rows:
655  del row[idx]
656 
def AddCol(self, col_name, col_type, data=None):
    """
    Add a column to the right of the table.

    :param col_name: name of new column
    :type col_name: :class:`str`

    :param col_type: type of new column (long versions: *int*, *float*,
                     *bool*, *string* or short versions: *i*, *f*, *b*, *s*)
    :type col_type: :class:`str`

    :param data: data to add to new column.
    :type data: scalar or iterable

    :raises: :exc:`ValueError` if a column named *col_name* already exists,
             or if *data* has a length that differs from the number of
             existing rows

    **Example:**

    .. code-block:: python

      tab=Table(['x'], 'f', x=range(5))
      tab.AddCol('even', 'bool', itertools.cycle([True, False]))
      print(tab)

      '''
      will produce the table

      ====  ====
      x     even
      ====  ====
        0   True
        1   False
        2   True
        3   False
        4   True
      ====  ====
      '''

    If data is a constant instead of an iterable object, its value
    will be written into each row:

    .. code-block:: python

      tab=Table(['x'], 'f', x=range(5))
      tab.AddCol('num', 'i', 1)
      print(tab)

      '''
      will produce the table

      ====  ====
      x     num
      ====  ====
        0   1
        1   1
        2   1
        3   1
        4   1
      ====  ====
      '''

    As a special case, if there are no previous rows, the new column is the
    only column and data is not None, rows are added for every item in data.
    """
    if col_name in self.col_names:
        raise ValueError('Column with name %s already exists'%col_name)

    col_type = self._ParseColTypes(col_type, exp_num=1)[0]
    self.col_names.append(col_name)
    self.col_types.append(col_type)

    if len(self.rows) > 0:
        # extend the existing rows
        if IsScalar(data):
            for row in self.rows:
                row.append(data)
            return
        if hasattr(data, '__len__') and len(data) != len(self.rows):
            # undo the registration of the column before bailing out
            self.col_names.pop()
            self.col_types.pop()
            raise ValueError('Length of data (%i) must correspond to number of '%len(data) +\
                             'existing rows (%i)'%len(self.rows))
        for row, d in zip(self.rows, data):
            row.append(d)
        return

    # empty table: create the rows, but only if this is the first column
    if data != None and len(self.col_names) == 1:
        values = [data] if IsScalar(data) else data
        for v in values:
            self.AddRow({col_name : v})
746 
def Filter(self, *args, **kwargs):
    """
    Returns a filtered table only containing rows matching all the predicates
    in kwargs and args. For example,

    .. code-block:: python

      tab.Filter(town='Basel')

    will return all the rows where the value of the column "town" is equal to
    "Basel". Several predicates may be combined, i.e.

    .. code-block:: python

      tab.Filter(town='Basel', male=True)

    will return the rows with "town" equal to "Basel" and "male" equal to
    true. args are unary callables that receive the row (a list of values)
    and return true if the row should be included in the result and false if
    not.

    :returns: a new :class:`Table` with the same columns
    """
    filt_tab = Table(list(self.col_names), list(self.col_types))
    for row in self.rows:
        # both checks are always evaluated, one after the other
        rejected = any(not func(row) for func in args)
        mismatch = any(row[self.GetColIndex(key)] != val
                       for key, val in kwargs.items())
        if not (rejected or mismatch):
            filt_tab.AddRow(row)
    return filt_tab
781 
@staticmethod
def _LoadOST(stream_or_filename):
    """
    Read a table in the ost-specific text format.

    The first line that is neither empty nor a comment (starting with '#') is
    the header. It consists of whitespace separated ``name[type]`` items;
    columns without ``[type]`` are of type string. Every following line
    provides one row. Values are separated by blanks and may be enclosed in
    double quotes.

    :param stream_or_filename: object with a ``read`` attribute, or name of
                               the file to read. A file opened here is closed
                               again; a stream passed in is left open.

    :returns: A new :class:`Table` instance
    :raises: :exc:`IOError` if there is no header line
    """
    fieldname_pattern = re.compile(r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
    values_pattern = re.compile("([^\" ]+|\"[^\"]*\")+")
    owns_stream = not hasattr(stream_or_filename, 'read')
    if owns_stream:
        stream = open(stream_or_filename, 'r')
    else:
        stream = stream_or_filename
    tab = None
    try:
        for line in stream:
            line = line.strip()
            if len(line) == 0 or line.startswith('#'):
                continue
            if tab is None:
                # first data line is the header
                fieldnames = []
                fieldtypes = []
                for col in line.split():
                    match = fieldname_pattern.match(col)
                    if match:
                        fieldtypes.append(match.group('type') or 'string')
                        fieldnames.append(match.group('name'))
                tab = Table(fieldnames, fieldtypes)
                continue
            tab.AddRow([x.strip('"') for x in values_pattern.findall(line)])
    finally:
        # only close what was opened here
        if owns_stream:
            stream.close()
    if tab is None:
        raise IOError("Cannot read table from empty stream")
    return tab
817 
def _GuessColumnTypes(self):
    """
    Guess the type of every column from its values with
    :func:`GuessColumnType` and convert all values to the guessed types.

    :func:`GuessColumnType` ignores null strings (see :func:`IsNullString`)
    when guessing. In columns guessed as int, float or bool such strings are
    therefore stored as None instead of being passed to the conversion, where
    e.g. ``int('')`` would fail. String columns keep their values.
    """
    for col_idx, col_name in enumerate(self.col_names):
        self.col_types[col_idx] = GuessColumnType(self[col_name])
    for row in self.rows:
        for idx, value in enumerate(row):
            ty = self.col_types[idx]
            if ty != 'string' and IsStringLike(value) and IsNullString(value):
                row[idx] = None
            else:
                row[idx] = self._Coerce(value, ty)
824 
@staticmethod
def _LoadCSV(stream_or_filename, sep):
    """
    Read a table from character separated values.

    The first record provides the column names. All values are read as
    strings; the column types are guessed afterwards with
    :meth:`_GuessColumnTypes`.

    :param stream_or_filename: object with a ``read`` attribute, or name of
                               the file to read. A file opened here is closed
                               again; a stream passed in is left open.
    :param sep: delimiter passed to :func:`csv.reader`
    :type sep: :class:`str`

    :returns: A new :class:`Table` instance
    :raises: :exc:`IOError` if the input contains no record at all
    """
    owns_stream = not hasattr(stream_or_filename, 'read')
    if owns_stream:
        stream = open(stream_or_filename, 'r')
    else:
        stream = stream_or_filename
    tab = None
    try:
        for row in csv.reader(stream, delimiter=sep):
            if tab is None:
                # header record: start with string columns only
                tab = Table(row, 's' * len(row))
            else:
                tab.AddRow(row)
    finally:
        # only close what was opened here
        if owns_stream:
            stream.close()
    if tab is None:
        raise IOError('trying to load table from empty CSV stream/file')

    tab._GuessColumnTypes()
    return tab
846 
@staticmethod
def _LoadPickle(stream_or_filename):
    """
    Deserialize a table from a pickled byte stream.

    :param stream_or_filename: object with a ``read`` attribute, or name of
                               the file to read. A file opened here is closed
                               again; a stream passed in is left open.

    :returns: the unpickled object
    """
    # NOTE(security): unpickling can execute arbitrary code. Only load
    # pickles from sources you trust.
    if hasattr(stream_or_filename, 'read'):
        return cPickle.load(stream_or_filename)
    stream = open(stream_or_filename, 'rb')
    try:
        return cPickle.load(stream)
    finally:
        stream.close()
854 
855  @staticmethod
856  def _GuessFormat(filename):
857  try:
858  filename = filename.name
859  except AttributeError, e:
860  pass
861  if filename.endswith('.csv'):
862  return 'csv'
863  elif filename.endswith('.pickle'):
864  return 'pickle'
865  else:
866  return 'ost'
867 
868 
869  @staticmethod
870  def Load(stream_or_filename, format='auto', sep=','):
871  """
872  Load table from stream or file with given name.
873 
874  By default, the file format is set to *auto*, which tries to guess the file
875  format from the file extension. The following file extensions are
876  recognized:
877 
878  ============ ======================
879  extension recognized format
880  ============ ======================
881  .csv comma separated values
882  .pickle pickled byte stream
883  <all others> ost-specific format
884  ============ ======================
885 
886  Thus, *format* must be specified for reading file with different filename
887  extensions.
888 
889  The following file formats are understood:
890 
891  - ost
892 
893  This is an ost-specific, but still human readable file format. The file
894  (stream) must start with header line of the form
895 
896  col_name1[type1] <col_name2[type2]>...
897 
898  The types given in brackets must be one of the data types the
899  :class:`Table` class understands. Each following line in the file then must
900  contains exactly the same number of data items as listed in the header. The
901  data items are automatically converted to the column format. Lines starting
902  with a '#' and empty lines are ignored.
903 
904  - pickle
905 
906  Deserializes the table from a pickled byte stream
907 
908  - csv
909 
910  Reads the table from comma separated values stream. Since there is no
911  explicit type information in the csv file, the column types are guessed,
912  using the following simple rules:
913 
914  * if all values are either NA/NULL/NONE the type is set to string
915  * if all non-null values are convertible to float/int the type is set to
916  float/int
917  * if all non-null values are true/false/yes/no, the value is set to bool
918  * for all other cases, the column type is set to string
919 
920  :returns: A new :class:`Table` instance
921  """
922  format=format.lower()
923  if format=='auto':
924  format = Table._GuessFormat(stream_or_filename)
925 
926  if format=='ost':
927  return Table._LoadOST(stream_or_filename)
928  if format=='csv':
929  return Table._LoadCSV(stream_or_filename, sep=sep)
930  if format=='pickle':
931  return Table._LoadPickle(stream_or_filename)
932  raise ValueError('unknown format ""' % format)
933 
934  def Sort(self, by, order='+'):
935  """
936  Performs an in-place sort of the table, based on column *by*.
937 
938  :param by: column name by which to sort
939  :type by: :class:`str`
940 
941  :param order: ascending (``-``) or descending (``+``) order
942  :type order: :class:`str` (i.e. *+*, *-*)
943  """
944  sign=-1
945  if order=='-':
946  sign=1
947  key_index=self.GetColIndex(by)
948  def _key_cmp(lhs, rhs):
949  return sign*cmp(lhs[key_index], rhs[key_index])
950  self.rows=sorted(self.rows, _key_cmp)
951 
952  def GetUnique(self, col, ignore_nan=True):
953  """
954  Extract a list of all unique values from one column
955 
956  :param col: column name
957  :type col: :class:`str`
958 
959  :param ignore_nan: ignore all *None* values
960  :type ignore_nan: :class:`bool`
961  """
962  idx = self.GetColIndex(col)
963  seen = {}
964  result = []
965  for row in self.rows:
966  item = row[idx]
967  if item!=None or ignore_nan==False:
968  if item in seen: continue
969  seen[item] = 1
970  result.append(item)
971  return result
972 
def Zip(self, *args):
    """
    Allows to conveniently iterate over a selection of columns, e.g.

    .. code-block:: python

      tab=Table.Load('...')
      for col1, col2 in tab.Zip('col1', 'col2'):
        print(col1, col2)

    is a shortcut for

    .. code-block:: python

      tab=Table.Load('...')
      for col1, col2 in zip(tab['col1'], tab['col2']):
        print(col1, col2)

    :param args: names (or indices) of the columns to combine
    :returns: the result of ``zip`` applied to the selected columns
    """
    columns = [self[arg] for arg in args]
    return zip(*columns)
992 
993  def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
994  z_title=None, x_range=None, y_range=None, z_range=None,
995  color=None, plot_if=None, legend=None,
996  num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False,
997  labels=None, max_num_labels=None, title=None, clear=True, save=False,
998  **kwargs):
999  """
1000  Function to plot values from your table in 1, 2 or 3 dimensions using
1001  `Matplotlib <http://matplotlib.sourceforge.net>`__
1002 
1003  :param x: column name for first dimension
1004  :type x: :class:`str`
1005 
1006  :param y: column name for second dimension
1007  :type y: :class:`str`
1008 
1009  :param z: column name for third dimension
1010  :type z: :class:`str`
1011 
1012  :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\**). For a
1013  complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1014  :type style: :class:`str`
1015 
1016  :param x_title: title for first dimension, if not specified it is
1017  automatically derived from column name
1018  :type x_title: :class:`str`
1019 
1020  :param y_title: title for second dimension, if not specified it is
1021  automatically derived from column name
1022  :type y_title: :class:`str`
1023 
1024  :param z_title: title for third dimension, if not specified it is
1025  automatically derived from column name
1026  :type z_title: :class:`str`
1027 
1028  :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1029  :type x_range: :class:`list` of length two
1030 
1031  :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1032  :type y_range: :class:`list` of length two
1033 
1034  :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
1035  :type z_range: :class:`list` of length two
1036 
1037  :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
1038  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1039  :type color: :class:`str`
1040 
1041  :param plot_if: callable which returnes *True* if row should be plotted. Is
1042  invoked like ``plot_if(self, row)``
1043  :type plot_if: callable
1044 
1045  :param legend: legend label for data series
1046  :type legend: :class:`str`
1047 
1048  :param num_z_levels: number of levels for third dimension
1049  :type num_z_levels: :class:`int`
1050 
1051  :param diag_line: draw diagonal line
1052  :type diag_line: :class:`bool`
1053 
1054  :param labels: column name containing labels to put on x-axis for one
1055  dimensional plot
1056  :type labels: :class:`str`
1057 
1058  :param max_num_labels: limit maximum number of labels
1059  :type max_num_labels: :class:`int`
1060 
1061  :param title: plot title, if not specified it is automatically derived from
1062  plotted column names
1063  :type title: :class:`str`
1064 
1065  :param clear: clear old data from plot
1066  :type clear: :class:`bool`
1067 
1068  :param save: filename for saving plot
1069  :type save: :class:`str`
1070 
1071  :param z_contour: draw contour lines
1072  :type z_contour: :class:`bool`
1073 
1074  :param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
1075  'linear')
1076  :type z_interpol: :class:`str`
1077 
1078  :param \*\*kwargs: additional arguments passed to matplotlib
1079 
1080  :returns: the ``matplotlib.pyplot`` module
1081 
1082  **Examples:** simple plotting functions
1083 
1084  .. code-block:: python
1085 
1086  tab=Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
1087  b=[x/2.0 for x in range(1,6)],
1088  c=[math.cos(x) for x in range(0,5)],
1089  d=range(3,8))
1090 
1091  # one dimensional plot of column 'd' vs. index
1092  plt=tab.Plot('d')
1093  plt.show()
1094 
1095  # two dimensional plot of 'a' vs. 'c'
1096  plt=tab.Plot('a', y='c', style='o-')
1097  plt.show()
1098 
1099  # three dimensional plot of 'a' vs. 'c' with values 'b'
1100  plt=tab.Plot('a', y='c', z='b')
1101  # manually save plot to file
1102  plt.savefig("plot.png")
1103  """
1104  try:
1105  import matplotlib.pyplot as plt
1106  import matplotlib.mlab as mlab
1107  import numpy as np
1108  idx1 = self.GetColIndex(x)
1109  xs = []
1110  ys = []
1111  zs = []
1112 
1113  if clear:
1114  plt.figure(figsize=[8, 6])
1115 
1116  if x_title!=None:
1117  nice_x=x_title
1118  else:
1119  nice_x=MakeTitle(x)
1120 
1121  if y_title!=None:
1122  nice_y=y_title
1123  else:
1124  if y:
1125  nice_y=MakeTitle(y)
1126  else:
1127  nice_y=None
1128 
1129  if z_title!=None:
1130  nice_z = z_title
1131  else:
1132  if z:
1133  nice_z = MakeTitle(z)
1134  else:
1135  nice_z = None
1136 
1137  if x_range and (IsScalar(x_range) or len(x_range)!=2):
1138  raise ValueError('parameter x_range must contain exactly two elements')
1139  if y_range and (IsScalar(y_range) or len(y_range)!=2):
1140  raise ValueError('parameter y_range must contain exactly two elements')
1141  if z_range and (IsScalar(z_range) or len(z_range)!=2):
1142  raise ValueError('parameter z_range must contain exactly two elements')
1143 
1144  if color:
1145  kwargs['color']=color
1146  if legend:
1147  kwargs['label']=legend
1148  if y and z:
1149  idx3 = self.GetColIndex(z)
1150  idx2 = self.GetColIndex(y)
1151  for row in self.rows:
1152  if row[idx1]!=None and row[idx2]!=None and row[idx3]!=None:
1153  if plot_if and not plot_if(self, row):
1154  continue
1155  xs.append(row[idx1])
1156  ys.append(row[idx2])
1157  zs.append(row[idx3])
1158  levels = []
1159  if z_range:
1160  z_spacing = (z_range[1] - z_range[0]) / num_z_levels
1161  l = z_range[0]
1162  else:
1163  l = self.Min(z)
1164  z_spacing = (self.Max(z) - l) / num_z_levels
1165 
1166  for i in range(0,num_z_levels+1):
1167  levels.append(l)
1168  l += z_spacing
1169 
1170  xi = np.linspace(min(xs),max(xs),len(xs)*10)
1171  yi = np.linspace(min(ys),max(ys),len(ys)*10)
1172  zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)
1173 
1174  if z_contour:
1175  plt.contour(xi,yi,zi,levels,linewidths=0.5,colors='k')
1176 
1177  plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
1178  plt.colorbar(ticks=levels)
1179 
1180  elif y:
1181  idx2=self.GetColIndex(y)
1182  for row in self.rows:
1183  if row[idx1]!=None and row[idx2]!=None:
1184  if plot_if and not plot_if(self, row):
1185  continue
1186  xs.append(row[idx1])
1187  ys.append(row[idx2])
1188  plt.plot(xs, ys, style, **kwargs)
1189 
1190  else:
1191  label_vals=[]
1192 
1193  if labels:
1194  label_idx=self.GetColIndex(labels)
1195  for row in self.rows:
1196  if row[idx1]!=None:
1197  if plot_if and not plot_if(self, row):
1198  continue
1199  xs.append(row[idx1])
1200  if labels:
1201  label_vals.append(row[label_idx])
1202  plt.plot(xs, style, **kwargs)
1203  if labels:
1204  interval = 1
1205  if max_num_labels:
1206  if len(label_vals)>max_num_labels:
1207  interval = int(math.ceil(float(len(label_vals))/max_num_labels))
1208  label_vals = label_vals[::interval]
1209  plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
1210  size='x-small')
1211 
1212  if title==None:
1213  if nice_z:
1214  title = '%s of %s vs. %s' % (nice_z, nice_x, nice_y)
1215  elif nice_y:
1216  title = '%s vs. %s' % (nice_x, nice_y)
1217  else:
1218  title = nice_x
1219 
1220  plt.title(title, size='x-large', fontweight='bold',
1221  verticalalignment='bottom')
1222 
1223  if legend:
1224  plt.legend(loc=0)
1225 
1226  if x and y:
1227  plt.xlabel(nice_x, size='x-large')
1228  if x_range:
1229  plt.xlim(x_range[0], x_range[1])
1230  if y_range:
1231  plt.ylim(y_range[0], y_range[1])
1232  if diag_line:
1233  plt.plot(x_range, y_range, '-')
1234 
1235  plt.ylabel(nice_y, size='x-large')
1236  else:
1237  if y_range:
1238  plt.ylim(y_range[0], y_range[1])
1239  if x_title:
1240  plt.xlabel(x_title, size='x-large')
1241  plt.ylabel(nice_y, size='x-large')
1242  if save:
1243  plt.savefig(save)
1244  return plt
1245  except ImportError:
1246  LogError("Function needs numpy and matplotlib, but I could not import it.")
1247  raise
1248 
1249  def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
1250  histtype='stepfilled', align='mid', x_title=None,
1251  y_title=None, title=None, clear=True, save=False,
1252  color=None, y_range=None):
1253  """
1254  Create a histogram of the data in col for the range *x_range*, split into
1255  *num_bins* bins and plot it using Matplotlib.
1256 
1257  :param col: column name with data
1258  :type col: :class:`str`
1259 
1260  :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1261  :type x_range: :class:`list` of length two
1262 
1263  :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1264  :type y_range: :class:`list` of length two
1265 
1266  :param num_bins: number of bins in range
1267  :type num_bins: :class:`int`
1268 
1269  :param color: Color to be used for the histogram. If not set, color will be
1270  determined by matplotlib
1271  :type color: :class:`str`
1272 
1273  :param normed: normalize histogram
1274  :type normed: :class:`bool`
1275 
1276  :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
1277  *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1278  :type histtype: :class:`str`
1279 
1280  :param align: style of histogram (*left*, *mid*, *right*). See
1281  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1282  :type align: :class:`str`
1283 
1284  :param x_title: title for first dimension, if not specified it is
1285  automatically derived from column name
1286  :type x_title: :class:`str`
1287 
1288  :param y_title: title for second dimension, if not specified it is
1289  automatically derived from column name
1290  :type y_title: :class:`str`
1291 
1292  :param title: plot title, if not specified it is automatically derived from
1293  plotted column names
1294  :type title: :class:`str`
1295 
1296  :param clear: clear old data from plot
1297  :type clear: :class:`bool`
1298 
1299  :param save: filename for saving plot
1300  :type save: :class:`str`
1301 
1302  **Examples:** simple plotting functions
1303 
1304  .. code-block:: python
1305 
1306  tab=Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])
1307 
1308  # one dimensional plot of column 'd' vs. index
1309  plt=tab.PlotHistogram('a')
1310  plt.show()
1311 
1312  """
1313  try:
1314  import matplotlib.pyplot as plt
1315  import numpy as np
1316 
1317  if len(self.rows)==0:
1318  return None
1319  kwargs={}
1320  if color:
1321  kwargs['color']=color
1322  idx = self.GetColIndex(col)
1323  data = []
1324  for r in self.rows:
1325  if r[idx]!=None:
1326  data.append(r[idx])
1327 
1328  if clear:
1329  plt.clf()
1330 
1331  n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
1332  normed=normed, histtype=histtype, align=align,
1333  **kwargs)
1334 
1335  if x_title!=None:
1336  nice_x=x_title
1337  else:
1338  nice_x=MakeTitle(col)
1339  plt.xlabel(nice_x, size='x-large')
1340  if y_range:
1341  plt.ylim(y_range)
1342  if y_title!=None:
1343  nice_y=y_title
1344  else:
1345  nice_y="bin count"
1346  plt.ylabel(nice_y, size='x-large')
1347 
1348  if title!=None:
1349  nice_title=title
1350  else:
1351  nice_title="Histogram of %s"%nice_x
1352  plt.title(nice_title, size='x-large', fontweight='bold')
1353 
1354  if save:
1355  plt.savefig(save)
1356  return plt
1357  except ImportError:
1358  LogError("Function needs numpy and matplotlib, but I could not import it.")
1359  raise
1360 
1361  def _Max(self, col):
1362  if len(self.rows)==0:
1363  return None, None
1364  idx = self.GetColIndex(col)
1365  col_type = self.col_types[idx]
1366  if col_type=='int' or col_type=='float':
1367  max_val = -float('inf')
1368  elif col_type=='bool':
1369  max_val = False
1370  elif col_type=='string':
1371  max_val = chr(0)
1372  max_idx = None
1373  for i in range(0, len(self.rows)):
1374  if self.rows[i][idx]>max_val:
1375  max_val = self.rows[i][idx]
1376  max_idx = i
1377  return max_val, max_idx
1378 
1379  def PlotBar(self, cols, x_labels=None, x_labels_rotation='horizontal', y_title=None, title=None,
1380  colors=None, yerr_cols=None, width=0.8, bottom=0,
1381  legend=True, save=False):
1382 
1383  """
1384  Create a barplot of the data in cols. Every element of a column will be represented
1385  as a single bar. If there are several columns, each row will be grouped together.
1386 
1387  :param cols: Column names with data. If cols is a string, every element of that column
1388  will be represented as a single bar. If cols is a list, every row resulting
1389  of these columns will be grouped together. Every value of the table still
1390  is represented by a single bar.
1391 
1392  :param x_labels: Label for every row on x-axis.
1393  :type x_labels: :class:`list`
1394 
1395  :param x_labels_rotation: Can either be 'horizontal', 'vertical' or a number that
1396  describes the rotation in degrees.
1397 
1398  :param y_title: Y-axis description
1399  :type y_title: :class:`str`
1400 
1401  :title: Title
1402  :type title: :class:`str`
1403 
1404  :param colors: Colors of the different bars in each group. Must be a list of valid
1405  colornames in matplotlib. Length of color and cols must be consistent.
1406  :type colors: :class:`list`
1407 
1408  :param yerr_cols: Columns containing the y-error information. Can either be a string
1409  if only one column is plotted or a list otherwise. Length of
1410  yerr_cols and cols must be consistent.
1411 
1412  :param width: The available space for the groups on the x-axis is divided by the exact
1413  number of groups. The parameters width is the fraction of what is actually
1414  used. If it would be 1.0 the bars of the different groups would touch each other.
1415  :type width: :class:`float`
1416 
1417  :param bottom: Bottom
1418  :type bottom: :class:`float`
1419 
1420  :param legend: Legend for color explanation, the corresponding column respectively.
1421  :type legend: :class:`bool`
1422 
1423  :param save: If set, a png image with name $save in the current working directory will be saved.
1424  :type save: :class:`str`
1425 
1426  """
1427  try:
1428  import numpy as np
1429  import matplotlib.pyplot as plt
1430  except:
1431  raise ImportError('PlotBar relies on numpy and matplotlib, but I could not import it!')
1432 
1433  if len(cols)>7:
1434  raise ValueError('More than seven bars at one position looks rather meaningless...')
1435 
1436  standard_colors=['b','g','y','c','m','r','k']
1437  data=[]
1438  yerr_data=[]
1439 
1440  if not isinstance(cols, list):
1441  cols=[cols]
1442 
1443  if yerr_cols:
1444  if not isinstance(yerr_cols, list):
1445  yerr_cols=[yerr_cols]
1446  if len(yerr_cols)!=len(cols):
1447  raise RuntimeError ('Number of cols and number of error columns must be consistent!')
1448 
1449  for c in cols:
1450  cid=self.GetColIndex(c)
1451  temp=list()
1452  for r in self.rows:
1453  temp.append(r[cid])
1454  data.append(temp)
1455 
1456  if yerr_cols:
1457  for c in yerr_cols:
1458  cid=self.GetColIndex(c)
1459  temp=list()
1460  for r in self.rows:
1461  temp.append(r[cid])
1462  yerr_data.append(temp)
1463  else:
1464  for i in range(len(cols)):
1465  yerr_data.append(None)
1466 
1467  if not colors:
1468  colors=standard_colors[:len(cols)]
1469 
1470  if len(cols)!=len(colors):
1471  raise RuntimeError("Number of columns and number of colors must be consistent!")
1472 
1473  ind=np.arange(len(data[0]))
1474  single_bar_width=float(width)/len(data)
1475 
1476  fig=plt.figure()
1477  ax=fig.add_subplot(111)
1478  legend_data=[]
1479  for i in range(len(data)):
1480  legend_data.append(ax.bar(ind+i*single_bar_width,data[i],single_bar_width,bottom=bottom,color=colors[i],yerr=yerr_data[i], ecolor='black')[0])
1481 
1482  if title!=None:
1483  nice_title=title
1484  else:
1485  nice_title="coolest barplot on earth"
1486  ax.set_title(nice_title, size='x-large', fontweight='bold')
1487 
1488  if y_title!=None:
1489  nice_y=y_title
1490  else:
1491  nice_y="score"
1492  ax.set_ylabel(nice_y)
1493 
1494  if x_labels:
1495  if len(data[0])!=len(x_labels):
1496  raise ValueError('Number of xlabels is not consistent with number of rows!')
1497  else:
1498  x_labels=list()
1499  for i in range(1,len(data[0])+1):
1500  x_labels.append('Row '+str(i))
1501 
1502  ax.set_xticks(ind+width*0.5)
1503  ax.set_xticklabels(x_labels, rotation = x_labels_rotation)
1504 
1505  if legend:
1506  ax.legend(legend_data, cols)
1507 
1508  if save:
1509  plt.savefig(save)
1510 
1511  return plt
1512 
1513  def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log',
1514  colormap='jet', show_scalebar=False, scalebar_label=None, clear=True, save=False, show=False):
1515 
1516  """
1517  Create a heatplot of the data in col x vs the data in col y using matplotlib
1518 
1519  :param x: column name with x data
1520  :type x: :class:`str`
1521 
1522  :param y: column name with y data
1523  :type y: :class:`str`
1524 
1525  :param title: title of the plot, will be generated automatically if set to None
1526  :type title: :class:`str`
1527 
1528  :param x_title: label of x-axis, will be generated automatically if set to None
1529  :type title: :class:`str`
1530 
1531  :param y_title: label of y-axis, will be generated automatically if set to None
1532  :type title: :class:`str`
1533 
1534  :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1535  :type x_range: :class:`list` of length two
1536 
1537  :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1538  :type y_range: :class:`list` of length two
1539 
1540  :param binning: type of binning. If set to None, the value of a hexbin will
1541  correspond to the number of datapoints falling into it. If
1542  set to 'log', the value will be the log with base 10 of the above
1543  value (log(i+1)). If an integer is provided, the number of a
1544  hexbin is equal the number of datapoints falling into it divided
1545  by the integer. If a list of values is provided, these values
1546  will be the lower bounds of the bins.
1547 
1548  :param colormap: colormap, that will be used. Value can be every colormap defined
1549  in matplotlib or an own defined colormap. You can either pass a
1550  string with the name of the matplotlib colormap or a colormap
1551  object.
1552 
1553  :param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
1554  :type show_scalebar: :class:`bool`
1555 
1556  :param scalebar_label: Label of the scalebar
1557  :type scalebar_label: :class:`str`
1558 
1559  :param clear: clear old data from plot
1560  :type clear: :class:`bool`
1561 
1562  :param save: filename for saving plot
1563  :type save: :class:`str`
1564 
1565  :param show: directly show plot
1566  :type show: :class:`bool`
1567 
1568  """
1569 
1570  try:
1571  import matplotlib.pyplot as plt
1572  import matplotlib.cm as cm
1573  except:
1574  raise ImportError('PlotHexbin relies on matplotlib, but I could not import it')
1575 
1576  idx=self.GetColIndex(x)
1577  idy=self.GetColIndex(y)
1578  xdata=[]
1579  ydata=[]
1580 
1581  for r in self.rows:
1582  if r[idx]!=None and r[idy]!=None:
1583  xdata.append(r[idx])
1584  ydata.append(r[idy])
1585 
1586  if clear:
1587  plt.clf()
1588 
1589  if x_title!=None:
1590  nice_x=x_title
1591  else:
1592  nice_x=MakeTitle(x)
1593 
1594  if y_title!=None:
1595  nice_y=y_title
1596  else:
1597  nice_y=MakeTitle(y)
1598 
1599  if title==None:
1600  title = '%s vs. %s' % (nice_x, nice_y)
1601 
1602  if IsStringLike(colormap):
1603  colormap=getattr(cm, colormap)
1604 
1605  if x_range and (IsScalar(x_range) or len(x_range)!=2):
1606  raise ValueError('parameter x_range must contain exactly two elements')
1607  if y_range and (IsScalar(y_range) or len(y_range)!=2):
1608  raise ValueError('parameter y_range must contain exactly two elements')
1609 
1610  ext = [min(xdata),max(xdata),min(ydata),max(ydata)]
1611 
1612  if x_range:
1613  plt.xlim((x_range[0], x_range[1]))
1614  ext[0]=x_range[0]
1615  ext[1]=x_range[1]
1616  if y_range:
1617  plt.ylim(y_range[0], y_range[1])
1618  ext[2]=y_range[0]
1619  ext[3]=y_range[1]
1620 
1621 
1622  plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)
1623 
1624  plt.title(title, size='x-large', fontweight='bold',
1625  verticalalignment='bottom')
1626 
1627  plt.xlabel(nice_x)
1628  plt.ylabel(nice_y)
1629 
1630  if show_scalebar:
1631  cb=plt.colorbar()
1632  if scalebar_label:
1633  cb.set_label(scalebar_label)
1634 
1635  if save:
1636  plt.savefig(save)
1637 
1638  if show:
1639  plt.show()
1640 
1641  return plt
1642 
1643  def MaxRow(self, col):
1644  """
1645  Returns the row containing the cell with the maximal value in col. If
1646  several rows have the highest value, only the first one is returned.
1647  None values are ignored.
1648 
1649  :param col: column name
1650  :type col: :class:`str`
1651 
1652  :returns: row with maximal col value or None if the table is empty
1653  """
1654  val, idx = self._Max(col)
1655  if idx!=None:
1656  return self.rows[idx]
1657 
1658  def Max(self, col):
1659  """
1660  Returns the maximum value in col. If several rows have the highest value,
1661  only the first one is returned. None values are ignored.
1662 
1663  :param col: column name
1664  :type col: :class:`str`
1665  """
1666  val, idx = self._Max(col)
1667  return val
1668 
1669  def MaxIdx(self, col):
1670  """
1671  Returns the row index of the cell with the maximal value in col. If
1672  several rows have the highest value, only the first one is returned.
1673  None values are ignored.
1674 
1675  :param col: column name
1676  :type col: :class:`str`
1677  """
1678  val, idx = self._Max(col)
1679  return idx
1680 
1681  def _Min(self, col):
1682  if len(self.rows)==0:
1683  return None, None
1684  idx=self.GetColIndex(col)
1685  col_type = self.col_types[idx]
1686  if col_type=='int' or col_type=='float':
1687  min_val=float('inf')
1688  elif col_type=='bool':
1689  min_val=True
1690  elif col_type=='string':
1691  min_val=chr(255)
1692  min_idx=None
1693  for i,row in enumerate(self.rows):
1694  if row[idx]!=None and row[idx]<min_val:
1695  min_val=row[idx]
1696  min_idx=i
1697  return min_val, min_idx
1698 
1699  def Min(self, col):
1700  """
1701  Returns the minimal value in col. If several rows have the lowest value,
1702  only the first one is returned. None values are ignored.
1703 
1704  :param col: column name
1705  :type col: :class:`str`
1706  """
1707  val, idx = self._Min(col)
1708  return val
1709 
1710  def MinRow(self, col):
1711  """
1712  Returns the row containing the cell with the minimal value in col. If
1713  several rows have the lowest value, only the first one is returned.
1714  None values are ignored.
1715 
1716  :param col: column name
1717  :type col: :class:`str`
1718 
1719  :returns: row with minimal col value or None if the table is empty
1720  """
1721  val, idx = self._Min(col)
1722  if idx!=None:
1723  return self.rows[idx]
1724 
1725  def MinIdx(self, col):
1726  """
1727  Returns the row index of the cell with the minimal value in col. If
1728  several rows have the lowest value, only the first one is returned.
1729  None values are ignored.
1730 
1731  :param col: column name
1732  :type col: :class:`str`
1733  """
1734  val, idx = self._Min(col)
1735  return idx
1736 
1737  def Sum(self, col):
1738  """
1739  Returns the sum of the given column. Cells with None are ignored. Returns
1740  0.0, if the column doesn't contain any elements. Col must be of numeric
1741  column type ('float', 'int') or boolean column type.
1742 
1743  :param col: column name
1744  :type col: :class:`str`
1745 
1746  :raises: :class:`TypeError` if column type is ``string``
1747  """
1748  idx = self.GetColIndex(col)
1749  col_type = self.col_types[idx]
1750  if col_type!='int' and col_type!='float' and col_type!='bool':
1751  raise TypeError("Sum can only be used on numeric column types")
1752  s = 0.0
1753  for r in self.rows:
1754  if r[idx]!=None:
1755  s += r[idx]
1756  return s
1757 
1758  def Mean(self, col):
1759  """
1760  Returns the mean of the given column. Cells with None are ignored. Returns
1761  None, if the column doesn't contain any elements. Col must be of numeric
1762  ('float', 'int') or boolean column type.
1763 
1764  If column type is *bool*, the function returns the ratio of
1765  number of 'Trues' by total number of elements.
1766 
1767  :param col: column name
1768  :type col: :class:`str`
1769 
1770  :raises: :class:`TypeError` if column type is ``string``
1771  """
1772  idx = self.GetColIndex(col)
1773  col_type = self.col_types[idx]
1774  if col_type!='int' and col_type!='float' and col_type!='bool':
1775  raise TypeError("Mean can only be used on numeric or bool column types")
1776 
1777  vals=[]
1778  for v in self[col]:
1779  if v!=None:
1780  vals.append(v)
1781  try:
1782  return stutil.Mean(vals)
1783  except:
1784  return None
1785 
1786  def RowMean(self, mean_col_name, cols):
1787  """
1788  Adds a new column of type 'float' with a specified name (*mean_col_name*),
1789  containing the mean of all specified columns for each row.
1790 
1791  Cols are specified by their names and must be of numeric column
1792  type ('float', 'int') or boolean column type. Cells with None are ignored.
1793  Adds None if the row doesn't contain any values.
1794 
1795  :param mean_col_name: name of new column containing mean values
1796  :type mean_col_name: :class:`str`
1797 
1798  :param cols: name or list of names of columns to include in computation of
1799  mean
1800  :type cols: :class:`str` or :class:`list` of strings
1801 
1802  :raises: :class:`TypeError` if column type of columns in *col* is ``string``
1803 
1804  == Example ==
1805 
1806  Staring with the following table:
1807 
1808  ==== ==== ====
1809  x y u
1810  ==== ==== ====
1811  1 10 100
1812  2 15 None
1813  3 20 400
1814  ==== ==== ====
1815 
1816  the code here adds a column with the name 'mean' to yield the table below:
1817 
1818  .. code-block::python
1819 
1820  tab.RowMean('mean', ['x', 'u'])
1821 
1822 
1823  ==== ==== ==== =====
1824  x y u mean
1825  ==== ==== ==== =====
1826  1 10 100 50.5
1827  2 15 None 2
1828  3 20 400 201.5
1829  ==== ==== ==== =====
1830 
1831  """
1832 
1833  if IsScalar(cols):
1834  cols = [cols]
1835 
1836  cols_idxs = []
1837  for col in cols:
1838  idx = self.GetColIndex(col)
1839  col_type = self.col_types[idx]
1840  if col_type!='int' and col_type!='float' and col_type!='bool':
1841  raise TypeError("RowMean can only be used on numeric column types")
1842  cols_idxs.append(idx)
1843 
1844  mean_rows = []
1845  for row in self.rows:
1846  vals = []
1847  for idx in cols_idxs:
1848  v = row[idx]
1849  if v!=None:
1850  vals.append(v)
1851  try:
1852  mean = stutil.Mean(vals)
1853  mean_rows.append(mean)
1854  except:
1855  mean_rows.append(None)
1856 
1857  self.AddCol(mean_col_name, 'f', mean_rows)
1858 
1859  def Percentiles(self, col, nths):
1860  """
1861  returns the percentiles of column *col* given in *nths*.
1862 
1863  The percentils are calculated as
1864 
1865  .. code-block:: python
1866 
1867  values[min(len(values), int(round(len(values)*p/100+0.5)-1))]
1868 
1869  where values are the sorted values of *col* not equal to none
1870  :param: nths: list of percentiles to be calculated. Each percentil is a number
1871  between 0 and 100.
1872 
1873  :raises: :class:`TypeError` if column type is ``string``
1874  :returns: List of percentils in the same order as given in *nths*
1875  """
1876  idx = self.GetColIndex(col)
1877  col_type = self.col_types[idx]
1878  if col_type!='int' and col_type!='float' and col_type!='bool':
1879  raise TypeError("Median can only be used on numeric column types")
1880 
1881  for nth in nths:
1882  if nth < 0 or nth > 100:
1883  raise ValueError("percentiles must be between 0 and 100")
1884  vals=[]
1885  for v in self[col]:
1886  if v!=None:
1887  vals.append(v)
1888  vals=sorted(vals)
1889  if len(vals)==0:
1890  return [None]*len(nths)
1891  percentiles=[]
1892 
1893  for nth in nths:
1894  p=vals[min(len(vals)-1, int(round(len(vals)*nth/100.0+0.5)-1))]
1895  percentiles.append(p)
1896  return percentiles
1897 
1898  def Median(self, col):
1899  """
1900  Returns the median of the given column. Cells with None are ignored. Returns
1901  None, if the column doesn't contain any elements. Col must be of numeric
1902  column type ('float', 'int') or boolean column type.
1903 
1904  :param col: column name
1905  :type col: :class:`str`
1906 
1907  :raises: :class:`TypeError` if column type is ``string``
1908  """
1909  idx = self.GetColIndex(col)
1910  col_type = self.col_types[idx]
1911  if col_type!='int' and col_type!='float' and col_type!='bool':
1912  raise TypeError("Median can only be used on numeric column types")
1913 
1914  vals=[]
1915  for v in self[col]:
1916  if v!=None:
1917  vals.append(v)
1918  stutil.Median(vals)
1919  try:
1920  return stutil.Median(vals)
1921  except:
1922  return None
1923 
1924  def StdDev(self, col):
1925  """
1926  Returns the standard deviation of the given column. Cells with None are
1927  ignored. Returns None, if the column doesn't contain any elements. Col must
1928  be of numeric column type ('float', 'int') or boolean column type.
1929 
1930  :param col: column name
1931  :type col: :class:`str`
1932 
1933  :raises: :class:`TypeError` if column type is ``string``
1934  """
1935  idx = self.GetColIndex(col)
1936  col_type = self.col_types[idx]
1937  if col_type!='int' and col_type!='float' and col_type!='bool':
1938  raise TypeError("StdDev can only be used on numeric column types")
1939 
1940  vals=[]
1941  for v in self[col]:
1942  if v!=None:
1943  vals.append(v)
1944  try:
1945  return stutil.StdDev(vals)
1946  except:
1947  return None
1948 
1949  def Count(self, col, ignore_nan=True):
1950  """
1951  Count the number of cells in column that are not equal to None.
1952 
1953  :param col: column name
1954  :type col: :class:`str`
1955 
1956  :param ignore_nan: ignore all *None* values
1957  :type ignore_nan: :class:`bool`
1958  """
1959  count=0
1960  idx=self.GetColIndex(col)
1961  for r in self.rows:
1962  if ignore_nan:
1963  if r[idx]!=None:
1964  count+=1
1965  else:
1966  count+=1
1967  return count
1968 
1969  def Correl(self, col1, col2):
1970  """
1971  Calculate the Pearson correlation coefficient between *col1* and *col2*, only
1972  taking rows into account where both of the values are not equal to *None*.
1973  If there are not enough data points to calculate a correlation coefficient,
1974  *None* is returned.
1975 
1976  :param col1: column name for first column
1977  :type col1: :class:`str`
1978 
1979  :param col2: column name for second column
1980  :type col2: :class:`str`
1981  """
1982  if IsStringLike(col1) and IsStringLike(col2):
1983  col1 = self.GetColIndex(col1)
1984  col2 = self.GetColIndex(col2)
1985  vals1, vals2=([],[])
1986  for v1, v2 in zip(self[col1], self[col2]):
1987  if v1!=None and v2!=None:
1988  vals1.append(v1)
1989  vals2.append(v2)
1990  try:
1991  return stutil.Correl(vals1, vals2)
1992  except:
1993  return None
1994 
1995  def SpearmanCorrel(self, col1, col2):
1996  """
1997  Calculate the Spearman correlation coefficient between col1 and col2, only
1998  taking rows into account where both of the values are not equal to None. If
1999  there are not enough data points to calculate a correlation coefficient,
2000  None is returned.
2001 
2002  :warning: The function depends on the following module: *scipy.stats.mstats*
2003 
2004  :param col1: column name for first column
2005  :type col1: :class:`str`
2006 
2007  :param col2: column name for second column
2008  :type col2: :class:`str`
2009  """
2010  try:
2011  import scipy.stats.mstats
2012 
2013  if IsStringLike(col1) and IsStringLike(col2):
2014  col1 = self.GetColIndex(col1)
2015  col2 = self.GetColIndex(col2)
2016  vals1, vals2=([],[])
2017  for v1, v2 in zip(self[col1], self[col2]):
2018  if v1!=None and v2!=None:
2019  vals1.append(v1)
2020  vals2.append(v2)
2021  try:
2022  correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
2023  if scipy.isnan(correl):
2024  return None
2025  return correl
2026  except:
2027  return None
2028 
2029  except ImportError:
2030  LogError("Function needs scipy.stats.mstats, but I could not import it.")
2031  raise
2032 
2033 
2034  def Save(self, stream_or_filename, format='ost', sep=','):
2035  """
2036  Save the table to stream or filename. The following three file formats
2037  are supported (for more information on file formats, see :meth:`Load`):
2038 
2039  ============= =======================================
2040  ost ost-specific format (human readable)
2041  csv comma separated values (human readable)
2042  pickle pickled byte stream (binary)
2043  html HTML table
2044  context ConTeXt table
2045  ============= =======================================
2046 
2047  :param stream_or_filename: filename or stream for writing output
2048  :type stream_or_filename: :class:`str` or :class:`file`
2049 
2050  :param format: output format (i.e. *ost*, *csv*, *pickle*)
2051  :type format: :class:`str`
2052 
2053  :raises: :class:`ValueError` if format is unknown
2054  """
2055  format=format.lower()
2056  if format=='ost':
2057  return self._SaveOST(stream_or_filename)
2058  if format=='csv':
2059  return self._SaveCSV(stream_or_filename, sep=sep)
2060  if format=='pickle':
2061  return self._SavePickle(stream_or_filename)
2062  if format=='html':
2063  return self._SaveHTML(stream_or_filename)
2064  if format=='context':
2065  return self._SaveContext(stream_or_filename)
2066  raise ValueError('unknown format "%s"' % format)
2067 
2068  def _SavePickle(self, stream):
2069  if not hasattr(stream, 'write'):
2070  stream=open(stream, 'wb')
2071  cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
2072 
2073  def _SaveHTML(self, stream_or_filename):
2074  def _escape(s):
2075  return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
2076 
2077  file_opened = False
2078  if not hasattr(stream_or_filename, 'write'):
2079  stream = open(stream_or_filename, 'w')
2080  file_opened = True
2081  else:
2082  stream = stream_or_filename
2083  stream.write('<table>')
2084  stream.write('<tr>')
2085  for col_name in self.col_names:
2086  stream.write('<th>%s</th>' % _escape(col_name))
2087  stream.write('</tr>')
2088  for row in self.rows:
2089  stream.write('<tr>')
2090  for i, col in enumerate(row):
2091  val = ''
2092  if col != None:
2093  if self.col_types[i] == 'float':
2094  val = '%.3f' % col
2095  elif self.col_types[i] == 'int':
2096  val = '%d' % col
2097  elif self.col_types[i] == 'bool':
2098  val = col and 'true' or 'false'
2099  else:
2100  val = str(col)
2101  stream.write('<td>%s</td>' % _escape(val))
2102  stream.write('</tr>')
2103  stream.write('</table>')
2104  if file_opened:
2105  stream.close()
2106  def _SaveContext(self, stream_or_filename):
2107  file_opened = False
2108  if not hasattr(stream_or_filename, 'write'):
2109  stream = open(stream_or_filename, 'w')
2110  file_opened = True
2111  else:
2112  stream = stream_or_filename
2113  stream.write('\\starttable[')
2114  for col_type in self.col_types:
2115  if col_type =='string':
2116  stream.write('l|')
2117  elif col_type=='int':
2118  stream.write('r|')
2119  elif col_type =='float':
2120  stream.write('i3r|')
2121  else:
2122  stream.write('l|')
2123  stream.write(']\n\\HL\n')
2124  for col_name in self.col_names:
2125  stream.write('\\NC \\bf %s' % col_name)
2126  stream.write(' \\AR\\HL\n')
2127  for row in self.rows:
2128  for i, col in enumerate(row):
2129  val = '---'
2130  if col != None:
2131  if self.col_types[i] == 'float':
2132  val = '%.3f' % col
2133  elif self.col_types[i] == 'int':
2134  val = '%d' % col
2135  elif self.col_types[i] == 'bool':
2136  val = col and 'true' or 'false'
2137  else:
2138  val = str(col)
2139  stream.write('\\NC %s' % val)
2140  stream.write(' \\AR\n')
2141  stream.write('\\HL\n')
2142  stream.write('\\stoptable')
2143  if file_opened:
2144  stream.close()
2145 
2146  def _SaveCSV(self, stream, sep):
2147  if not hasattr(stream, 'write'):
2148  stream=open(stream, 'wb')
2149 
2150  writer=csv.writer(stream, delimiter=sep)
2151  writer.writerow(['%s' % n for n in self.col_names])
2152  for row in self.rows:
2153  row=list(row)
2154  for i, c in enumerate(row):
2155  if c==None:
2156  row[i]='NA'
2157  writer.writerow(row)
2158 
2159  def _SaveOST(self, stream):
2160  if hasattr(stream, 'write'):
2161  writer=csv.writer(stream, delimiter=' ')
2162  else:
2163  stream=open(stream, 'w')
2164  writer=csv.writer(stream, delimiter=' ')
2165  if self.comment:
2166  stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
2167  writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
2168  for row in self.rows:
2169  row=list(row)
2170  for i, c in enumerate(row):
2171  if c==None:
2172  row[i]='NA'
2173  writer.writerow(row)
2174 
2175 
def GetNumpyMatrix(self, *args):
    r'''
    Build a numpy matrix whose columns are the selected table columns, in the
    order in which they were requested.

    Only columns of type *int* or *float* are accepted. *NA* values in the
    table end up as *None* values in the matrix.

    :param \*args: column names to include in numpy matrix

    :raises: :class:`RuntimeError` if no column name is given,
             :class:`TypeError` if a selected column is not numeric

    :warning: The function depends on *numpy*
    '''
    try:
        import numpy as np

        if not args:
            raise RuntimeError("At least one column must be specified.")

        col_indices = []
        for col_name in args:
            col_index = self.GetColIndex(col_name)
            if self.col_types[col_index] not in ('int', 'float'):
                raise TypeError("Numpy matrix can only be generated from numeric column types")
            col_indices.append(col_index)

        # Each table column becomes one matrix row first; transposing turns
        # them into matrix columns.
        by_column = np.matrix([list(self[col_index]) for col_index in col_indices])
        return by_column.T

    except ImportError:
        LogError("Function needs numpy, but I could not import it.")
        raise
2206 
2207 
2208 
def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
    '''
    Smooth a column in place with a one-dimensional gaussian filter.

    *None* entries of the column are replaced by *na_value* before the filter
    is applied; the column is then overwritten with the filtered values.

    :param col: column name
    :type col: :class:`str`

    :param std: standard deviation of the gaussian kernel
    :type std: `scalar`

    :param na_value: value substituted for every *None* entry of the
                     specified column before smoothing
    :type na_value: `scalar`

    :param padding: padding mode, passed on as *mode* to scipy's
                    ndimage.gaussian_filter1d (see its documentation for the
                    available modes); the default is reflect
    :type padding: :class:`str`

    :param c: constant used for padding, passed on as *cval*
    :type c: `scalar`

    :raises: :class:`TypeError` if the column is not of type *int* or *float*

    :warning: The function depends on *scipy*
    '''
    try:
        from scipy import ndimage
        import numpy as np
    except ImportError:
        LogError("I need scipy.ndimage and numpy, but could not import it")
        raise

    col_type = self.col_types[self.GetColIndex(col)]
    if col_type not in ('int', 'float'):
        raise TypeError("GaussianSmooth can only be used on numeric column types")

    filled = [na_value if v is None else v for v in self[col]]
    smoothed = ndimage.gaussian_filter1d(filled, std, mode=padding, cval=c)

    # Store a plain list of the filtered values back into the column.
    self[col] = list(smoothed)
2263 
2264 
def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
    r'''
    This returns the optimal prefactor values (i.e. a, b, c, ...) for the
    following equation

    .. math::
      :label: op1

      a*u + b*v + c*w + ... = z

    where u, v, w and z are vectors. In matrix notation

    .. math::
      :label: op2

      A*p = z

    where A contains the data from the table (u,v,w,...), p are the prefactors
    to optimize (a,b,c,...) and z is the vector containing the result of
    equation :eq:`op1`.

    The parameter ref_col equals to z in both equations, and \*args are columns
    u, v and w (or A in :eq:`op2`). All columns must be specified by their names.

    **Example:**

    .. code-block:: python

      tab.GetOptimalPrefactors('colC', 'colA', 'colB')

    The function returns a list containing the prefactors a, b, c, ... in
    the correct order (i.e. same as columns were specified in \*args).

    Weighting:
    If the kwarg weights="columnX" is specified, the equations are weighted by
    the values in that column. Each row is multiplied by the weight in that row,
    which leads to :eq:`op3`:

    .. math::
      :label: op3

      weight*a*u + weight*b*v + weight*c*w + ... = weight*z

    Weights must be float or int and can have any value. A value of 0 ignores
    this equation, a value of 1 means the same as no weight. If all weights are
    the same for each row, the same result will be obtained as with no weights.

    **Example:**

    .. code-block:: python

      tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')

    :raises: :class:`RuntimeError` if no column is given in \*args, or if
             keyword arguments are given but none of them is *weights*

    :warning: The function depends on *numpy*
    '''
    try:
        import numpy as np

        if len(args) == 0:
            raise RuntimeError("At least one column must be specified.")

        b = self.GetNumpyMatrix(ref_col)
        a = self.GetNumpyMatrix(*args)

        if kwargs:
            # 'in' replaces the deprecated, Python-2-only dict.has_key().
            if 'weights' in kwargs:
                w = self.GetNumpyMatrix(kwargs['weights'])
                # Multiply every equation (row) by its weight.
                b = np.multiply(b, w)
                a = np.multiply(a, w)
            else:
                raise RuntimeError("specified unrecognized kwargs, use weights as key")

        # Least-squares solution through the normal equations
        # (A^T A)^-1 A^T z.
        k = (a.T*a).I*a.T*b
        return list(np.array(k.T).reshape(-1))

    except ImportError:
        LogError("Function needs numpy, but I could not import it.")
        raise
2343 
def PlotEnrichment(self, score_col, class_col, score_dir='-',
                   class_dir='-', class_cutoff=2.0,
                   style='-', title=None, x_title=None, y_title=None,
                   clear=True, save=None):
    '''
    Plot an enrichment curve using matplotlib of column *score_col* classified
    according to *class_col*.

    For more information about parameters of the enrichment, see
    :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.

    :returns: the ``matplotlib.pyplot`` module after plotting, or *None* if
              :meth:`ComputeEnrichment` returned no enrichment

    :warning: The function depends on *matplotlib*
    '''
    try:
        import matplotlib.pyplot as plt

        enr = self.ComputeEnrichment(score_col, class_col, score_dir,
                                     class_dir, class_cutoff)

        # ComputeEnrichment returns None when there is nothing to plot;
        # unpacking that would raise a TypeError. Same guard as in PlotROC.
        if not enr:
            return None

        enrx, enry = enr

        if not title:
            title = 'Enrichment of %s'%score_col

        if not x_title:
            x_title = '% database'

        if not y_title:
            y_title = '% positives'

        if clear:
            plt.clf()

        plt.plot(enrx, enry, style)

        plt.title(title, size='x-large', fontweight='bold')
        plt.ylabel(y_title, size='x-large')
        plt.xlabel(x_title, size='x-large')

        if save:
            plt.savefig(save)

        return plt
    except ImportError:
        LogError("Function needs matplotlib, but I could not import it.")
        raise
2388 
def ComputeEnrichment(self, score_col, class_col, score_dir='-',
                      class_dir='-', class_cutoff=2.0):
    '''
    Computes the enrichment of column *score_col* classified according to
    *class_col*.

    The data points have to be split into positives and negatives, which can
    be done in two ways:

    - with a 'bool' type column (*class_col*) holding *True* for positives
      and *False* for negatives

    - with a numeric classification column (*class_col*), a cutoff value
      (*class_cutoff*) and a direction (*class_dir*), in which case the
      classification is generated on the fly

      * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
      * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives

    During the calculation, the table will be sorted according to *score_dir*,
    where a '-' values means smallest values first and therefore, the smaller
    the value, the better.

    :returns: tuple of two lists (fraction of rows processed, fraction of
              positives found), both starting at 0 and ending at 1, or *None*
              if no row was usable or no positive was found

    :raises: :class:`TypeError` if *score_col* is not numeric or *class_col*
             is neither numeric nor bool, :class:`ValueError` for an unknown
             direction

    :warning: If either the value of *class_col* or *score_col* is *None*, the
              data in this row is ignored.
    '''
    ALLOWED_DIR = ['+','-']

    score_idx = self.GetColIndex(score_col)
    if self.col_types[score_idx] not in ('int', 'float'):
        raise TypeError("Score column must be numeric type")

    class_idx = self.GetColIndex(class_col)
    class_type = self.col_types[class_idx]
    if class_type not in ('int', 'float', 'bool'):
        raise TypeError("Classifier column must be numeric or bool type")

    if not (score_dir in ALLOWED_DIR and class_dir in ALLOWED_DIR):
        raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))

    self.Sort(score_col, score_dir)

    x = [0]
    y = [0]
    num_seen = 0
    num_positive = 0
    prev_score = None

    for row in self.rows:
        class_val = row[class_idx]
        score_val = row[score_idx]
        if class_val is None or score_val is None:
            continue
        if prev_score is None:
            prev_score = score_val
        # A curve point is emitted only when the score changes, so rows with
        # equal scores are accumulated into a single step.
        if score_val != prev_score:
            x.append(num_seen)
            y.append(num_positive)
            prev_score = score_val
        num_seen += 1
        if class_type == 'bool':
            is_positive = class_val == True
        elif class_dir == '-':
            is_positive = class_val <= class_cutoff
        else:
            is_positive = class_val >= class_cutoff
        if is_positive:
            num_positive += 1
    x.append(num_seen)
    y.append(num_positive)

    # Without any usable row or any positive the curve cannot be normalized.
    x_total = x[-1]
    y_total = y[-1]
    if x_total == 0 or y_total == 0:
        return None

    return [float(v)/x_total for v in x], [float(v)/y_total for v in y]
2468 
def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
                         class_dir='-', class_cutoff=2.0):
    '''
    Computes the area under the curve of the enrichment using the trapezoidal
    rule.

    For more information about parameters of the enrichment, see
    :meth:`ComputeEnrichment`.

    :returns: area under the enrichment curve, or *None* if
              :meth:`ComputeEnrichment` returned *None*

    :warning: The function depends on *numpy*
    '''
    try:
        import numpy as np

        enr = self.ComputeEnrichment(score_col, class_col, score_dir,
                                     class_dir, class_cutoff)

        if enr is None:
            return None

        # NumPy 2.0 renamed trapz to trapezoid; use whichever is available.
        trapezoid = getattr(np, 'trapezoid', None) or np.trapz
        return trapezoid(enr[1], enr[0])
    except ImportError:
        LogError("Function needs numpy, but I could not import it.")
        raise
2492 
def ComputeROC(self, score_col, class_col, score_dir='-',
               class_dir='-', class_cutoff=2.0):
    '''
    Computes the receiver operating characteristics (ROC) of column *score_col*
    classified according to *class_col*.

    The data points have to be split into positives and negatives, which can
    be done in two ways:

    - with a 'bool' column (*class_col*) holding True for positives and False
      for negatives
    - with a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
      and a direction (*class_dir*), in which case the classification is
      generated on the fly

      - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
      - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives

    During the calculation, the table will be sorted according to *score_dir*,
    where a '-' values means smallest values first and therefore, the smaller
    the value, the better.

    If *class_col* does not contain any positives (i.e. value is True (if column
    is of type bool) or evaluated to True (if column is of type int or float
    (depending on *class_dir* and *class_cutoff*))) the ROC is not defined and
    the function will return *None*. *None* is also returned if there are no
    negatives.

    :returns: tuple of two lists (false positive rate, true positive rate),
              or *None*

    :raises: :class:`TypeError` if *score_col* is not numeric or *class_col*
             is neither numeric nor bool, :class:`ValueError` for an unknown
             direction

    :warning: If either the value of *class_col* or *score_col* is *None*, the
              data in this row is ignored.
    '''
    ALLOWED_DIR = ['+','-']

    score_idx = self.GetColIndex(score_col)
    if self.col_types[score_idx] not in ('int', 'float'):
        raise TypeError("Score column must be numeric type")

    class_idx = self.GetColIndex(class_col)
    class_type = self.col_types[class_idx]
    if class_type not in ('int', 'float', 'bool'):
        raise TypeError("Classifier column must be numeric or bool type")

    if not (score_dir in ALLOWED_DIR and class_dir in ALLOWED_DIR):
        raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))

    self.Sort(score_col, score_dir)

    x = [0]
    y = [0]
    true_pos = 0
    false_pos = 0
    prev_score = None

    for row in self.rows:
        class_val = row[class_idx]
        score_val = row[score_idx]
        if class_val is None or score_val is None:
            continue
        if prev_score is None:
            prev_score = score_val
        # A curve point is emitted only when the score changes, so rows with
        # equal scores are accumulated into a single step.
        if score_val != prev_score:
            x.append(false_pos)
            y.append(true_pos)
            prev_score = score_val
        if class_type == 'bool':
            is_positive = class_val == True
        elif class_dir == '-':
            is_positive = class_val <= class_cutoff
        else:
            is_positive = class_val >= class_cutoff
        if is_positive:
            true_pos += 1
        else:
            false_pos += 1
    x.append(false_pos)
    y.append(true_pos)

    # Without negatives or without positives the rates cannot be normalized.
    x_total = x[-1]
    y_total = y[-1]
    if x_total == 0 or y_total == 0:
        return None

    return [float(v)/x_total for v in x], [float(v)/y_total for v in y]
2579 
def ComputeROCAUC(self, score_col, class_col, score_dir='-',
                  class_dir='-', class_cutoff=2.0):
    '''
    Computes the area under the curve of the receiver operating characteristics
    using the trapezoidal rule.

    For more information about parameters of the ROC, see
    :meth:`ComputeROC`.

    :returns: area under the ROC curve, or *None* if :meth:`ComputeROC`
              returned no curve

    :warning: The function depends on *numpy*
    '''
    try:
        import numpy as np

        roc = self.ComputeROC(score_col, class_col, score_dir,
                              class_dir, class_cutoff)

        if not roc:
            return None

        # NumPy 2.0 renamed trapz to trapezoid; use whichever is available.
        trapezoid = getattr(np, 'trapezoid', None) or np.trapz
        return trapezoid(roc[1], roc[0])
    except ImportError:
        LogError("Function needs numpy, but I could not import it.")
        raise
2603 
def PlotROC(self, score_col, class_col, score_dir='-',
            class_dir='-', class_cutoff=2.0,
            style='-', title=None, x_title=None, y_title=None,
            clear=True, save=None):
    '''
    Draw the ROC curve of *score_col* classified by *class_col* with
    matplotlib.

    The ROC parameters are described in :meth:`ComputeROC`, the plotting
    parameters in :meth:`Plot`.

    :returns: the ``matplotlib.pyplot`` module after plotting, or *None* if
              :meth:`ComputeROC` returned no curve

    :warning: The function depends on *matplotlib*
    '''
    try:
        import matplotlib.pyplot as plt

        roc = self.ComputeROC(score_col, class_col, score_dir,
                              class_dir, class_cutoff)
        if not roc:
            return None
        fp_rate, tp_rate = roc

        # Fall back to default labels for anything the caller left empty.
        title = title or 'ROC of %s'%score_col
        x_title = x_title or 'false positive rate'
        y_title = y_title or 'true positive rate'

        if clear:
            plt.clf()

        plt.plot(fp_rate, tp_rate, style)

        plt.title(title, size='x-large', fontweight='bold')
        plt.ylabel(y_title, size='x-large')
        plt.xlabel(x_title, size='x-large')

        if save:
            plt.savefig(save)

        return plt
    except ImportError:
        LogError("Function needs matplotlib, but I could not import it.")
        raise
2653 
def ComputeMCC(self, score_col, class_col, score_dir='-',
               class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
    '''
    Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
    with the points classified into true positives, false positives, true
    negatives and false negatives according to a specified classification
    column (*class_col*).

    The datapoints in *score_col* and *class_col* are classified into
    positive and negative points. This can be done in two ways:

    - by using 'bool' columns which contains True for positives and False
      for negatives

    - by using 'float' or 'int' columns and specifying a cutoff value and the
      columns direction. This will generate the classification on the fly

      * if ``class_dir``/``score_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
      * if ``class_dir``/``score_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives

    The two possibilities can be used together, i.e. 'bool' type for one column
    and 'float'/'int' type and cutoff/direction for the other column.

    :returns: the MCC, or *None* (after logging a warning) if one of the
              factors in the denominator is zero

    :raises: :class:`TypeError` if one of the columns is neither numeric nor
             bool, :class:`ValueError` for an unknown direction

    :warning: If either the value of *class_col* or *score_col* is *None*, the
              data in this row is ignored.
    '''
    ALLOWED_DIR = ['+','-']

    score_idx = self.GetColIndex(score_col)
    score_type = self.col_types[score_idx]
    if score_type not in ('int', 'float', 'bool'):
        raise TypeError("Score column must be numeric or bool type")

    class_idx = self.GetColIndex(class_col)
    class_type = self.col_types[class_idx]
    if class_type not in ('int', 'float', 'bool'):
        raise TypeError("Classifier column must be numeric or bool type")

    if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
        raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))

    tp = 0
    fp = 0
    fn = 0
    tn = 0

    for row in self.rows:
        class_val = row[class_idx]
        score_val = row[score_idx]
        # A missing score is no prediction at all: skip the row instead of
        # counting it as fn/fp or comparing None against the cutoff.
        if class_val is None or score_val is None:
            continue

        if class_type == 'bool':
            is_positive = class_val == True
        else:
            is_positive = ((class_dir == '-' and class_val <= class_cutoff) or
                           (class_dir == '+' and class_val >= class_cutoff))

        if is_positive:
            if score_type == 'bool':
                predicted_positive = score_val == True
            else:
                predicted_positive = ((score_dir == '-' and score_val <= score_cutoff) or
                                      (score_dir == '+' and score_val >= score_cutoff))
            if predicted_positive:
                tp += 1
            else:
                fn += 1
        else:
            if score_type == 'bool':
                predicted_negative = score_val == False
            else:
                predicted_negative = ((score_dir == '-' and score_val > score_cutoff) or
                                      (score_dir == '+' and score_val < score_cutoff))
            if predicted_negative:
                tn += 1
            else:
                fp += 1

    mcc = None
    msg = None
    if (tp+fn) == 0:
        msg = 'factor (tp + fn) is zero'
    elif (tp+fp) == 0:
        msg = 'factor (tp + fp) is zero'
    elif (tn+fn) == 0:
        msg = 'factor (tn + fn) is zero'
    elif (tn+fp) == 0:
        msg = 'factor (tn + fp) is zero'

    if msg:
        LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
    else:
        # math.sqrt returns a float, so this is never an integer division.
        mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
    return mcc
2728 
2729 
def IsEmpty(self, col_name=None, ignore_nan=True):
    '''
    Tell whether the table, or a single column of it, is empty.

    Without *col_name* every cell of the table is inspected; with *col_name*
    only that column is inspected (through :meth:`Count`).

    NAN (or None) values are ignored by default, so a table that holds
    nothing but NAN values counts as empty. With ignore_nan=False, NAN values
    are treated like 'normal' values.

    :raises: :class:`ValueError` if *col_name* is given but the table has no
             columns at all
    '''
    # table with no columns and no rows
    if len(self.col_names) == 0:
        if col_name:
            raise ValueError('Table has no column named "%s"' % col_name)
        return True

    # column name specified
    if col_name:
        return self.Count(col_name, ignore_nan=ignore_nan) == 0

    # no column name specified -> test whole table
    for row in self.rows:
        for cell in row:
            # The first cell that counts as a value settles the answer.
            if not ignore_nan or cell is not None:
                return False
    return True
2765 
2766 
def Extend(self, tab, overwrite=None):
    """
    Append each row of *tab* to the current table. The data is appended based
    on the column names, thus the order of the table columns is *not* relevant,
    only the header names.

    If there is a column in *tab* that is not present in the current table,
    it is added to the current table with :meth:`AddCol` before the rows are
    appended.

    If the type of any column in *tab* is not the same as in the current table
    a *TypeError* is raised. The check runs before anything is changed, so the
    current table is left untouched in that case.

    Every row of *tab* is turned into a dictionary (column name to value) and
    handed to :meth:`AddRow` together with *overwrite*; see :meth:`AddRow` for
    the meaning of *overwrite*.

    :param tab: table whose rows are appended
    :param overwrite: passed on unchanged to :meth:`AddRow`

    :raises: :class:`TypeError` if a column present in both tables has
             different types
    """
    # Check that shared columns have the same type in both tables. This runs
    # first so that a failure cannot leave half-added columns behind.
    for name in self.col_names:
        if name in tab.col_names:
            curr_type = self.col_types[self.GetColIndex(name)]
            new_type = tab.col_types[tab.GetColIndex(name)]
            if curr_type != new_type:
                raise TypeError('cannot extend table, column %s in new '%name +\
                                'table different type (%s) than in '%new_type +\
                                'current table (%s)'%curr_type)

    # add column to current table if it doesn't exist
    for name, typ in zip(tab.col_names, tab.col_types):
        if name not in self.col_names:
            self.AddCol(name, typ)

    # The row count is taken once up front, so the loop bound stays fixed
    # even if AddRow grows tab.rows (e.g. when tab is this very table).
    num_rows = len(tab.rows)
    for i in range(num_rows):
        data = dict(zip(tab.col_names, tab.rows[i]))
        self.AddRow(data, overwrite)
2806 
2807 
def Merge(table1, table2, by, only_matching=False):
    """
    Returns a new table containing the data from both tables. The rows are
    combined based on the common values in the column(s) by. The option 'by' can
    be a list of column names. When this is the case, merging is based on
    multiple columns.
    For example, the two tables below

    ==== ====
    x     y
    ==== ====
     1    10
     2    15
     3    20
    ==== ====

    ==== ====
    x     u
    ==== ====
      1  100
      3  200
      4  400
    ==== ====

    when merged by column x, produce the following output:

    ===== ===== =====
    x      y     u
    ===== ===== =====
    1      10    100
    2      15    None
    3      20    200
    4      None  400
    ===== ===== =====

    The result has all columns of *table1* followed by the columns of
    *table2* that are not used for merging. If such a column name already
    exists in *table1*, a numeric suffix (``_2``, ``_3``, ...) is appended to
    it.

    :param table1: first table
    :param table2: second table
    :param by: name, or list of names, of the column(s) to merge on; they must
               exist in both tables
    :param only_matching: if True, only rows whose key occurs in both tables
                          are kept

    :raises: :class:`ValueError` if a column named in *by* is missing from one
             of the tables, or if the same key occurs more than once within
             one table
    """
    def _key(row, indices):
        return tuple([row[i] for i in indices])

    if isinstance(by, str):
        by_names = [by]
    else:
        by_names = list(by)

    # Positions of the key columns in table2, and of all remaining table2
    # columns, which are the ones that get appended to the result.
    common2_indices = [table2.col_names.index(b) for b in by_names]
    new_index = [i for i in range(len(table2.col_names))
                 if i not in common2_indices]
    col_names = [table2.col_names[i] for i in new_index]
    col_types = [table2.col_types[i] for i in new_index]

    # Rename appended columns that clash with a column name of table1.
    for i, name in enumerate(col_names):
        try_name = name
        counter = 1
        while try_name in table1.col_names:
            counter += 1
            try_name = '%s_%d' % (name, counter)
        col_names[i] = try_name

    common1_indices = [table1.col_names.index(b) for b in by_names]

    # Index the rows of both tables by key; keys must be unique per table.
    common1 = {}
    for row in table1.rows:
        key = _key(row, common1_indices)
        if key in common1:
            raise ValueError('duplicate key "%s" in first table' % (str(key)))
        common1[key] = row
    common2 = {}
    for row in table2.rows:
        key = _key(row, common2_indices)
        if key in common2:
            raise ValueError('duplicate key "%s" in second table' % (str(key)))
        common2[key] = row

    new_tab = Table(table1.col_names+col_names, table1.col_types+col_types)
    num_cols1 = len(table1.col_names)

    # Rows of table1, completed with table2 data where the key matches.
    # items() instead of the Python-2-only iteritems().
    for k, v in common1.items():
        row = v + [None] * len(new_index)
        if k in common2:
            row2 = common2[k]
            for i, index in enumerate(new_index):
                row[num_cols1+i] = row2[index]
        elif only_matching:
            continue
        new_tab.AddRow(row)
    if only_matching:
        return new_tab

    # Rows that exist only in table2: table1 columns stay None except for
    # the key columns, which are filled from table2.
    for k, v in common2.items():
        if k not in common1:
            row = [None] * num_cols1 + [v[i] for i in new_index]
            for common1_index, common2_index in zip(common1_indices, common2_indices):
                row[common1_index] = v[common2_index]
            new_tab.AddRow(row)
    return new_tab
2908