OpenStructure
table.py
Go to the documentation of this file.
1 import csv
2 import re
3 import math
4 from ost import stutil
5 import itertools
6 import operator
7 import pickle
8 import weakref
9 from ost import LogError, LogWarning, LogInfo, LogVerbose
10 
11 def MakeTitle(col_name):
12  return col_name.replace('_', ' ')
13 
14 def IsStringLike(value):
15  if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
16  return False
17  try:
18  value+''
19  return True
20  except:
21  return False
22 
23 def IsNullString(value):
24  value=value.strip().upper()
25  return value in ('', 'NULL', 'NONE', 'NA')
26 
27 def IsScalar(value):
28  if IsStringLike(value):
29  return True
30  try:
31  if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
32  return False
33  iter(value)
34  return False
35  except:
36  return True
37 
38 def GuessColumnType(iterator):
39  empty=True
40  possibilities=set(['bool', 'int', 'float'])
41  for ele in iterator:
42  str_ele=str(ele).upper()
43  if IsNullString(str_ele):
44  continue
45  empty=False
46  if 'int' in possibilities:
47  try:
48  int(str_ele)
49  except ValueError:
50  possibilities.remove('int')
51 
52  if 'float' in possibilities:
53  try:
54  float(str_ele)
55  except ValueError:
56  possibilities.remove('float')
57  if 'bool' in possibilities:
58  if str_ele not in set(['YES', 'NO', 'TRUE', 'FALSE']):
59  possibilities.remove('bool')
60 
61  if len(possibilities)==0:
62  return 'string'
63  if len(possibilities)==2:
64  return 'int'
65  if empty:
66  return 'string'
67  # return the last element available
68  return possibilities.pop()
69 
71  def __init__(self, op, lhs, rhs):
72  self.opop=op
73  self.lhslhs=lhs
74  self.rhsrhs=rhs
75  if IsScalar(lhs):
76  self.lhslhs=itertools.cyle([self.lhslhs])
77  if IsScalar(rhs):
78  self.rhsrhs=itertools.cycle([self.rhsrhs])
79  def __iter__(self):
80  for l, r in zip(self.lhslhs, self.rhsrhs):
81  if l!=None and r!=None:
82  yield self.opop(l, r)
83  else:
84  yield None
85  def __add__(self, rhs):
86  return BinaryColExpr(operator.add, self, rhs)
87 
88  def __sub__(self, rhs):
89  return BinaryColExpr(operator.sub, self, rhs)
90 
91  def __mul__(self, rhs):
92  return BinaryColExpr(operator.mul, self, rhs)
93 
94  def __div__(self, rhs):
95  return BinaryColExpr(operator.div, self, rhs)
96 
97 class TableCol:
98  def __init__(self, table, col):
99  self._table_table=table
100  if type(col)==str:
101  self.col_indexcol_index=self._table_table.GetColIndex(col)
102  else:
103  self.col_indexcol_index=col
104 
105  def __iter__(self):
106  for row in self._table_table.rows:
107  yield row[self.col_indexcol_index]
108 
109  def __len__(self):
110  return len(self._table_table.rows)
111 
112  def __getitem__(self, index):
113  return self._table_table.rows[index][self.col_indexcol_index]
114 
115  def __setitem__(self, index, value):
116  self._table_table.rows[index][self.col_indexcol_index]=value
117 
118  def __add__(self, rhs):
119  return BinaryColExpr(operator.add, self, rhs)
120 
121  def __sub__(self, rhs):
122  return BinaryColExpr(operator.sub, self, rhs)
123 
124  def __mul__(self, rhs):
125  return BinaryColExpr(operator.mul, self, rhs)
126 
127  def __div__(self, rhs):
128  return BinaryColExpr(operator.div, self, rhs)
129 
130 class TableRow:
131  """
132  Essentially a named tuple, but allows column names that are not valid
133  python variable names.
134  """
135  def __init__(self, row_data, tab):
136  self.__dict__['tab'] = weakref.proxy(tab)
137  self.__dict__['row_data'] = row_data
138 
139  def __getitem__(self, col_name):
140  if type(col_name)==int:
141  return self.row_data[col_name]
142  return self.row_data[self.tab.GetColIndex(col_name)]
143 
144  def __str__(self):
145  s = []
146  for k, v in zip(self.__dict__['tab'].col_names, self.__dict__['row_data']):
147  s.append('%s=%s' % (k, str(v)))
148  return ', '.join(s)
149 
150 
151  def __len__(self):
152  return len(self.row_data)
153 
154  def __setitem__(self, col_name, val):
155  if type(col_name)==int:
156  self.row_data[col_name] = val
157  else:
158  self.row_data[self.tab.GetColIndex(col_name)] = val
159 
160  def __getattr__(self, col_name):
161  if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
162  raise AttributeError(col_name)
163  return self.row_data[self.tab.GetColIndex(col_name)]
164 
165  def __setattr__(self, col_name, val):
166  if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
167  raise AttributeError(col_name)
168  self.row_data[self.tab.GetColIndex(col_name)] = val
169 
170 class Table(object):
171  """
172 
173  The table class provides convenient access to data in tabular form. An empty
174  table can be easily constructed as follows
175 
176  .. code-block:: python
177 
178  tab = Table()
179 
180  If you want to add columns directly when creating the table, column names
181  and *column types* can be specified as follows
182 
183  .. code-block:: python
184 
185  tab = Table(['nameX','nameY','nameZ'], 'sfb')
186 
187  this will create three columns called nameX, nameY and nameZ of type string,
188  float and bool, respectively. There will be no data in the table and thus,
189  the table will not contain any rows.
190 
191  The following *column types* are supported:
192 
193  ======= ========
194  name abbrev
195  ======= ========
196  string s
197  float f
198  int i
199  bool b
200  ======= ========
201 
202  If you want to add data to the table in addition, use the following:
203 
204  .. code-block:: python
205 
206  tab=Table(['nameX','nameY','nameZ'],
207  'sfb',
208  nameX = ['a','b','c'],
209  nameY = [0.1, 1.2, 3.414],
210  nameZ = [True, False, False])
211 
212  if values for one column is left out, they will be filled with NA, but if
213  values are specified, all values must be specified (i.e. same number of
214  values per column)
215 
216  """
217 
218  SUPPORTED_TYPES=('int', 'float', 'bool', 'string',)
219 
220 
221  def __init__(self, col_names=[], col_types=None, **kwargs):
222 
223  self.col_namescol_names=list(col_names)
224  self.commentcomment=''
225  self.namename=''
226 
227  self.col_typescol_types = self._ParseColTypes_ParseColTypes(col_types)
228  self.rowsrows=[]
229  if len(kwargs)>=0:
230  if not col_names:
231  self.col_namescol_names=[v for v in list(kwargs.keys())]
232  if not self.col_typescol_types:
233  self.col_typescol_types=['string' for u in range(len(self.col_namescol_names))]
234  if len(kwargs)>0:
235  self._AddRowsFromDict_AddRowsFromDict(kwargs)
236 
237  def __getattr__(self, col_name):
238  # pickling doesn't call the standard __init__ defined above and thus
239  # col_names might not be defined. This leads to infinite recursions.
240  # Protect against it by checking that col_names is contained in
241  # __dict__
242  if 'col_names' not in self.__dict__ or col_name not in self.col_namescol_names:
243  raise AttributeError(col_name)
244  return TableCol(self, col_name)
245 
246  @staticmethod
247  def _ParseColTypes(types, exp_num=None):
248  if types==None:
249  return None
250 
251  short2long = {'s' : 'string', 'i': 'int', 'b' : 'bool', 'f' : 'float'}
252  allowed_short = list(short2long.keys())
253  allowed_long = list(short2long.values())
254 
255  type_list = []
256 
257  # string type
258  if IsScalar(types):
259  if type(types)==str:
260  types = types.lower()
261 
262  # single value
263  if types in allowed_long:
264  type_list.append(types)
265  elif types in allowed_short:
266  type_list.append(short2long[types])
267 
268  # comma separated list of long or short types
269  elif types.find(',')!=-1:
270  for t in types.split(','):
271  if t in allowed_long:
272  type_list.append(t)
273  elif t in allowed_short:
274  type_list.append(short2long[t])
275  else:
276  raise ValueError('Unknown type %s in types %s'%(t,types))
277 
278  # string of short types
279  else:
280  for t in types:
281  if t in allowed_short:
282  type_list.append(short2long[t])
283  else:
284  raise ValueError('Unknown type %s in types %s'%(t,types))
285 
286  # non-string type
287  else:
288  raise ValueError('Col type %s must be string or list'%types)
289 
290  # list type
291  else:
292  for t in types:
293  # must be string type
294  if type(t)==str:
295  t = t.lower()
296  if t in allowed_long:
297  type_list.append(t)
298  elif t in allowed_short:
299  type_list.append(short2long[t])
300  else:
301  raise ValueError('Unknown type %s in types %s'%(t,types))
302 
303  # non-string type
304  else:
305  raise ValueError('Col type %s must be string or list'%types)
306 
307  if exp_num:
308  if len(type_list)!=exp_num:
309  raise ValueError('Parsed number of col types (%i) differs from ' + \
310  'expected (%i) in types %s'%(len(type_list),exp_num,types))
311 
312  return type_list
313 
314  def SetName(self, name):
315  '''
316  Set name of the table
317 
318  :param name: name
319  :type name: :class:`str`
320  '''
321  self.namename = name
322 
323  def GetName(self):
324  '''
325  Get name of table
326  '''
327  return self.namename
328 
329  def RenameCol(self, old_name, new_name):
330  """
331  Rename column *old_name* to *new_name*.
332 
333  :param old_name: Name of the old column
334  :param new_name: Name of the new column
335  :raises: :exc:`ValueError` when *old_name* is not a valid column
336  """
337  if old_name==new_name:
338  return
339  self.AddColAddCol(new_name, self.col_typescol_types[self.GetColIndexGetColIndex(old_name)],
340  self[old_name])
341  self.RemoveColRemoveCol(old_name)
342  def _Coerce(self, value, ty):
343  '''
344  Try to convert values (e.g. from :class:`str` type) to the specified type
345 
346  :param value: the value
347  :type value: any type
348 
349  :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
350  *bool*)
351  :type ty: :class:`str`
352  '''
353  if value=='NA' or value==None:
354  return None
355  if ty=='int':
356  return int(value)
357  if ty=='float':
358  return float(value)
359  if ty=='string':
360  return str(value)
361  if ty=='bool':
362  if isinstance(value, str) or isinstance(value, str):
363  if value.upper() in ('FALSE', 'NO',):
364  return False
365  return True
366  return bool(value)
367  raise ValueError('Unknown type %s' % ty)
368 
369  def GetColIndex(self, col):
370  '''
371  Returns the column index for the column with the given name.
372 
373  :raises: ValueError if no column with the name is found.
374  '''
375  if col not in self.col_namescol_names:
376  raise ValueError('Table has no column named "%s"' % col)
377  return self.col_namescol_names.index(col)
378 
379  def GetColNames(self):
380  '''
381  Returns a list containing all column names.
382  '''
383  return self.col_namescol_names
384 
385  def SearchColNames(self, regex):
386  '''
387  Returns a list of column names matching the regex.
388 
389  :param regex: regex pattern
390  :type regex: :class:`str`
391 
392  :returns: :class:`list` of column names (:class:`str`)
393  '''
394  matching_names = []
395  for name in self.col_namescol_names:
396  matches = re.search(regex, name)
397  if matches:
398  matching_names.append(name)
399  return matching_names
400 
401  def HasCol(self, col):
402  '''
403  Checks if the column with a given name is present in the table.
404  '''
405  return col in self.col_namescol_names
406 
407  def __getitem__(self, k):
408  if type(k)==int:
409  return TableCol(self, self.col_namescol_names[k])
410  else:
411  return TableCol(self, k)
412 
413  def __setitem__(self, k, value):
414  col_index=k
415  if type(k)!=int:
416  col_index=self.GetColIndexGetColIndex(k)
417  if IsScalar(value):
418  value=itertools.cycle([value])
419  for r, v in zip(self.rowsrows, value):
420  r[col_index]=v
421 
422  def ToString(self, float_format='%.3f', int_format='%d', rows=None):
423  '''
424  Convert the table into a string representation.
425 
426  The output format can be modified for int and float type columns by
427  specifying a formatting string for the parameters *float_format* and
428  *int_format*.
429 
430  The option *rows* specify the range of rows to be printed. The parameter
431  must be a type that supports indexing (e.g. a :class:`list`) containing the
432  start and end row *index*, e.g. [start_row_idx, end_row_idx].
433 
434  :param float_format: formatting string for float columns
435  :type float_format: :class:`str`
436 
437  :param int_format: formatting string for int columns
438  :type int_format: :class:`str`
439 
440  :param rows: iterable containing start and end row *index*
441  :type rows: iterable containing :class:`ints <int>`
442  '''
443  widths=[len(cn) for cn in self.col_namescol_names]
444  sel_rows=self.rowsrows
445  if rows:
446  sel_rows=self.rowsrows[rows[0]:rows[1]]
447  for row in sel_rows:
448  for i, (ty, col) in enumerate(zip(self.col_typescol_types, row)):
449  if col==None:
450  widths[i]=max(widths[i], len('NA'))
451  elif ty=='float':
452  widths[i]=max(widths[i], len(float_format % col))
453  elif ty=='int':
454  widths[i]=max(widths[i], len(int_format % col))
455  else:
456  widths[i]=max(widths[i], len(str(col)))
457  s=''
458  if self.commentcomment:
459  s+=''.join(['# %s\n' % l for l in self.commentcomment.split('\n')])
460  total_width=sum(widths)+2*len(widths)
461  for width, col_name in zip(widths, self.col_namescol_names):
462  s+=col_name.center(width+2)
463  s+='\n%s\n' % ('-'*total_width)
464  for row in sel_rows:
465  for width, ty, col in zip(widths, self.col_typescol_types, row):
466  cs=''
467  if col==None:
468  cs='NA'.center(width+2)
469  elif ty=='float':
470  cs=(float_format % col).rjust(width+2)
471  elif ty=='int':
472  cs=(int_format % col).rjust(width+2)
473  else:
474  cs=' '+str(col).ljust(width+1)
475  s+=cs
476  s+='\n'
477  return s
478 
479  def __str__(self):
480  return self.ToStringToString()
481 
482  def Stats(self, col):
483  idx = self.GetColIndexGetColIndex(col)
484  text ='''
485 Statistics for column %(col)s
486 
487  Number of Rows : %(num)d
488  Number of Rows Not None: %(num_non_null)d
489  Mean : %(mean)f
490  Median : %(median)f
491  Standard Deviation : %(stddev)f
492  Min : %(min)f
493  Max : %(max)f
494 '''
495  data = {
496  'col' : col,
497  'num' : len(self.rowsrows),
498  'num_non_null' : self.CountCount(col),
499  'median' : self.MedianMedian(col),
500  'mean' : self.MeanMean(col),
501  'stddev' : self.StdDevStdDev(col),
502  'min' : self.MinMin(col),
503  'max' : self.MaxMax(col),
504  }
505  return text % data
506 
507  def _AddRowsFromDict(self, d, overwrite=None):
508  '''
509  Add one or more rows from a :class:`dictionary <dict>`.
510 
511  If *overwrite* is not None and set to an existing column name, the specified
512  column in the table is searched for the first occurrence of a value matching
513  the value of the column with the same name in the dictionary. If a matching
514  value is found, the row is overwritten with the dictionary. If no matching
515  row is found, a new row is appended to the table.
516 
517  :param d: dictionary containing the data
518  :type d: :class:`dict`
519 
520  :param overwrite: column name to overwrite existing row if value in
521  column *overwrite* matches
522  :type overwrite: :class:`str`
523 
524  :raises: :class:`ValueError` if multiple rows are added but the number of
525  data items is different for different columns.
526  '''
527  # get column indices
528  idxs = [self.GetColIndexGetColIndex(k) for k in list(d.keys())]
529 
530  # convert scalar values to list
531  old_len = None
532  for k,v in d.items():
533  if IsScalar(v):
534  v = [v]
535  d[k] = v
536  if not old_len:
537  old_len = len(v)
538  elif old_len!=len(v):
539  raise ValueError("Cannot add rows: length of data must be equal " + \
540  "for all columns in %s"%str(d))
541 
542  # convert column based dict to row based dict and create row and add data
543  for i,data in enumerate(zip(*list(d.values()))):
544  new_row = [None for a in range(len(self.col_namescol_names))]
545  for idx,v in zip(idxs,data):
546  new_row[idx] = self._Coerce_Coerce(v, self.col_typescol_types[idx])
547 
548  # partially overwrite existing row with new data
549  if overwrite:
550  overwrite_idx = self.GetColIndexGetColIndex(overwrite)
551  added = False
552  for i,r in enumerate(self.rowsrows):
553  if r[overwrite_idx]==new_row[overwrite_idx]:
554  for j,e in enumerate(self.rowsrows[i]):
555  if new_row[j]==None:
556  new_row[j] = e
557  self.rowsrows[i] = new_row
558  added = True
559  break
560 
561  # if not overwrite or overwrite did not find appropriate row
562  if not overwrite or not added:
563  self.rowsrows.append(new_row)
564 
565  def PairedTTest(self, col_a, col_b):
566  """
567  Two-sided test for the null-hypothesis that two related samples
568  have the same average (expected values).
569 
570  :param col_a: First column
571  :type col_a: :class:`str`
572  :param col_b: Second column
573  :type col_b: :class:`str`
574 
575  :returns: P-value between 0 and 1 that the two columns have the
576  same average. The smaller the value, the less related the two
577  columns are.
578  """
579  from scipy.stats import ttest_rel
580  xs = []
581  ys = []
582  for x, y in self.ZipZip(col_a, col_b):
583  if x!=None and y!=None:
584  xs.append(x)
585  ys.append(y)
586  result = ttest_rel(xs, ys)
587  return result[1]
588 
589  def AddRow(self, data, overwrite=None):
590  """
591  Add a row to the table.
592 
593  *data* may either be a dictionary or a list-like object:
594 
595  - If *data* is a dictionary, the keys in the dictionary must match the
596  column names. Columns not found in the dict will be initialized to None.
597  If the dict contains list-like objects, multiple rows will be added, if
598  the number of items in all list-like objects is the same, otherwise a
599  :class:`ValueError` is raised.
600 
601  - If *data* is a list-like object, the row is initialized from the values
602  in *data*. The number of items in *data* must match the number of
603  columns in the table. A :class:`ValuerError` is raised otherwise. The
604  values are added in the order specified in the list, thus, the order of
605  the data must match the columns.
606 
607  If *overwrite* is not None and set to an existing column name, the specified
608  column in the table is searched for the first occurrence of a value matching
609  the value of the column with the same name in the dictionary. If a matching
610  value is found, the row is overwritten with the dictionary. If no matching
611  row is found, a new row is appended to the table.
612 
613  :param data: data to add
614  :type data: :class:`dict` or *list-like* object
615 
616  :param overwrite: column name to overwrite existing row if value in
617  column *overwrite* matches
618  :type overwrite: :class:`str`
619 
620  :raises: :class:`ValueError` if *list-like* object is used and number of
621  items does *not* match number of columns in table.
622 
623  :raises: :class:`ValueError` if *dict* is used and multiple rows are added
624  but the number of data items is different for different columns.
625 
626  **Example:** add multiple data rows to a subset of columns using a dictionary
627 
628  .. code-block:: python
629 
630  # create table with three float columns
631  tab = Table(['x','y','z'], 'fff')
632 
633  # add rows from dict
634  data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
635  tab.AddRow(data)
636  print tab
637 
638  '''
639  will produce the table
640 
641  ==== ==== ====
642  x y z
643  ==== ==== ====
644  1.20 NA 1.60
645  1.60 NA 5.30
646  ==== ==== ====
647  '''
648 
649  # overwrite the row with x=1.2 and add row with x=1.9
650  data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
651  tab.AddRow(data, overwrite='x')
652  print tab
653 
654  '''
655  will produce the table
656 
657  ==== ==== ====
658  x y z
659  ==== ==== ====
660  1.20 NA 7.90
661  1.60 NA 5.30
662  1.90 NA 3.50
663  ==== ==== ====
664  '''
665  """
666  if type(data)==dict:
667  self._AddRowsFromDict_AddRowsFromDict(data, overwrite)
668  else:
669  if len(data)!=len(self.col_namescol_names):
670  msg='data array must have %d elements, not %d'
671  raise ValueError(msg % (len(self.col_namescol_names), len(data)))
672  new_row = [self._Coerce_Coerce(v, t) for v, t in zip(data, self.col_typescol_types)]
673 
674  # fully overwrite existing row with new data
675  if overwrite:
676  overwrite_idx = self.GetColIndexGetColIndex(overwrite)
677  added = False
678  for i,r in enumerate(self.rowsrows):
679  if r[overwrite_idx]==new_row[overwrite_idx]:
680  self.rowsrows[i] = new_row
681  added = True
682  break
683 
684  # if not overwrite or overwrite did not find appropriate row
685  if not overwrite or not added:
686  self.rowsrows.append(new_row)
687 
688  def RemoveCol(self, col):
689  """
690  Remove column with the given name from the table.
691 
692  :param col: name of column to remove
693  :type col: :class:`str`
694  """
695  idx = self.GetColIndexGetColIndex(col)
696  del self.col_namescol_names[idx]
697  del self.col_typescol_types[idx]
698  for row in self.rowsrows:
699  del row[idx]
700 
701  def AddCol(self, col_name, col_type, data=None):
702  """
703  Add a column to the right of the table.
704 
705  :param col_name: name of new column
706  :type col_name: :class:`str`
707 
708  :param col_type: type of new column (long versions: *int*, *float*, *bool*,
709  *string* or short versions: *i*, *f*, *b*, *s*)
710  :type col_type: :class:`str`
711 
712  :param data: data to add to new column
713  :type data: scalar or iterable
714 
715  **Example:**
716 
717  .. code-block:: python
718 
719  tab = Table(['x'], 'f', x=range(5))
720  tab.AddCol('even', 'bool', itertools.cycle([True, False]))
721  print tab
722 
723  '''
724  will produce the table
725 
726  ==== ====
727  x even
728  ==== ====
729  0 True
730  1 False
731  2 True
732  3 False
733  4 True
734  ==== ====
735  '''
736 
737  If data is a constant instead of an iterable object, it's value
738  will be written into each row:
739 
740  .. code-block:: python
741 
742  tab = Table(['x'], 'f', x=range(5))
743  tab.AddCol('num', 'i', 1)
744  print tab
745 
746  '''
747  will produce the table
748 
749  ==== ====
750  x num
751  ==== ====
752  0 1
753  1 1
754  2 1
755  3 1
756  4 1
757  ==== ====
758  '''
759 
760  As a special case, if there are no previous rows, and data is not
761  None, rows are added for every item in data.
762  """
763 
764  if col_name in self.col_namescol_names:
765  raise ValueError('Column with name %s already exists'%col_name)
766 
767  col_type = self._ParseColTypes_ParseColTypes(col_type, exp_num=1)[0]
768  self.col_namescol_names.append(col_name)
769  self.col_typescol_types.append(col_type)
770 
771  if len(self.rowsrows)>0:
772  if IsScalar(data):
773  for row in self.rowsrows:
774  row.append(data)
775  else:
776  if hasattr(data, '__len__') and len(data)!=len(self.rowsrows):
777  self.col_namescol_names.pop()
778  self.col_typescol_types.pop()
779  raise ValueError('Length of data (%i) must correspond to number of '%len(data) +\
780  'existing rows (%i)'%len(self.rowsrows))
781  for row, d in zip(self.rowsrows, data):
782  row.append(d)
783 
784  elif data!=None and len(self.col_namescol_names)==1:
785  if IsScalar(data):
786  self.AddRowAddRow({col_name : data})
787  else:
788  for v in data:
789  self.AddRowAddRow({col_name : v})
790 
791  def Filter(self, *args, **kwargs):
792  """
793  Returns a filtered table only containing rows matching all the predicates
794  in kwargs and args For example,
795 
796  .. code-block:: python
797 
798  tab.Filter(town='Basel')
799 
800  will return all the rows where the value of the column "town" is equal to
801  "Basel". Several predicates may be combined, i.e.
802 
803  .. code-block:: python
804 
805  tab.Filter(town='Basel', male=True)
806 
807  will return the rows with "town" equal to "Basel" and "male" equal to true.
808  args are unary callables returning true if the row should be included in the
809  result and false if not.
810  """
811  filt_tab=Table(list(self.col_namescol_names), list(self.col_typescol_types))
812  for row in self.rowsrows:
813  matches=True
814  for func in args:
815  if not func(row):
816  matches=False
817  break
818  for key, val in kwargs.items():
819  if row[self.GetColIndexGetColIndex(key)]!=val:
820  matches=False
821  break
822  if matches:
823  filt_tab.AddRow(row)
824  return filt_tab
825 
826 
827  def Select(self, query):
828 
829  """
830  Returns a new table object containing all rows matching a logical query
831  expression.
832 
833  *query* is a string containing the logical expression, that will be
834  evaluated for every row.
835 
836  Operands have to be the name of a column or an expression that can be
837  parsed to float, int, bool or string.
838  Valid operators are: and, or, !=, !, <=, >=, ==, =, <, >, +, -, \\*, /
839 
840  .. code-block:: python
841 
842  subtab = tab.Select('col_a>0.5 and (col_b=5 or col_c=5)')
843 
844  The selection query should be self explaining. Allowed parenthesis are:
845  (), [], {}, whereas parenthesis mismatches get recognized. Expressions like
846  '3<=col_a>=col_b' throw an error, due to problems in figuring out the
847  evaluation order.
848 
849  There are two special expressions:
850 
851  .. code-block:: python
852 
853  #selects rows, where 1.0<=col_a<=1.5
854  subtab = tab.Select('col_a=1.0:1.5')
855 
856  #selects rows, where col_a=1 or col_a=2 or col_a=3
857  subtab = tab.Select('col_a=1,2,3')
858 
859  Only consistent types can be compared. If col_a is of type string and col_b
860  is of type int, following expression would throw an error: 'col_a<col_b'
861  """
862 
863  try:
864  from .table_selector import TableSelector
865  except:
866  raise ImportError("Tried to import from the file table_selector.py, but could not find it!")
867 
868  selector=TableSelector(self.col_typescol_types, self.col_namescol_names, query)
869 
870  selected_tab=Table(list(self.col_namescol_names), list(self.col_typescol_types))
871 
872  for row in self.rowsrows:
873  if selector.EvaluateRow(row):
874  selected_tab.AddRow(row)
875 
876  return selected_tab
877 
878 
879  @staticmethod
880  def _LoadOST(stream_or_filename):
881  fieldname_pattern=re.compile(r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
882  values_pattern=re.compile("([^\" ]+|\"[^\"]*\")+")
883  file_opened=False
884  if not hasattr(stream_or_filename, 'read'):
885  stream=open(stream_or_filename, 'r')
886  file_opened=True
887  else:
888  stream=stream_or_filename
889  header=False
890  num_lines=0
891  for line in stream:
892  line=line.strip()
893  if line.startswith('#'):
894  continue
895  if len(line)==0:
896  continue
897  num_lines+=1
898  if not header:
899  fieldnames=[]
900  fieldtypes=[]
901  for col in line.split():
902  match=fieldname_pattern.match(col)
903  if match:
904  if match.group('type'):
905  fieldtypes.append(match.group('type'))
906  else:
907  fieldtypes.append('string')
908  fieldnames.append(match.group('name'))
909  try:
910  tab=Table(fieldnames, fieldtypes)
911  except Exception as e:
912  # potentially fails if we read in crap... clean up and pass on error
913  if file_opened:
914  stream.close()
915  raise e
916  header=True
917  continue
918  tab.AddRow([x.strip('"') for x in values_pattern.findall(line)])
919  if file_opened:
920  stream.close()
921  if num_lines==0:
922  raise IOError("Cannot read table from empty stream")
923  return tab
924 
925  def _GuessColumnTypes(self):
926  for col_idx in range(len(self.col_namescol_names)):
927  self.col_typescol_types[col_idx]=GuessColumnType(self[self.col_namescol_names[col_idx]])
928  for row in self.rowsrows:
929  for idx in range(len(row)):
930  row[idx]=self._Coerce_Coerce(row[idx], self.col_typescol_types[idx])
931 
932  @staticmethod
933  def _LoadCSV(stream_or_filename, sep):
934  file_opened=False
935  if not hasattr(stream_or_filename, 'read'):
936  stream=open(stream_or_filename, 'r')
937  file_opened=True
938  else:
939  stream=stream_or_filename
940  reader=csv.reader(stream, delimiter=sep)
941  first=True
942  for row in reader:
943  if first:
944  header=row
945  types='s'*len(row)
946  tab=Table(header, types)
947  first=False
948  else:
949  tab.AddRow(row)
950  if file_opened:
951  stream.close()
952  if first:
953  raise IOError('trying to load table from empty CSV stream/file')
954 
955  tab._GuessColumnTypes()
956  return tab
957 
958  @staticmethod
959  def _LoadPickle(stream_or_filename):
960  file_opened=False
961  if not hasattr(stream_or_filename, 'read'):
962  stream=open(stream_or_filename, 'rb')
963  file_opened=True
964  else:
965  stream=stream_or_filename
966  tab = pickle.load(stream)
967  if file_opened:
968  stream.close()
969  return tab
970 
971  @staticmethod
972  def _GuessFormat(filename):
973  try:
974  filename = filename.name
975  except AttributeError as e:
976  pass
977  if filename.endswith('.csv'):
978  return 'csv'
979  elif filename.endswith('.pickle'):
980  return 'pickle'
981  else:
982  return 'ost'
983 
984 
985  @staticmethod
986  def Load(stream_or_filename, format='auto', sep=','):
987  """
988  Load table from stream or file with given name.
989 
990  By default, the file format is set to *auto*, which tries to guess the file
991  format from the file extension. The following file extensions are
992  recognized:
993 
994  ============ ======================
995  extension recognized format
996  ============ ======================
997  .csv comma separated values
998  .pickle pickled byte stream
999  <all others> ost-specific format
1000  ============ ======================
1001 
1002  Thus, *format* must be specified for reading file with different filename
1003  extensions.
1004 
1005  The following file formats are understood:
1006 
1007  - ost
1008 
1009  This is an ost-specific, but still human readable file format. The file
1010  (stream) must start with header line of the form
1011 
1012  col_name1[type1] <col_name2[type2]>...
1013 
1014  The types given in brackets must be one of the data types the
1015  :class:`Table` class understands. Each following line in the file then must
1016  contains exactly the same number of data items as listed in the header. The
1017  data items are automatically converted to the column format. Lines starting
1018  with a '#' and empty lines are ignored.
1019 
1020  - pickle
1021 
1022  Deserializes the table from a pickled byte stream.
1023 
1024  - csv
1025 
1026  Reads the table from comma separated values stream. Since there is no
1027  explicit type information in the csv file, the column types are guessed,
1028  using the following simple rules:
1029 
1030  * if all values are either NA/NULL/NONE the type is set to string.
1031  * if all non-null values are convertible to float/int the type is set to
1032  float/int.
1033  * if all non-null values are true/false/yes/no, the value is set to bool.
1034  * for all other cases, the column type is set to string.
1035 
1036  :returns: A new :class:`Table` instance
1037  """
1038  format=format.lower()
1039  if format=='auto':
1040  format = Table._GuessFormat(stream_or_filename)
1041 
1042  if format=='ost':
1043  return Table._LoadOST(stream_or_filename)
1044  if format=='csv':
1045  return Table._LoadCSV(stream_or_filename, sep=sep)
1046  if format=='pickle':
1047  return Table._LoadPickle(stream_or_filename)
1048  raise ValueError('unknown format ""' % format)
1049 
1050  def Sort(self, by, order='+'):
1051  """
1052  Performs an in-place sort of the table, based on column *by*.
1053 
1054  :param by: column name by which to sort
1055  :type by: :class:`str`
1056 
1057  :param order: ascending (``-``) or descending (``+``) order
1058  :type order: :class:`str` (i.e. *+*, *-*)
1059  """
1060  sign=-1
1061  if order=='-':
1062  sign=1
1063  key_index=self.GetColIndexGetColIndex(by)
1064  def _key_cmp(lhs, rhs):
1065  a = lhs[key_index]
1066  b = rhs[key_index]
1067  # mimic behaviour of the cmp function from Python2 that happily
1068  # compared None values
1069  if a is None or b is None:
1070  if a is None and b is not None:
1071  return -1 * sign
1072  if b is None and a is not None:
1073  return 1 * sign
1074  return 0
1075  return sign*((a > b) - (a < b))
1076 
1077  import functools
1078  self.rowsrows=sorted(self.rowsrows, key=functools.cmp_to_key(_key_cmp))
1079 
1080  def GetUnique(self, col, ignore_nan=True):
1081  """
1082  Extract a list of all unique values from one column.
1083 
1084  :param col: column name
1085  :type col: :class:`str`
1086 
1087  :param ignore_nan: ignore all *None* values
1088  :type ignore_nan: :class:`bool`
1089  """
1090  idx = self.GetColIndexGetColIndex(col)
1091  seen = {}
1092  result = []
1093  for row in self.rowsrows:
1094  item = row[idx]
1095  if item!=None or ignore_nan==False:
1096  if item in seen: continue
1097  seen[item] = 1
1098  result.append(item)
1099  return result
1100 
1101  def Zip(self, *args):
1102  """
1103  Allows to conveniently iterate over a selection of columns, e.g.
1104 
1105  .. code-block:: python
1106 
1107  tab = Table.Load('...')
1108  for col1, col2 in tab.Zip('col1', 'col2'):
1109  print col1, col2
1110 
1111  is a shortcut for
1112 
1113  .. code-block:: python
1114 
1115  tab = Table.Load('...')
1116  for col1, col2 in zip(tab['col1'], tab['col2']):
1117  print col1, col2
1118  """
1119  return list(zip(*[self[arg] for arg in args]))
1120 
1121  def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
1122  z_title=None, x_range=None, y_range=None, z_range=None,
1123  color=None, plot_if=None, legend=None,
1124  num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False,
1125  labels=None, max_num_labels=None, title=None, clear=True, save=False,
1126  **kwargs):
1127  """
1128  Function to plot values from your table in 1, 2 or 3 dimensions using
1129  `Matplotlib <http://matplotlib.sourceforge.net>`__
1130 
1131  :param x: column name for first dimension
1132  :type x: :class:`str`
1133 
1134  :param y: column name for second dimension
1135  :type y: :class:`str`
1136 
1137  :param z: column name for third dimension
1138  :type z: :class:`str`
1139 
1140  :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\\**). For a
1141  complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1142  :type style: :class:`str`
1143 
1144  :param x_title: title for first dimension, if not specified it is
1145  automatically derived from column name
1146  :type x_title: :class:`str`
1147 
1148  :param y_title: title for second dimension, if not specified it is
1149  automatically derived from column name
1150  :type y_title: :class:`str`
1151 
1152  :param z_title: title for third dimension, if not specified it is
1153  automatically derived from column name
1154  :type z_title: :class:`str`
1155 
1156  :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1157  :type x_range: :class:`list` of length two
1158 
1159  :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1160  :type y_range: :class:`list` of length two
1161 
1162  :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
1163  :type z_range: :class:`list` of length two
1164 
1165  :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
1166  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1167  :type color: :class:`str`
1168 
1169  :param plot_if: callable which returnes *True* if row should be plotted. Is
1170  invoked like ``plot_if(self, row)``
1171  :type plot_if: callable
1172 
1173  :param legend: legend label for data series
1174  :type legend: :class:`str`
1175 
1176  :param num_z_levels: number of levels for third dimension
1177  :type num_z_levels: :class:`int`
1178 
1179  :param diag_line: draw diagonal line
1180  :type diag_line: :class:`bool`
1181 
1182  :param labels: column name containing labels to put on x-axis for one
1183  dimensional plot
1184  :type labels: :class:`str`
1185 
1186  :param max_num_labels: limit maximum number of labels
1187  :type max_num_labels: :class:`int`
1188 
1189  :param title: plot title, if not specified it is automatically derived from
1190  plotted column names
1191  :type title: :class:`str`
1192 
1193  :param clear: clear old data from plot
1194  :type clear: :class:`bool`
1195 
1196  :param save: filename for saving plot
1197  :type save: :class:`str`
1198 
1199  :param z_contour: draw contour lines
1200  :type z_contour: :class:`bool`
1201 
1202  :param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
1203  'linear')
1204  :type z_interpol: :class:`str`
1205 
1206  :param \\*\\*kwargs: additional arguments passed to matplotlib
1207 
1208  :returns: the ``matplotlib.pyplot`` module
1209 
1210  **Examples:** simple plotting functions
1211 
1212  .. code-block:: python
1213 
1214  tab = Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
1215  b=[x/2.0 for x in range(1,6)],
1216  c=[math.cos(x) for x in range(0,5)],
1217  d=range(3,8))
1218 
1219  # one dimensional plot of column 'd' vs. index
1220  plt = tab.Plot('d')
1221  plt.show()
1222 
1223  # two dimensional plot of 'a' vs. 'c'
1224  plt = tab.Plot('a', y='c', style='o-')
1225  plt.show()
1226 
1227  # three dimensional plot of 'a' vs. 'c' with values 'b'
1228  plt = tab.Plot('a', y='c', z='b')
1229  # manually save plot to file
1230  plt.savefig("plot.png")
1231  """
1232  try:
1233  import matplotlib.pyplot as plt
1234  import matplotlib.mlab as mlab
1235  import numpy as np
1236  idx1 = self.GetColIndexGetColIndex(x)
1237  xs = []
1238  ys = []
1239  zs = []
1240 
1241  if clear:
1242  plt.figure(figsize=[8, 6])
1243 
1244  if x_title!=None:
1245  nice_x=x_title
1246  else:
1247  nice_x=MakeTitle(x)
1248 
1249  if y_title!=None:
1250  nice_y=y_title
1251  else:
1252  if y:
1253  nice_y=MakeTitle(y)
1254  else:
1255  nice_y=None
1256 
1257  if z_title!=None:
1258  nice_z = z_title
1259  else:
1260  if z:
1261  nice_z = MakeTitle(z)
1262  else:
1263  nice_z = None
1264 
1265  if x_range and (IsScalar(x_range) or len(x_range)!=2):
1266  raise ValueError('parameter x_range must contain exactly two elements')
1267  if y_range and (IsScalar(y_range) or len(y_range)!=2):
1268  raise ValueError('parameter y_range must contain exactly two elements')
1269  if z_range and (IsScalar(z_range) or len(z_range)!=2):
1270  raise ValueError('parameter z_range must contain exactly two elements')
1271 
1272  if color:
1273  kwargs['color']=color
1274  if legend:
1275  kwargs['label']=legend
1276  if y and z:
1277  idx3 = self.GetColIndexGetColIndex(z)
1278  idx2 = self.GetColIndexGetColIndex(y)
1279  for row in self.rowsrows:
1280  if row[idx1]!=None and row[idx2]!=None and row[idx3]!=None:
1281  if plot_if and not plot_if(self, row):
1282  continue
1283  xs.append(row[idx1])
1284  ys.append(row[idx2])
1285  zs.append(row[idx3])
1286  levels = []
1287  if z_range:
1288  z_spacing = (z_range[1] - z_range[0]) / num_z_levels
1289  l = z_range[0]
1290  else:
1291  l = self.MinMin(z)
1292  z_spacing = (self.MaxMax(z) - l) / num_z_levels
1293 
1294  for i in range(0,num_z_levels+1):
1295  levels.append(l)
1296  l += z_spacing
1297 
1298  xi = np.linspace(min(xs),max(xs),len(xs)*10)
1299  yi = np.linspace(min(ys),max(ys),len(ys)*10)
1300  zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)
1301 
1302  if z_contour:
1303  plt.contour(xi,yi,zi,levels,linewidths=0.5,colors='k')
1304 
1305  plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
1306  plt.colorbar(ticks=levels)
1307 
1308  elif y:
1309  idx2=self.GetColIndexGetColIndex(y)
1310  for row in self.rowsrows:
1311  if row[idx1]!=None and row[idx2]!=None:
1312  if plot_if and not plot_if(self, row):
1313  continue
1314  xs.append(row[idx1])
1315  ys.append(row[idx2])
1316  plt.plot(xs, ys, style, **kwargs)
1317 
1318  else:
1319  label_vals=[]
1320 
1321  if labels:
1322  label_idx=self.GetColIndexGetColIndex(labels)
1323  for row in self.rowsrows:
1324  if row[idx1]!=None:
1325  if plot_if and not plot_if(self, row):
1326  continue
1327  xs.append(row[idx1])
1328  if labels:
1329  label_vals.append(row[label_idx])
1330  plt.plot(xs, style, **kwargs)
1331  if labels:
1332  interval = 1
1333  if max_num_labels:
1334  if len(label_vals)>max_num_labels:
1335  interval = int(math.ceil(float(len(label_vals))/max_num_labels))
1336  label_vals = label_vals[::interval]
1337  plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
1338  size='x-small')
1339 
1340  if title==None:
1341  if nice_z:
1342  title = '%s of %s vs. %s' % (nice_z, nice_x, nice_y)
1343  elif nice_y:
1344  title = '%s vs. %s' % (nice_x, nice_y)
1345  else:
1346  title = nice_x
1347 
1348  plt.title(title, size='x-large', fontweight='bold',
1349  verticalalignment='bottom')
1350 
1351  if legend:
1352  plt.legend(loc=0)
1353 
1354  if x and y:
1355  plt.xlabel(nice_x, size='x-large')
1356  if x_range:
1357  plt.xlim(x_range[0], x_range[1])
1358  if y_range:
1359  plt.ylim(y_range[0], y_range[1])
1360  if diag_line:
1361  plt.plot(x_range, y_range, '-', color='black')
1362 
1363  plt.ylabel(nice_y, size='x-large')
1364  else:
1365  if y_range:
1366  plt.ylim(y_range[0], y_range[1])
1367  if x_title:
1368  plt.xlabel(x_title, size='x-large')
1369  plt.ylabel(nice_y, size='x-large')
1370  if save:
1371  plt.savefig(save)
1372  return plt
1373  except ImportError:
1374  LogError("Function needs numpy and matplotlib, but I could not import it.")
1375  raise
1376 
1377  def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
1378  histtype='stepfilled', align='mid', x_title=None,
1379  y_title=None, title=None, clear=True, save=False,
1380  color=None, y_range=None):
1381  """
1382  Create a histogram of the data in col for the range *x_range*, split into
1383  *num_bins* bins and plot it using Matplotlib.
1384 
1385  :param col: column name with data
1386  :type col: :class:`str`
1387 
1388  :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1389  :type x_range: :class:`list` of length two
1390 
1391  :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1392  :type y_range: :class:`list` of length two
1393 
1394  :param num_bins: number of bins in range
1395  :type num_bins: :class:`int`
1396 
1397  :param color: Color to be used for the histogram. If not set, color will be
1398  determined by matplotlib
1399  :type color: :class:`str`
1400 
1401  :param normed: normalize histogram
1402  :type normed: :class:`bool`
1403 
1404  :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
1405  *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1406  :type histtype: :class:`str`
1407 
1408  :param align: style of histogram (*left*, *mid*, *right*). See
1409  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1410  :type align: :class:`str`
1411 
1412  :param x_title: title for first dimension, if not specified it is
1413  automatically derived from column name
1414  :type x_title: :class:`str`
1415 
1416  :param y_title: title for second dimension, if not specified it is
1417  automatically derived from column name
1418  :type y_title: :class:`str`
1419 
1420  :param title: plot title, if not specified it is automatically derived from
1421  plotted column names
1422  :type title: :class:`str`
1423 
1424  :param clear: clear old data from plot
1425  :type clear: :class:`bool`
1426 
1427  :param save: filename for saving plot
1428  :type save: :class:`str`
1429 
1430  **Examples:** simple plotting functions
1431 
1432  .. code-block:: python
1433 
1434  tab = Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])
1435 
1436  # one dimensional plot of column 'd' vs. index
1437  plt = tab.PlotHistogram('a')
1438  plt.show()
1439 
1440  """
1441  try:
1442  import matplotlib.pyplot as plt
1443  import numpy as np
1444 
1445  if len(self.rowsrows)==0:
1446  return None
1447  kwargs={}
1448  if color:
1449  kwargs['color']=color
1450  idx = self.GetColIndexGetColIndex(col)
1451  data = []
1452  for r in self.rowsrows:
1453  if r[idx]!=None:
1454  data.append(r[idx])
1455 
1456  if clear:
1457  plt.clf()
1458 
1459  n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
1460  normed=normed, histtype=histtype, align=align,
1461  **kwargs)
1462 
1463  if x_title!=None:
1464  nice_x=x_title
1465  else:
1466  nice_x=MakeTitle(col)
1467  plt.xlabel(nice_x, size='x-large')
1468  if y_range:
1469  plt.ylim(y_range)
1470  if y_title!=None:
1471  nice_y=y_title
1472  else:
1473  nice_y="bin count"
1474  plt.ylabel(nice_y, size='x-large')
1475 
1476  if title!=None:
1477  nice_title=title
1478  else:
1479  nice_title="Histogram of %s"%nice_x
1480  plt.title(nice_title, size='x-large', fontweight='bold')
1481 
1482  if save:
1483  plt.savefig(save)
1484  return plt
1485  except ImportError:
1486  LogError("Function needs numpy and matplotlib, but I could not import it.")
1487  raise
1488 
1489  def _Max(self, col):
1490  if len(self.rowsrows)==0:
1491  return None, None
1492  idx = self.GetColIndexGetColIndex(col)
1493  col_type = self.col_typescol_types[idx]
1494  if col_type=='int' or col_type=='float':
1495  max_val = -float('inf')
1496  elif col_type=='bool':
1497  max_val = False
1498  elif col_type=='string':
1499  max_val = chr(0)
1500  max_idx = None
1501  for i in range(0, len(self.rowsrows)):
1502  val = self.rowsrows[i][idx]
1503  if val and val > max_val:
1504  max_val = self.rowsrows[i][idx]
1505  max_idx = i
1506  return max_val, max_idx
1507 
1508  def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True, xlabels_rotation='horizontal', y_title=None, title=None,
1509  colors=None, width=0.8, bottom=0, legend=False, legend_names=None, show=False, save=False):
1510 
1511  """
1512  Create a barplot of the data in cols. Every column will be represented
1513  at one position. If there are several rows, each column will be grouped
1514  together.
1515 
1516  :param cols: List of column names. Every column will be represented as a
1517  single bar. If cols is None, every column of the table gets
1518  plotted.
1519  :type cols: :class:`list`
1520 
1521  :param rows: List of row indices. Values from given rows will be plotted
1522  in parallel at one column position. If set to None, all rows
1523  of the table will be plotted. Note, that the maximum number
1524  of rows is 7.
1525  :type rows: :class:`list`
1526 
1527  :param xlabels: Label for every col on x-axis. If set to None, the column
1528  names are used. The xlabel plotting can be supressed by
1529  the parameter set_xlabel.
1530  :type xlabels: :class:`list`
1531 
1532  :param set_xlabels: Controls whether xlabels are plotted or not.
1533  :type set_xlabels: :class:`bool`
1534 
1535  :param x_labels_rotation: Can either be 'horizontal', 'vertical' or an
1536  integer, that describes the rotation in degrees.
1537 
1538  :param y_title: Y-axis description
1539  :type y_title: :class:`str`
1540 
1541  :title: Title of the plot. No title appears if set to None
1542  :type title: :class:`str`
1543 
1544  :param colors: Colors of the different bars in each group. Must be a list
1545  of valid colors in matplotlib. Length of color and rows must
1546  be consistent.
1547  :type colors: :class:`list`
1548 
1549  :param width: The available space for the groups on the x-axis is divided
1550  by the exact number of groups. The parameters width is the
1551  fraction of what is actually used. If it would be 1.0 the
1552  bars of the different groups would touch each other.
1553  Value must be between [0;1]
1554  :type width: :class:`float`
1555 
1556  :param bottom: Bottom
1557  :type bottom: :class:`float`
1558 
1559  :param legend: Legend for color explanation, the corresponding row
1560  respectively. If set to True, legend_names must be provided.
1561  :type legend: :class:`bool`
1562 
1563  :param legend_names: List of names, that describe the differently colored
1564  bars. Length must be consistent with number of rows.
1565 
1566  :param show: If set to True, the plot is directly displayed.
1567 
1568  :param save: If set, a png image with name save in the current working
1569  directory will be saved.
1570  :type save: :class:`str`
1571 
1572  """
1573  try:
1574  import numpy as np
1575  import matplotlib.pyplot as plt
1576  except:
1577  raise ImportError('PlotBar relies on numpy and matplotlib, but I could' \
1578  'not import it!')
1579 
1580  standard_colors=['b','g','y','c','m','r','k']
1581  data=[]
1582 
1583  if cols==None:
1584  cols=self.col_namescol_names
1585 
1586  if width<=0 or width>1:
1587  raise ValueError('Width must be in [0;1]')
1588 
1589  if rows==None:
1590  if len(self.rowsrows)>7:
1591  raise ValueError('Table contains too many rows to represent them at one '\
1592  'bar position in parallel. You can Select a Subtable or '\
1593  'specify the parameter rows with a list of row indices '\
1594  '(max 7)')
1595  else:
1596  rows=list(range(len(self.rowsrows)))
1597  else:
1598  if not isinstance(rows,list):
1599  rows=[rows]
1600  if len(rows)>7:
1601  raise ValueError('Too many rows to represent (max 7). Please note, that '\
1602  'data from multiple rows from one column gets '\
1603  'represented at one position in parallel.')
1604 
1605  for r_idx in rows:
1606  row=self.rowsrows[r_idx]
1607  temp=list()
1608  for c in cols:
1609  try:
1610  c_idx=self.GetColIndexGetColIndex(c)
1611  except:
1612  raise ValueError('Cannot find column with name '+str(c))
1613  temp.append(row[c_idx])
1614  data.append(temp)
1615 
1616  if colors==None:
1617  colors=standard_colors[:len(rows)]
1618 
1619  if len(rows)!=len(colors):
1620  raise ValueError("Number of rows and number of colors must be consistent!")
1621 
1622  ind=np.arange(len(data[0]))
1623  single_bar_width=float(width)/len(data)
1624 
1625  fig=plt.figure()
1626  ax=fig.add_subplot(111)
1627  legend_data=[]
1628 
1629  for i in range(len(data)):
1630  legend_data.append(ax.bar(ind+i*single_bar_width+(1-width)/2,data[i],single_bar_width,bottom=bottom,color=colors[i])[0])
1631 
1632  if title!=None:
1633  ax.set_title(title, size='x-large', fontweight='bold')
1634 
1635  if y_title!=None:
1636  nice_y=y_title
1637  else:
1638  nice_y="value"
1639  ax.set_ylabel(nice_y)
1640 
1641  if xlabels:
1642  if len(data[0])!=len(xlabels):
1643  raise ValueError('Number of xlabels is not consistent with number of cols!')
1644  else:
1645  xlabels=cols
1646 
1647  if set_xlabels:
1648  ax.set_xticks(ind+0.5)
1649  ax.set_xticklabels(xlabels, rotation = xlabels_rotation)
1650  else:
1651  ax.set_xticks([])
1652 
1653  if legend == True:
1654  if legend_names==None:
1655  raise ValueError('You must provide legend names! e.g. names for the rows, '\
1656  'that are printed in parallel.')
1657  if len(legend_names)!=len(data):
1658  raise ValueError('length of legend_names must be consistent with number '\
1659  'of plotted rows!')
1660  ax.legend(legend_data, legend_names)
1661 
1662  if save:
1663  plt.savefig(save)
1664 
1665  if show:
1666  plt.show()
1667 
1668  return plt
1669 
1670  def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log',
1671  colormap='jet', show_scalebar=False, scalebar_label=None, clear=True, save=False, show=False):
1672 
1673  """
1674  Create a heatplot of the data in col x vs the data in col y using matplotlib
1675 
1676  :param x: column name with x data
1677  :type x: :class:`str`
1678 
1679  :param y: column name with y data
1680  :type y: :class:`str`
1681 
1682  :param title: title of the plot, will be generated automatically if set to None
1683  :type title: :class:`str`
1684 
1685  :param x_title: label of x-axis, will be generated automatically if set to None
1686  :type title: :class:`str`
1687 
1688  :param y_title: label of y-axis, will be generated automatically if set to None
1689  :type title: :class:`str`
1690 
1691  :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1692  :type x_range: :class:`list` of length two
1693 
1694  :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1695  :type y_range: :class:`list` of length two
1696 
1697  :param binning: type of binning. If set to None, the value of a hexbin will
1698  correspond to the number of datapoints falling into it. If
1699  set to 'log', the value will be the log with base 10 of the above
1700  value (log(i+1)). If an integer is provided, the number of a
1701  hexbin is equal the number of datapoints falling into it divided
1702  by the integer. If a list of values is provided, these values
1703  will be the lower bounds of the bins.
1704 
1705  :param colormap: colormap, that will be used. Value can be every colormap defined
1706  in matplotlib or an own defined colormap. You can either pass a
1707  string with the name of the matplotlib colormap or a colormap
1708  object.
1709 
1710  :param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
1711  :type show_scalebar: :class:`bool`
1712 
1713  :param scalebar_label: Label of the scalebar
1714  :type scalebar_label: :class:`str`
1715 
1716  :param clear: clear old data from plot
1717  :type clear: :class:`bool`
1718 
1719  :param save: filename for saving plot
1720  :type save: :class:`str`
1721 
1722  :param show: directly show plot
1723  :type show: :class:`bool`
1724 
1725  """
1726 
1727  try:
1728  import matplotlib.pyplot as plt
1729  import matplotlib.cm as cm
1730  except:
1731  raise ImportError('PlotHexbin relies on matplotlib, but I could not import it')
1732 
1733  idx=self.GetColIndexGetColIndex(x)
1734  idy=self.GetColIndexGetColIndex(y)
1735  xdata=[]
1736  ydata=[]
1737 
1738  for r in self.rowsrows:
1739  if r[idx]!=None and r[idy]!=None:
1740  xdata.append(r[idx])
1741  ydata.append(r[idy])
1742 
1743  if clear:
1744  plt.clf()
1745 
1746  if x_title!=None:
1747  nice_x=x_title
1748  else:
1749  nice_x=MakeTitle(x)
1750 
1751  if y_title!=None:
1752  nice_y=y_title
1753  else:
1754  nice_y=MakeTitle(y)
1755 
1756  if title==None:
1757  title = '%s vs. %s' % (nice_x, nice_y)
1758 
1759  if IsStringLike(colormap):
1760  colormap=getattr(cm, colormap)
1761 
1762  if x_range and (IsScalar(x_range) or len(x_range)!=2):
1763  raise ValueError('parameter x_range must contain exactly two elements')
1764  if y_range and (IsScalar(y_range) or len(y_range)!=2):
1765  raise ValueError('parameter y_range must contain exactly two elements')
1766 
1767  ext = [min(xdata),max(xdata),min(ydata),max(ydata)]
1768 
1769  if x_range:
1770  plt.xlim((x_range[0], x_range[1]))
1771  ext[0]=x_range[0]
1772  ext[1]=x_range[1]
1773  if y_range:
1774  plt.ylim(y_range[0], y_range[1])
1775  ext[2]=y_range[0]
1776  ext[3]=y_range[1]
1777 
1778 
1779  plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)
1780 
1781  plt.title(title, size='x-large', fontweight='bold',
1782  verticalalignment='bottom')
1783 
1784  plt.xlabel(nice_x)
1785  plt.ylabel(nice_y)
1786 
1787  if show_scalebar:
1788  cb=plt.colorbar()
1789  if scalebar_label:
1790  cb.set_label(scalebar_label)
1791 
1792  if save:
1793  plt.savefig(save)
1794 
1795  if show:
1796  plt.show()
1797 
1798  return plt
1799 
1800  def MaxRow(self, col):
1801  """
1802  Returns the row containing the cell with the maximal value in col. If
1803  several rows have the highest value, only the first one is returned.
1804  ''None'' values are ignored.
1805 
1806  :param col: column name
1807  :type col: :class:`str`
1808 
1809  :returns: row with maximal col value or None if the table is empty
1810  """
1811  val, idx = self._Max_Max(col)
1812  if idx!=None:
1813  return self.rowsrows[idx]
1814 
1815  def Max(self, col):
1816  """
1817  Returns the maximum value in col. If several rows have the highest value,
1818  only the first one is returned. ''None'' values are ignored.
1819 
1820  :param col: column name
1821  :type col: :class:`str`
1822  """
1823  val, idx = self._Max_Max(col)
1824  return val
1825 
1826  def MaxIdx(self, col):
1827  """
1828  Returns the row index of the cell with the maximal value in col. If
1829  several rows have the highest value, only the first one is returned.
1830  ''None'' values are ignored.
1831 
1832  :param col: column name
1833  :type col: :class:`str`
1834  """
1835  val, idx = self._Max_Max(col)
1836  return idx
1837 
1838  def _Min(self, col):
1839  if len(self.rowsrows)==0:
1840  return None, None
1841  idx=self.GetColIndexGetColIndex(col)
1842  col_type = self.col_typescol_types[idx]
1843  if col_type=='int' or col_type=='float':
1844  min_val=float('inf')
1845  elif col_type=='bool':
1846  min_val=True
1847  elif col_type=='string':
1848  min_val=chr(255)
1849  min_idx=None
1850  for i,row in enumerate(self.rowsrows):
1851  if row[idx]!=None and row[idx]<min_val:
1852  min_val=row[idx]
1853  min_idx=i
1854  return min_val, min_idx
1855 
1856  def Min(self, col):
1857  """
1858  Returns the minimal value in col. If several rows have the lowest value,
1859  only the first one is returned. ''None'' values are ignored.
1860 
1861  :param col: column name
1862  :type col: :class:`str`
1863  """
1864  val, idx = self._Min_Min(col)
1865  return val
1866 
1867  def MinRow(self, col):
1868  """
1869  Returns the row containing the cell with the minimal value in col. If
1870  several rows have the lowest value, only the first one is returned.
1871  ''None'' values are ignored.
1872 
1873  :param col: column name
1874  :type col: :class:`str`
1875 
1876  :returns: row with minimal col value or None if the table is empty
1877  """
1878  val, idx = self._Min_Min(col)
1879  if idx!=None:
1880  return self.rowsrows[idx]
1881 
1882  def MinIdx(self, col):
1883  """
1884  Returns the row index of the cell with the minimal value in col. If
1885  several rows have the lowest value, only the first one is returned.
1886  ''None'' values are ignored.
1887 
1888  :param col: column name
1889  :type col: :class:`str`
1890  """
1891  val, idx = self._Min_Min(col)
1892  return idx
1893 
1894  def Sum(self, col):
1895  """
1896  Returns the sum of the given column. Cells with ''None'' are ignored. Returns
1897  0.0, if the column doesn't contain any elements. Col must be of numeric
1898  column type ('float', 'int') or boolean column type.
1899 
1900  :param col: column name
1901  :type col: :class:`str`
1902 
1903  :raises: :class:`TypeError` if column type is ``string``
1904  """
1905  idx = self.GetColIndexGetColIndex(col)
1906  col_type = self.col_typescol_types[idx]
1907  if col_type!='int' and col_type!='float' and col_type!='bool':
1908  raise TypeError("Sum can only be used on numeric column types")
1909  s = 0.0
1910  for r in self.rowsrows:
1911  if r[idx]!=None:
1912  s += r[idx]
1913  return s
1914 
1915  def Mean(self, col):
1916  """
1917  Returns the mean of the given column. Cells with ''None'' are ignored. Returns
1918  None, if the column doesn't contain any elements. Col must be of numeric
1919  ('float', 'int') or boolean column type.
1920 
1921  If column type is *bool*, the function returns the ratio of
1922  number of 'Trues' by total number of elements.
1923 
1924  :param col: column name
1925  :type col: :class:`str`
1926 
1927  :raises: :class:`TypeError` if column type is ``string``
1928  """
1929  idx = self.GetColIndexGetColIndex(col)
1930  col_type = self.col_typescol_types[idx]
1931  if col_type!='int' and col_type!='float' and col_type!='bool':
1932  raise TypeError("Mean can only be used on numeric or bool column types")
1933 
1934  vals=[]
1935  for v in self[col]:
1936  if v!=None:
1937  vals.append(v)
1938  try:
1939  return stutil.Mean(vals)
1940  except:
1941  return None
1942 
1943  def RowMean(self, mean_col_name, cols):
1944  """
1945  Adds a new column of type 'float' with a specified name (*mean_col_name*),
1946  containing the mean of all specified columns for each row.
1947 
1948  Cols are specified by their names and must be of numeric column
1949  type ('float', 'int') or boolean column type. Cells with None are ignored.
1950  Adds ''None'' if the row doesn't contain any values.
1951 
1952  :param mean_col_name: name of new column containing mean values
1953  :type mean_col_name: :class:`str`
1954 
1955  :param cols: name or list of names of columns to include in computation of
1956  mean
1957  :type cols: :class:`str` or :class:`list` of strings
1958 
1959  :raises: :class:`TypeError` if column type of columns in *col* is ``string``
1960 
1961  == Example ==
1962 
1963  Staring with the following table:
1964 
1965  ==== ==== ====
1966  x y u
1967  ==== ==== ====
1968  1 10 100
1969  2 15 None
1970  3 20 400
1971  ==== ==== ====
1972 
1973  the code here adds a column with the name 'mean' to yield the table below:
1974 
1975  .. code-block::python
1976 
1977  tab.RowMean('mean', ['x', 'u'])
1978 
1979 
1980  ==== ==== ==== =====
1981  x y u mean
1982  ==== ==== ==== =====
1983  1 10 100 50.5
1984  2 15 None 2
1985  3 20 400 201.5
1986  ==== ==== ==== =====
1987 
1988  """
1989 
1990  if IsScalar(cols):
1991  cols = [cols]
1992 
1993  cols_idxs = []
1994  for col in cols:
1995  idx = self.GetColIndexGetColIndex(col)
1996  col_type = self.col_typescol_types[idx]
1997  if col_type!='int' and col_type!='float' and col_type!='bool':
1998  raise TypeError("RowMean can only be used on numeric column types")
1999  cols_idxs.append(idx)
2000 
2001  mean_rows = []
2002  for row in self.rowsrows:
2003  vals = []
2004  for idx in cols_idxs:
2005  v = row[idx]
2006  if v!=None:
2007  vals.append(v)
2008  try:
2009  mean = stutil.Mean(vals)
2010  mean_rows.append(mean)
2011  except:
2012  mean_rows.append(None)
2013 
2014  self.AddColAddCol(mean_col_name, 'f', mean_rows)
2015 
2016  def Percentiles(self, col, nths):
2017  """
2018  Returns the percentiles of column *col* given in *nths*.
2019 
2020  The percentiles are calculated as
2021 
2022  .. code-block:: python
2023 
2024  values[min(len(values)-1, int(math.floor(len(values)*nth/100.0)))]
2025 
2026  where values are the sorted values of *col* not equal to ''None''
2027 
2028  :param col: column name
2029  :type col: :class:`str`
2030  :param nths: list of percentiles to be calculated. Each percentile is a
2031  number between 0 and 100.
2032  :type nths: :class:`list` of numbers
2033 
2034  :raises: :class:`TypeError` if column type is ``string``
2035  :returns: List of percentiles in the same order as given in *nths*
2036  """
2037  idx = self.GetColIndexGetColIndex(col)
2038  col_type = self.col_typescol_types[idx]
2039  if col_type!='int' and col_type!='float' and col_type!='bool':
2040  raise TypeError("Median can only be used on numeric column types")
2041 
2042  for nth in nths:
2043  if nth < 0 or nth > 100:
2044  raise ValueError("percentiles must be between 0 and 100")
2045  vals=[]
2046  for v in self[col]:
2047  if v!=None:
2048  vals.append(v)
2049  vals=sorted(vals)
2050  if len(vals)==0:
2051  return [None]*len(nths)
2052  percentiles=[]
2053 
2054  for nth in nths:
2055  # rounding behaviour between Python2 and Python3 changed....
2056  # p=vals[min(len(vals)-1, int(round(len(vals)*nth/100.0+0.5)-1))]
2057  p=vals[min(len(vals)-1, int(math.floor(len(vals)*nth/100.0)))]
2058  percentiles.append(p)
2059  return percentiles
2060 
2061  def Median(self, col):
2062  """
2063  Returns the median of the given column. Cells with ''None'' are ignored. Returns
2064  ''None'', if the column doesn't contain any elements. Col must be of numeric
2065  column type ('float', 'int') or boolean column type.
2066 
2067  :param col: column name
2068  :type col: :class:`str`
2069 
2070  :raises: :class:`TypeError` if column type is ``string``
2071  """
2072  idx = self.GetColIndexGetColIndex(col)
2073  col_type = self.col_typescol_types[idx]
2074  if col_type!='int' and col_type!='float' and col_type!='bool':
2075  raise TypeError("Median can only be used on numeric column types")
2076 
2077  vals=[]
2078  for v in self[col]:
2079  if v!=None:
2080  vals.append(v)
2081  stutil.Median(vals)
2082  try:
2083  return stutil.Median(vals)
2084  except:
2085  return None
2086 
2087  def StdDev(self, col):
2088  """
2089  Returns the standard deviation of the given column. Cells with ''None'' are
2090  ignored. Returns ''None'', if the column doesn't contain any elements. Col must
2091  be of numeric column type ('float', 'int') or boolean column type.
2092 
2093  :param col: column name
2094  :type col: :class:`str`
2095 
2096  :raises: :class:`TypeError` if column type is ``string``
2097  """
2098  idx = self.GetColIndexGetColIndex(col)
2099  col_type = self.col_typescol_types[idx]
2100  if col_type!='int' and col_type!='float' and col_type!='bool':
2101  raise TypeError("StdDev can only be used on numeric column types")
2102 
2103  vals=[]
2104  for v in self[col]:
2105  if v!=None:
2106  vals.append(v)
2107  try:
2108  return stutil.StdDev(vals)
2109  except:
2110  return None
2111 
2112  def Count(self, col, ignore_nan=True):
2113  """
2114  Count the number of cells in column that are not equal to ''None''.
2115 
2116  :param col: column name
2117  :type col: :class:`str`
2118 
2119  :param ignore_nan: ignore all *None* values
2120  :type ignore_nan: :class:`bool`
2121  """
2122  count=0
2123  idx=self.GetColIndexGetColIndex(col)
2124  for r in self.rowsrows:
2125  if ignore_nan:
2126  if r[idx]!=None:
2127  count+=1
2128  else:
2129  count+=1
2130  return count
2131 
2132  def Correl(self, col1, col2):
2133  """
2134  Calculate the Pearson correlation coefficient between *col1* and *col2*, only
2135  taking rows into account where both of the values are not equal to *None*.
2136  If there are not enough data points to calculate a correlation coefficient,
2137  *None* is returned.
2138 
2139  :param col1: column name for first column
2140  :type col1: :class:`str`
2141 
2142  :param col2: column name for second column
2143  :type col2: :class:`str`
2144  """
2145  if IsStringLike(col1) and IsStringLike(col2):
2146  col1 = self.GetColIndexGetColIndex(col1)
2147  col2 = self.GetColIndexGetColIndex(col2)
2148  vals1, vals2=([],[])
2149  for v1, v2 in zip(self[col1], self[col2]):
2150  if v1!=None and v2!=None:
2151  vals1.append(v1)
2152  vals2.append(v2)
2153  try:
2154  return stutil.Correl(vals1, vals2)
2155  except:
2156  return None
2157 
2158  def SpearmanCorrel(self, col1, col2):
2159  """
2160  Calculate the Spearman correlation coefficient between col1 and col2, only
2161  taking rows into account where both of the values are not equal to None. If
2162  there are not enough data points to calculate a correlation coefficient,
2163  None is returned.
2164 
2165  :warning: The function depends on the following module: *scipy.stats.mstats*
2166 
2167  :param col1: column name for first column
2168  :type col1: :class:`str`
2169 
2170  :param col2: column name for second column
2171  :type col2: :class:`str`
2172  """
2173  try:
2174  import scipy.stats.mstats
2175 
2176  if IsStringLike(col1) and IsStringLike(col2):
2177  col1 = self.GetColIndexGetColIndex(col1)
2178  col2 = self.GetColIndexGetColIndex(col2)
2179  vals1, vals2=([],[])
2180  for v1, v2 in zip(self[col1], self[col2]):
2181  if v1!=None and v2!=None:
2182  vals1.append(v1)
2183  vals2.append(v2)
2184  try:
2185  correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
2186  if scipy.isnan(correl):
2187  return None
2188  return correl
2189  except:
2190  return None
2191 
2192  except ImportError:
2193  LogError("Function needs scipy.stats.mstats, but I could not import it.")
2194  raise
2195 
2196 
2197  def Save(self, stream_or_filename, format='ost', sep=','):
2198  """
2199  Save the table to stream or filename. The following three file formats
2200  are supported (for more information on file formats, see :meth:`Load`):
2201 
2202  ============= =======================================
2203  ost ost-specific format (human readable)
2204  csv comma separated values (human readable)
2205  pickle pickled byte stream (binary)
2206  html HTML table
2207  context ConTeXt table
2208  ============= =======================================
2209 
2210  :param stream_or_filename: filename or stream for writing output
2211  :type stream_or_filename: :class:`str` or :class:`file`
2212 
2213  :param format: output format (i.e. *ost*, *csv*, *pickle*)
2214  :type format: :class:`str`
2215 
2216  :raises: :class:`ValueError` if format is unknown
2217  """
2218  format=format.lower()
2219  if format=='ost':
2220  return self._SaveOST_SaveOST(stream_or_filename)
2221  if format=='csv':
2222  return self._SaveCSV_SaveCSV(stream_or_filename, sep=sep)
2223  if format=='pickle':
2224  return self._SavePickle_SavePickle(stream_or_filename)
2225  if format=='html':
2226  return self._SaveHTML_SaveHTML(stream_or_filename)
2227  if format=='context':
2228  return self._SaveContext_SaveContext(stream_or_filename)
2229  raise ValueError('unknown format "%s"' % format)
2230 
2231  def _SavePickle(self, stream):
2232  file_opened=False
2233  if not hasattr(stream, 'write'):
2234  stream=open(stream, 'wb')
2235  file_opened=True
2236  pickle.dump(self, stream, pickle.HIGHEST_PROTOCOL)
2237  if file_opened:
2238  stream.close()
2239 
2240  def _SaveHTML(self, stream_or_filename):
2241  def _escape(s):
2242  return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
2243 
2244  file_opened = False
2245  if not hasattr(stream_or_filename, 'write'):
2246  stream = open(stream_or_filename, 'w')
2247  file_opened = True
2248  else:
2249  stream = stream_or_filename
2250  stream.write('<table>')
2251  stream.write('<tr>')
2252  for col_name in self.col_namescol_names:
2253  stream.write('<th>%s</th>' % _escape(col_name))
2254  stream.write('</tr>')
2255  for row in self.rowsrows:
2256  stream.write('<tr>')
2257  for i, col in enumerate(row):
2258  val = ''
2259  if col != None:
2260  if self.col_typescol_types[i] == 'float':
2261  val = '%.3f' % col
2262  elif self.col_typescol_types[i] == 'int':
2263  val = '%d' % col
2264  elif self.col_typescol_types[i] == 'bool':
2265  val = col and 'true' or 'false'
2266  else:
2267  val = str(col)
2268  stream.write('<td>%s</td>' % _escape(val))
2269  stream.write('</tr>')
2270  stream.write('</table>')
2271  if file_opened:
2272  stream.close()
2273  def _SaveContext(self, stream_or_filename):
2274  file_opened = False
2275  if not hasattr(stream_or_filename, 'write'):
2276  stream = open(stream_or_filename, 'w')
2277  file_opened = True
2278  else:
2279  stream = stream_or_filename
2280  stream.write('\\starttable[')
2281  for col_type in self.col_typescol_types:
2282  if col_type =='string':
2283  stream.write('l|')
2284  elif col_type=='int':
2285  stream.write('r|')
2286  elif col_type =='float':
2287  stream.write('i3r|')
2288  else:
2289  stream.write('l|')
2290  stream.write(']\n\\HL\n')
2291  for col_name in self.col_namescol_names:
2292  stream.write('\\NC \\bf %s' % col_name)
2293  stream.write(' \\AR\\HL\n')
2294  for row in self.rowsrows:
2295  for i, col in enumerate(row):
2296  val = '---'
2297  if col != None:
2298  if self.col_typescol_types[i] == 'float':
2299  val = '%.3f' % col
2300  elif self.col_typescol_types[i] == 'int':
2301  val = '%d' % col
2302  elif self.col_typescol_types[i] == 'bool':
2303  val = col and 'true' or 'false'
2304  else:
2305  val = str(col)
2306  stream.write('\\NC %s' % val)
2307  stream.write(' \\AR\n')
2308  stream.write('\\HL\n')
2309  stream.write('\\stoptable')
2310  if file_opened:
2311  stream.close()
2312 
2313  def _SaveCSV(self, stream, sep):
2314  file_opened=False
2315  if not hasattr(stream, 'write'):
2316  stream=open(stream, 'w')
2317  file_opened=True
2318 
2319  writer=csv.writer(stream, delimiter=sep)
2320  writer.writerow(['%s' % n for n in self.col_namescol_names])
2321  for row in self.rowsrows:
2322  row=list(row)
2323  for i, c in enumerate(row):
2324  if c==None:
2325  row[i]='NA'
2326  writer.writerow(row)
2327  if file_opened:
2328  stream.close()
2329 
2330 
2331  def _SaveOST(self, stream):
2332  file_opened=False
2333  if hasattr(stream, 'write'):
2334  writer=csv.writer(stream, delimiter=' ')
2335  else:
2336  stream=open(stream, 'w')
2337  writer=csv.writer(stream, delimiter=' ')
2338  file_opened=True
2339  if self.commentcomment:
2340  stream.write(''.join(['# %s\n' % l for l in self.commentcomment.split('\n')]))
2341  writer.writerow(['%s[%s]' % t for t in zip(self.col_namescol_names, self.col_typescol_types)])
2342  for row in self.rowsrows:
2343  row=list(row)
2344  for i, c in enumerate(row):
2345  if c==None:
2346  row[i]='NA'
2347  writer.writerow(row)
2348  if file_opened:
2349  stream.close()
2350 
2351  def GetNumpyMatrixAsArray(self, *args):
2352  '''
2353  Returns a numpy array containing the selected columns from the table as
2354  columns as a matrix.
2355 
2356  Only columns of type *int* or *float* are supported. *NA* values in the
2357  table will be converted to *None* values.
2358 
2359  Originally the function used the numpy matrix class but that is going to be
2360  deprecated in the future. Numpy itself suggests replacing numpy matrix by
2361  numpy array.
2362 
2363  :param \\*args: column names to include in numpy array
2364 
2365  :warning: The function depends on *numpy*
2366  '''
2367  try:
2368  import numpy as np
2369 
2370  if len(args)==0:
2371  raise RuntimeError("At least one column must be specified.")
2372 
2373  idxs = []
2374  for arg in args:
2375  idx = self.GetColIndexGetColIndex(arg)
2376  col_type = self.col_typescol_types[idx]
2377  if col_type!='int' and col_type!='float':
2378  raise TypeError("Numpy matrix can only be generated from numeric "+\
2379  "column types")
2380  idxs.append(idx)
2381 
2382  a = np.array([list(self[i]) for i in idxs])
2383  return a.T
2384 
2385  except ImportError:
2386  LogError("Function needs numpy, but I could not import it.")
2387  raise
2388 
2389  def GetNumpyMatrix(self, *args):
2390  '''
2391  *Caution*: Numpy is deprecating the use of the numpy matrix class.
2392 
2393  Returns a numpy matrix containing the selected columns from the table as
2394  columns in the matrix.
2395 
2396  Only columns of type *int* or *float* are supported. *NA* values in the
2397  table will be converted to *None* values.
2398 
2399  :param \\*args: column names to include in numpy matrix
2400 
2401  :warning: The function depends on *numpy*
2402  '''
2403  LogWarning("table.GetNumpyMatrix is deprecated, please use "+
2404  "table.GetNumpyMatrixAsArray instead")
2405  try:
2406  import numpy as np
2407  m = self.GetNumpyMatrixAsArrayGetNumpyMatrixAsArray(*args)
2408  return np.matrix(m)
2409  except ImportError:
2410  LogError("Function needs numpy, but I could not import it.")
2411  raise
2412 
2413  def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
2414 
2415  '''
2416  In place Gaussian smooth of a column in the table with a given standard deviation.
2417  All nan are set to nan_value before smoothing.
2418 
2419  :param col: column name
2420  :type col: :class:`str`
2421 
2422  :param std: standard deviation for gaussian kernel
2423  :type std: `scalar`
2424 
2425  :param na_value: all na (None) values of the speciefied column are set to na_value before smoothing
2426  :type na_value: `scalar`
2427 
2428  :param padding: allows to handle padding behaviour see scipy ndimage.gaussian_filter1d documentation for more information. standard is reflect
2429  :type padding: :class:`str`
2430 
2431  :param c: constant value used for padding if padding mode is constant
2432  :type c: `scalar`
2433 
2434 
2435 
2436  :warning: The function depends on *scipy*
2437  '''
2438 
2439  try:
2440  from scipy import ndimage
2441  import numpy as np
2442  except ImportError:
2443  LogError("I need scipy.ndimage and numpy, but could not import it")
2444  raise
2445 
2446  idx = self.GetColIndexGetColIndex(col)
2447  col_type = self.col_typescol_types[idx]
2448  if col_type!='int' and col_type!='float':
2449  raise TypeError("GaussianSmooth can only be used on numeric column types")
2450 
2451  vals=[]
2452  for v in self[col]:
2453  if v!=None:
2454  vals.append(v)
2455  else:
2456  vals.append(na_value)
2457 
2458 
2459  smoothed_values_ndarray=ndimage.gaussian_filter1d(vals,std, mode=padding, cval=c)
2460 
2461  result=[]
2462 
2463  for v in smoothed_values_ndarray:
2464  result.append(v)
2465 
2466  self[col]=result
2467 
2468 
2469  def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
2470  '''
2471  This returns the optimal prefactor values (i.e. :math:`a, b, c, ...`) for
2472  the following equation
2473 
2474  .. math::
2475  :label: op1
2476 
2477  a*u + b*v + c*w + ... = z
2478 
2479  where :math:`u, v, w` and :math:`z` are vectors. In matrix notation
2480 
2481  .. math::
2482  :label: op2
2483 
2484  A*p = z
2485 
2486  where :math:`A` contains the data from the table :math:`(u,v,w,...)`,
2487  :math:`p` are the prefactors to optimize :math:`(a,b,c,...)` and :math:`z`
2488  is the vector containing the result of equation :eq:`op1`.
2489 
2490  The parameter ref_col equals to :math:`z` in both equations, and \\*args
2491  are columns :math:`u`, :math:`v` and :math:`w` (or :math:`A` in :eq:`op2`).
2492  All columns must be specified by their names.
2493 
2494  **Example:**
2495 
2496  .. code-block:: python
2497 
2498  tab.GetOptimalPrefactors('colC', 'colA', 'colB')
2499 
2500  The function returns a list containing the prefactors
2501  :math:`a, b, c, ...` in the correct order (i.e. same as columns were
2502  specified in \\*args).
2503 
2504  Weighting:
2505  If the kwarg weights="columX" is specified, the equations are weighted by
2506  the values in that column. Each row is multiplied by the weight in that
2507  row, which leads to :eq:`op3`:
2508 
2509  .. math::
2510  :label: op3
2511 
2512  \\textit{weight}*a*u + \\textit{weight}*b*v + \\textit{weight}*c*w + ...
2513  = \\textit{weight}*z
2514 
2515  Weights must be float or int and can have any value. A value of 0 ignores
2516  this equation, a value of 1 means the same as no weight. If all weights are
2517  the same for each row, the same result will be obtained as with no weights.
2518 
2519  **Example:**
2520 
2521  .. code-block:: python
2522 
2523  tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
2524 
2525  '''
2526  try:
2527  import numpy as np
2528 
2529  if len(args)==0:
2530  raise RuntimeError("At least one column must be specified.")
2531 
2532  b = self.GetNumpyMatrixAsArrayGetNumpyMatrixAsArray(ref_col)
2533  a = self.GetNumpyMatrixAsArrayGetNumpyMatrixAsArray(*args)
2534 
2535  if len(kwargs)!=0:
2536  if 'weights' in kwargs:
2537  w = self.GetNumpyMatrixAsArrayGetNumpyMatrixAsArray(kwargs['weights'])
2538  b = np.multiply(b,w)
2539  a = np.multiply(a,w)
2540 
2541  else:
2542  raise RuntimeError("specified unrecognized kwargs, use weights as key")
2543 
2544  k = np.linalg.inv(a.T@a)@a.T@b
2545  return list(k.T.reshape(-1))
2546 
2547  except ImportError:
2548  LogError("Function needs numpy, but I could not import it.")
2549  raise
2550 
2551  def PlotEnrichment(self, score_col, class_col, score_dir='-',
2552  class_dir='-', class_cutoff=2.0,
2553  style='-', title=None, x_title=None, y_title=None,
2554  clear=True, save=None):
2555  '''
2556  Plot an enrichment curve using matplotlib of column *score_col* classified
2557  according to *class_col*.
2558 
2559  For more information about parameters of the enrichment, see
2560  :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.
2561 
2562  :warning: The function depends on *matplotlib*
2563  '''
2564  try:
2565  import matplotlib.pyplot as plt
2566 
2567  enrx, enry = self.ComputeEnrichmentComputeEnrichment(score_col, class_col, score_dir,
2568  class_dir, class_cutoff)
2569 
2570  if not title:
2571  title = 'Enrichment of %s'%score_col
2572 
2573  if not x_title:
2574  x_title = '% database'
2575 
2576  if not y_title:
2577  y_title = '% positives'
2578 
2579  if clear:
2580  plt.clf()
2581 
2582  plt.plot(enrx, enry, style)
2583 
2584  plt.title(title, size='x-large', fontweight='bold')
2585  plt.ylabel(y_title, size='x-large')
2586  plt.xlabel(x_title, size='x-large')
2587 
2588  if save:
2589  plt.savefig(save)
2590 
2591  return plt
2592  except ImportError:
2593  LogError("Function needs matplotlib, but I could not import it.")
2594  raise
2595 
2596  def ComputeEnrichment(self, score_col, class_col, score_dir='-',
2597  class_dir='-', class_cutoff=2.0):
2598  '''
2599  Computes the enrichment of column *score_col* classified according to
2600  *class_col*.
2601 
2602  For this it is necessary, that the datapoints are classified into positive
2603  and negative points. This can be done in two ways:
2604 
2605  - by using one 'bool' type column (*class_col*) which contains *True* for
2606  positives and *False* for negatives
2607 
2608  - by specifying a classification column (*class_col*), a cutoff value
2609  (*class_cutoff*) and the classification columns direction (*class_dir*).
2610  This will generate the classification on the fly
2611 
2612  * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
2613  * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives
2614 
2615  During the calculation, the table will be sorted according to *score_dir*,
2616  where a '-' values means smallest values first and therefore, the smaller
2617  the value, the better.
2618 
2619  :warning: If either the value of *class_col* or *score_col* is *None*, the
2620  data in this row is ignored.
2621  '''
2622 
2623  ALLOWED_DIR = ['+','-']
2624 
2625  score_idx = self.GetColIndexGetColIndex(score_col)
2626  score_type = self.col_typescol_types[score_idx]
2627  if score_type!='int' and score_type!='float':
2628  raise TypeError("Score column must be numeric type")
2629 
2630  class_idx = self.GetColIndexGetColIndex(class_col)
2631  class_type = self.col_typescol_types[class_idx]
2632  if class_type!='int' and class_type!='float' and class_type!='bool':
2633  raise TypeError("Classifier column must be numeric or bool type")
2634 
2635  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2636  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2637 
2638  self.SortSort(score_col, score_dir)
2639 
2640  x = [0]
2641  y = [0]
2642  enr = 0
2643  old_score_val = None
2644  i = 0
2645 
2646  for row in self.rowsrows:
2647  class_val = row[class_idx]
2648  score_val = row[score_idx]
2649  if class_val==None or score_val==None:
2650  continue
2651  if class_val!=None:
2652  if old_score_val==None:
2653  old_score_val = score_val
2654  if score_val!=old_score_val:
2655  x.append(i)
2656  y.append(enr)
2657  old_score_val = score_val
2658  i+=1
2659  if class_type=='bool':
2660  if class_val==True:
2661  enr += 1
2662  else:
2663  if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2664  enr += 1
2665  x.append(i)
2666  y.append(enr)
2667 
2668  # if no false positives or false negatives values are found return None
2669  if x[-1]==0 or y[-1]==0:
2670  return None
2671 
2672  x = [float(v)/x[-1] for v in x]
2673  y = [float(v)/y[-1] for v in y]
2674  return x,y
2675 
2676  def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
2677  class_dir='-', class_cutoff=2.0):
2678  '''
2679  Computes the area under the curve of the enrichment using the trapezoidal
2680  rule.
2681 
2682  For more information about parameters of the enrichment, see
2683  :meth:`ComputeEnrichment`.
2684 
2685  :warning: The function depends on *numpy*
2686  '''
2687  try:
2688  import numpy as np
2689 
2690  enr = self.ComputeEnrichmentComputeEnrichment(score_col, class_col, score_dir,
2691  class_dir, class_cutoff)
2692 
2693  if enr==None:
2694  return None
2695  return np.trapz(enr[1], enr[0])
2696  except ImportError:
2697  LogError("Function needs numpy, but I could not import it.")
2698  raise
2699 
2700  def ComputeROC(self, score_col, class_col, score_dir='-',
2701  class_dir='-', class_cutoff=2.0):
2702  '''
2703  Computes the receiver operating characteristics (ROC) of column *score_col*
2704  classified according to *class_col*.
2705 
2706  For this it is necessary, that the datapoints are classified into positive
2707  and negative points. This can be done in two ways:
2708 
2709  - by using one 'bool' column (*class_col*) which contains True for positives
2710  and False for negatives
2711  - by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
2712  and the classification columns direction (*class_dir*). This will generate
2713  the classification on the fly
2714 
2715  - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
2716  - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives
2717 
2718  During the calculation, the table will be sorted according to *score_dir*,
2719  where a '-' values means smallest values first and therefore, the smaller
2720  the value, the better.
2721 
2722  If *class_col* does not contain any positives (i.e. value is True (if column
2723  is of type bool) or evaluated to True (if column is of type int or float
2724  (depending on *class_dir* and *class_cutoff*))) the ROC is not defined and
2725  the function will return *None*.
2726 
2727  :warning: If either the value of *class_col* or *score_col* is *None*, the
2728  data in this row is ignored.
2729  '''
2730 
2731  ALLOWED_DIR = ['+','-']
2732 
2733  score_idx = self.GetColIndexGetColIndex(score_col)
2734  score_type = self.col_typescol_types[score_idx]
2735  if score_type!='int' and score_type!='float':
2736  raise TypeError("Score column must be numeric type")
2737 
2738  class_idx = self.GetColIndexGetColIndex(class_col)
2739  class_type = self.col_typescol_types[class_idx]
2740  if class_type!='int' and class_type!='float' and class_type!='bool':
2741  raise TypeError("Classifier column must be numeric or bool type")
2742 
2743  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2744  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2745 
2746  self.SortSort(score_col, score_dir)
2747 
2748  x = [0]
2749  y = [0]
2750  tp = 0
2751  fp = 0
2752  old_score_val = None
2753 
2754  for i,row in enumerate(self.rowsrows):
2755  class_val = row[class_idx]
2756  score_val = row[score_idx]
2757  if class_val==None or score_val==None:
2758  continue
2759  if class_val!=None:
2760  if old_score_val==None:
2761  old_score_val = score_val
2762  if score_val!=old_score_val:
2763  x.append(fp)
2764  y.append(tp)
2765  old_score_val = score_val
2766  if class_type=='bool':
2767  if class_val==True:
2768  tp += 1
2769  else:
2770  fp += 1
2771  else:
2772  if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2773  tp += 1
2774  else:
2775  fp += 1
2776  x.append(fp)
2777  y.append(tp)
2778 
2779  # if no false positives or false negatives values are found return None
2780  if x[-1]==0 or y[-1]==0:
2781  return None
2782 
2783  x = [float(v)/x[-1] for v in x]
2784  y = [float(v)/y[-1] for v in y]
2785  return x,y
2786 
2787  def ComputeROCAUC(self, score_col, class_col, score_dir='-',
2788  class_dir='-', class_cutoff=2.0):
2789  '''
2790  Computes the area under the curve of the receiver operating characteristics
2791  using the trapezoidal rule.
2792 
2793  For more information about parameters of the ROC, see
2794  :meth:`ComputeROC`.
2795 
2796  :warning: The function depends on *numpy*
2797  '''
2798  try:
2799  import numpy as np
2800 
2801  roc = self.ComputeROCComputeROC(score_col, class_col, score_dir,
2802  class_dir, class_cutoff)
2803 
2804  if not roc:
2805  return None
2806  return np.trapz(roc[1], roc[0])
2807  except ImportError:
2808  LogError("Function needs numpy, but I could not import it.")
2809  raise
2810 
2811  def ComputeLogROCAUC(self, score_col, class_col, score_dir='-',
2812  class_dir='-', class_cutoff=2.0):
2813  '''
2814  Computes the area under the curve of the log receiver operating
2815  characteristics (logROC) where the x-axis is semilogarithmic
2816  using the trapezoidal rule.
2817 
2818  The logROC is computed with a lambda of 0.001 according to
2819  Rapid Context-Dependent Ligand Desolvation in Molecular Docking
2820  Mysinger M. and Shoichet B., Journal of Chemical Information and Modeling
2821  2010 50 (9), 1561-1573
2822 
2823  For more information about parameters of the ROC, see
2824  :meth:`ComputeROC`.
2825 
2826  :warning: The function depends on *numpy*
2827  '''
2828  try:
2829  import numpy as np
2830 
2831  roc = self.ComputeROCComputeROC(score_col, class_col, score_dir,
2832  class_dir, class_cutoff)
2833 
2834  if not roc:
2835  return None
2836 
2837  rocxt, rocyt = roc
2838  rocx=[]
2839  rocy=[]
2840 
2841  # define lambda
2842  l=0.001
2843 
2844  # remove all duplicate x-values
2845  rocxt = [x if x>0 else l for x in rocxt]
2846  for i in range(len(rocxt)-1):
2847  if rocxt[i]==rocxt[i+1]:
2848  continue
2849  rocx.append(rocxt[i])
2850  rocy.append(rocyt[i])
2851  rocx.append(1.0)
2852  rocy.append(1.0)
2853 
2854  # compute logauc
2855  value = 0
2856  for i in range(len(rocx)-1):
2857  x = rocx[i]
2858  if rocx[i]==rocx[i+1]:
2859  continue
2860  b = rocy[i+1]-rocx[i+1]*((rocy[i+1]-rocy[i])/(rocx[i+1]-rocx[i]))
2861  value += ((rocy[i+1]-rocy[i])/math.log(10))+b*(math.log10(rocx[i+1])-math.log10(rocx[i]))
2862  return value/math.log10(1.0/l)
2863 
2864  except ImportError:
2865  LogError("Function needs numpy, but I could not import it.")
2866  raise
2867 
2868  def PlotROC(self, score_col, class_col, score_dir='-',
2869  class_dir='-', class_cutoff=2.0,
2870  style='-', title=None, x_title=None, y_title=None,
2871  clear=True, save=None):
2872  '''
2873  Plot an ROC curve using matplotlib.
2874 
2875  For more information about parameters of the ROC, see
2876  :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2877 
2878  :warning: The function depends on *matplotlib*
2879  '''
2880 
2881  try:
2882  import matplotlib.pyplot as plt
2883 
2884  roc = self.ComputeROCComputeROC(score_col, class_col, score_dir,
2885  class_dir, class_cutoff)
2886 
2887  if not roc:
2888  return None
2889 
2890  enrx, enry = roc
2891 
2892  if not title:
2893  title = 'ROC of %s'%score_col
2894 
2895  if not x_title:
2896  x_title = 'false positive rate'
2897 
2898  if not y_title:
2899  y_title = 'true positive rate'
2900 
2901  if clear:
2902  plt.clf()
2903 
2904  plt.plot(enrx, enry, style)
2905 
2906  plt.title(title, size='x-large', fontweight='bold')
2907  plt.ylabel(y_title, size='x-large')
2908  plt.xlabel(x_title, size='x-large')
2909 
2910  if save:
2911  plt.savefig(save)
2912 
2913  return plt
2914  except ImportError:
2915  LogError("Function needs matplotlib, but I could not import it.")
2916  raise
2917 
2918  def PlotLogROC(self, score_col, class_col, score_dir='-',
2919  class_dir='-', class_cutoff=2.0,
2920  style='-', title=None, x_title=None, y_title=None,
2921  clear=True, save=None):
2922  '''
2923  Plot an logROC curve where the x-axis is semilogarithmic using matplotlib
2924 
2925  For more information about parameters of the ROC, see
2926  :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2927 
2928  :warning: The function depends on *matplotlib*
2929  '''
2930 
2931  try:
2932  import matplotlib.pyplot as plt
2933 
2934  roc = self.ComputeROCComputeROC(score_col, class_col, score_dir,
2935  class_dir, class_cutoff)
2936 
2937  if not roc:
2938  return None
2939 
2940  rocx, rocy = roc
2941 
2942  if not title:
2943  title = 'logROC of %s'%score_col
2944 
2945  if not x_title:
2946  x_title = 'false positive rate'
2947 
2948  if not y_title:
2949  y_title = 'true positive rate'
2950 
2951  if clear:
2952  plt.clf()
2953 
2954  rocx = [x if x>0 else 0.001 for x in rocx]
2955 
2956 
2957  plt.plot(rocx, rocy, style)
2958 
2959  plt.title(title, size='x-large', fontweight='bold')
2960  plt.ylabel(y_title, size='x-large')
2961  plt.xlabel(x_title, size='x-large')
2962  try:
2963  plt.xscale('log', basex=10)
2964  except:
2965  plt.xscale('log', base=10) # breaking change in matplotlib 3.5
2966  plt.xlim(0.001, 1.0)
2967 
2968  if save:
2969  plt.savefig(save)
2970 
2971  return plt
2972  except ImportError:
2973  LogError("Function needs matplotlib, but I could not import it.")
2974  raise
2975 
2976  def ComputeMCC(self, score_col, class_col, score_dir='-',
2977  class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
2978  '''
2979  Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
2980  with the points classified into true positives, false positives, true
2981  negatives and false negatives according to a specified classification
2982  column (*class_col*).
2983 
2984  The datapoints in *score_col* and *class_col* are classified into
2985  positive and negative points. This can be done in two ways:
2986 
2987  - by using 'bool' columns which contains True for positives and False
2988  for negatives
2989 
2990  - by using 'float' or 'int' columns and specifying a cutoff value and the
2991  columns direction. This will generate the classification on the fly
2992 
2993  * if ``class_dir``/``score_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
2994  * if ``class_dir``/``score_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
2995 
2996  The two possibilities can be used together, i.e. 'bool' type for one column
2997  and 'float'/'int' type and cutoff/direction for the other column.
2998  '''
2999  ALLOWED_DIR = ['+','-']
3000 
3001  score_idx = self.GetColIndexGetColIndex(score_col)
3002  score_type = self.col_typescol_types[score_idx]
3003  if score_type!='int' and score_type!='float' and score_type!='bool':
3004  raise TypeError("Score column must be numeric or bool type")
3005 
3006  class_idx = self.GetColIndexGetColIndex(class_col)
3007  class_type = self.col_typescol_types[class_idx]
3008  if class_type!='int' and class_type!='float' and class_type!='bool':
3009  raise TypeError("Classifier column must be numeric or bool type")
3010 
3011  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
3012  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
3013 
3014  tp = 0
3015  fp = 0
3016  fn = 0
3017  tn = 0
3018 
3019  for i,row in enumerate(self.rowsrows):
3020  class_val = row[class_idx]
3021  score_val = row[score_idx]
3022  if class_val!=None:
3023  if (class_type=='bool' and class_val==True) or (class_type!='bool' and ((class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff))):
3024  if (score_type=='bool' and score_val==True) or (score_type!='bool' and ((score_dir=='-' and score_val<=score_cutoff) or (score_dir=='+' and score_val>=score_cutoff))):
3025  tp += 1
3026  else:
3027  fn += 1
3028  else:
3029  if (score_type=='bool' and score_val==False) or (score_type!='bool' and ((score_dir=='-' and score_val>score_cutoff) or (score_dir=='+' and score_val<score_cutoff))):
3030  tn += 1
3031  else:
3032  fp += 1
3033 
3034  mcc = None
3035  msg = None
3036  if (tp+fn)==0:
3037  msg = 'factor (tp + fn) is zero'
3038  elif (tp+fp)==0:
3039  msg = 'factor (tp + fp) is zero'
3040  elif (tn+fn)==0:
3041  msg = 'factor (tn + fn) is zero'
3042  elif (tn+fp)==0:
3043  msg = 'factor (tn + fp) is zero'
3044 
3045  if msg:
3046  LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
3047  else:
3048  mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
3049  return mcc
3050 
3051 
3052  def IsEmpty(self, col_name=None, ignore_nan=True):
3053  '''
3054  Checks if a table is empty.
3055 
3056  If no column name is specified, the whole table is checked for being empty,
3057  whereas if a column name is specified, only this column is checked.
3058 
3059  By default, all NAN (or None) values are ignored, and thus, a table
3060  containing only NAN values is considered as empty. By specifying the
3061  option ignore_nan=False, NAN values are counted as 'normal' values.
3062  '''
3063 
3064  # table with no columns and no rows
3065  if len(self.col_namescol_names)==0:
3066  if col_name:
3067  raise ValueError('Table has no column named "%s"' % col_name)
3068  return True
3069 
3070  # column name specified
3071  if col_name:
3072  if self.CountCount(col_name, ignore_nan=ignore_nan)==0:
3073  return True
3074  else:
3075  return False
3076 
3077  # no column name specified -> test whole table
3078  else:
3079  for row in self.rowsrows:
3080  for cell in row:
3081  if ignore_nan:
3082  if cell!=None:
3083  return False
3084  else:
3085  return False
3086  return True
3087 
3088 
3089  def Extend(self, tab, overwrite=None):
3090  """
3091  Append each row of *tab* to the current table. The data is appended based
3092  on the column names, thus the order of the table columns is *not* relevant,
3093  only the header names.
3094 
3095  If there is a column in *tab* that is not present in the current table,
3096  it is added to the current table and filled with *None* for all the rows
3097  present in the current table.
3098 
3099  If the type of any column in *tab* is not the same as in the current table
3100  a *TypeError* is raised.
3101 
3102  If *overwrite* is not None and set to an existing column name, the specified
3103  column in the table is searched for the first occurrence of a value matching
3104  the value of the column with the same name in the dictionary. If a matching
3105  value is found, the row is overwritten with the dictionary. If no matching
3106  row is found, a new row is appended to the table.
3107  """
3108  # add column to current table if it doesn't exist
3109  for name,typ in zip(tab.col_names, tab.col_types):
3110  if not name in self.col_namescol_names:
3111  self.AddColAddCol(name, typ)
3112 
3113  # check that column types are the same in current and new table
3114  for name in self.col_namescol_names:
3115  if name in tab.col_names:
3116  curr_type = self.col_typescol_types[self.GetColIndexGetColIndex(name)]
3117  new_type = tab.col_types[tab.GetColIndex(name)]
3118  if curr_type!=new_type:
3119  raise TypeError('cannot extend table, column %s in new '%name +\
3120  'table different type (%s) than in '%new_type +\
3121  'current table (%s)'%curr_type)
3122 
3123  num_rows = len(tab.rows)
3124  for i in range(0,num_rows):
3125  row = tab.rows[i]
3126  data = dict(list(zip(tab.col_names,row)))
3127  self.AddRowAddRow(data, overwrite)
3128 
3129 
3130 def Merge(table1, table2, by, only_matching=False):
3131  """
3132  Returns a new table containing the data from both tables. The rows are
3133  combined based on the common values in the column(s) by. The option 'by' can
3134  be a list of column names. When this is the case, merging is based on
3135  multiple columns.
3136  For example, the two tables below
3137 
3138  ==== ====
3139  x y
3140  ==== ====
3141  1 10
3142  2 15
3143  3 20
3144  ==== ====
3145 
3146  ==== ====
3147  x u
3148  ==== ====
3149  1 100
3150  3 200
3151  4 400
3152  ==== ====
3153 
3154  when merged by column x, produce the following output:
3155 
3156  ===== ===== =====
3157  x y u
3158  ===== ===== =====
3159  1 10 100
3160  2 15 None
3161  3 20 200
3162  4 None 400
3163  ===== ===== =====
3164 
3165 
3166  """
3167  def _key(row, indices):
3168  return tuple([row[i] for i in indices])
3169  def _keep(indices, cn, ct, ni):
3170  ncn, nct, nni=([],[],[])
3171  for i in range(len(cn)):
3172  if i not in indices:
3173  ncn.append(cn[i])
3174  nct.append(ct[i])
3175  nni.append(ni[i])
3176  return ncn, nct, nni
3177  col_names=list(table2.col_names)
3178  col_types=list(table2.col_types)
3179  new_index=[i for i in range(len(col_names))]
3180  if isinstance(by, str):
3181  common2_indices=[col_names.index(by)]
3182  else:
3183  common2_indices=[col_names.index(b) for b in by]
3184  col_names, col_types, new_index=_keep(common2_indices, col_names,
3185  col_types, new_index)
3186 
3187  for i, name in enumerate(col_names):
3188  try_name=name
3189  counter=1
3190  while try_name in table1.col_names:
3191  counter+=1
3192  try_name='%s_%d' % (name, counter)
3193  col_names[i]=try_name
3194  common1={}
3195  if isinstance(by, str):
3196  common1_indices=[table1.col_names.index(by)]
3197  else:
3198  common1_indices=[table1.col_names.index(b) for b in by]
3199  for row in table1.rows:
3200  key=_key(row, common1_indices)
3201  if key in common1:
3202  raise ValueError('duplicate key "%s in first table"' % (str(key)))
3203  common1[key]=row
3204  common2={}
3205  for row in table2.rows:
3206  key=_key(row, common2_indices)
3207  if key in common2:
3208  raise ValueError('duplicate key "%s" in second table' % (str(key)))
3209  common2[key]=row
3210  new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)
3211  for k, v in common1.items():
3212  row=v+[None for i in range(len(table2.col_names)-len(common2_indices))]
3213  matched=False
3214  if k in common2:
3215  matched=True
3216  row2=common2[k]
3217  for i, index in enumerate(new_index):
3218  row[len(table1.col_names)+i]=row2[index]
3219  if only_matching and not matched:
3220  continue
3221  new_tab.AddRow(row)
3222  if only_matching:
3223  return new_tab
3224  for k, v in common2.items():
3225  if not k in common1:
3226  v2=[v[i] for i in new_index]
3227  row=[None for i in range(len(table1.col_names))]+v2
3228  for common1_index, common2_index in zip(common1_indices, common2_indices):
3229  row[common1_index]=v[common2_index]
3230  new_tab.AddRow(row)
3231  return new_tab
3232 
3233 
3234 # LocalWords: numpy Numpy
def __add__(self, rhs)
Definition: table.py:85
def __iter__(self)
Definition: table.py:79
def __init__(self, op, lhs, rhs)
Definition: table.py:71
def __div__(self, rhs)
Definition: table.py:94
def __mul__(self, rhs)
Definition: table.py:91
def __sub__(self, rhs)
Definition: table.py:88
def __init__(self, table, col)
Definition: table.py:98
def __add__(self, rhs)
Definition: table.py:118
def __iter__(self)
Definition: table.py:105
def __getitem__(self, index)
Definition: table.py:112
def __div__(self, rhs)
Definition: table.py:127
def __setitem__(self, index, value)
Definition: table.py:115
def __len__(self)
Definition: table.py:109
def __mul__(self, rhs)
Definition: table.py:124
def __sub__(self, rhs)
Definition: table.py:121
def Percentiles(self, col, nths)
Definition: table.py:2016
def __getitem__(self, k)
Definition: table.py:407
def _SaveOST(self, stream)
Definition: table.py:2331
def ComputeEnrichment(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0)
Definition: table.py:2597
def _Min(self, col)
Definition: table.py:1838
def Sort(self, by, order='+')
Definition: table.py:1050
def GetColIndex(self, col)
Definition: table.py:369
def _ParseColTypes(types, exp_num=None)
Definition: table.py:247
def HasCol(self, col)
Definition: table.py:401
def SpearmanCorrel(self, col1, col2)
Definition: table.py:2158
def __str__(self)
Definition: table.py:479
def GetNumpyMatrixAsArray(self, *args)
Definition: table.py:2351
def Extend(self, tab, overwrite=None)
Definition: table.py:3089
def Min(self, col)
Definition: table.py:1856
def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True, xlabels_rotation='horizontal', y_title=None, title=None, colors=None, width=0.8, bottom=0, legend=False, legend_names=None, show=False, save=False)
Definition: table.py:1509
def __init__(self, col_names=[], col_types=None, **kwargs)
Definition: table.py:221
def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log', colormap='jet', show_scalebar=False, scalebar_label=None, clear=True, save=False, show=False)
Definition: table.py:1671
def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0)
Definition: table.py:2413
def __getattr__(self, col_name)
Definition: table.py:237
def _SaveHTML(self, stream_or_filename)
Definition: table.py:2240
def RemoveCol(self, col)
Definition: table.py:688
def __setitem__(self, k, value)
Definition: table.py:413
def Median(self, col)
Definition: table.py:2061
def PlotROC(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0, style='-', title=None, x_title=None, y_title=None, clear=True, save=None)
Definition: table.py:2871
def MaxIdx(self, col)
Definition: table.py:1826
def Filter(self, *args, **kwargs)
Definition: table.py:791
def GetColNames(self)
Definition: table.py:379
def AddCol(self, col_name, col_type, data=None)
Definition: table.py:701
def Mean(self, col)
Definition: table.py:1915
def Count(self, col, ignore_nan=True)
Definition: table.py:2112
def MaxRow(self, col)
Definition: table.py:1800
def MinRow(self, col)
Definition: table.py:1867
def Load(stream_or_filename, format='auto', sep=',')
Definition: table.py:986
def GetName(self)
Definition: table.py:323
def AddRow(self, data, overwrite=None)
Definition: table.py:589
def ToString(self, float_format='%.3f', int_format='%d', rows=None)
Definition: table.py:422
def _SaveContext(self, stream_or_filename)
Definition: table.py:2273
def PlotLogROC(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0, style='-', title=None, x_title=None, y_title=None, clear=True, save=None)
Definition: table.py:2921
def Correl(self, col1, col2)
Definition: table.py:2132
def _Max(self, col)
Definition: table.py:1489
def PlotEnrichment(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0, style='-', title=None, x_title=None, y_title=None, clear=True, save=None)
Definition: table.py:2554
def SetName(self, name)
Definition: table.py:314
def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0)
Definition: table.py:2677
def _AddRowsFromDict(self, d, overwrite=None)
Definition: table.py:507
def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None, z_title=None, x_range=None, y_range=None, z_range=None, color=None, plot_if=None, legend=None, num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False, labels=None, max_num_labels=None, title=None, clear=True, save=False, **kwargs)
Definition: table.py:1126
def Save(self, stream_or_filename, format='ost', sep=',')
Definition: table.py:2197
def GetUnique(self, col, ignore_nan=True)
Definition: table.py:1080
def ComputeLogROCAUC(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0)
Definition: table.py:2812
def _SavePickle(self, stream)
Definition: table.py:2231
def IsEmpty(self, col_name=None, ignore_nan=True)
Definition: table.py:3052
def RenameCol(self, old_name, new_name)
Definition: table.py:329
def Max(self, col)
Definition: table.py:1815
def Select(self, query)
Definition: table.py:827
def ComputeROC(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0)
Definition: table.py:2701
def _Coerce(self, value, ty)
Definition: table.py:342
def GetNumpyMatrix(self, *args)
Definition: table.py:2389
def Zip(self, *args)
Definition: table.py:1101
def _SaveCSV(self, stream, sep)
Definition: table.py:2313
def ComputeROCAUC(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0)
Definition: table.py:2788
def StdDev(self, col)
Definition: table.py:2087
def MinIdx(self, col)
Definition: table.py:1882
def GetOptimalPrefactors(self, ref_col, *args, **kwargs)
Definition: table.py:2469
def PairedTTest(self, col_a, col_b)
Definition: table.py:565
def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False, histtype='stepfilled', align='mid', x_title=None, y_title=None, title=None, clear=True, save=False, color=None, y_range=None)
Definition: table.py:1380
def SearchColNames(self, regex)
Definition: table.py:385
def Stats(self, col)
Definition: table.py:482
def Sum(self, col)
Definition: table.py:1894
def ComputeMCC(self, score_col, class_col, score_dir='-', class_dir='-', score_cutoff=2.0, class_cutoff=2.0)
Definition: table.py:2977
def RowMean(self, mean_col_name, cols)
Definition: table.py:1943
def __str__(self)
Definition: table.py:144
def __setitem__(self, col_name, val)
Definition: table.py:154
def __getattr__(self, col_name)
Definition: table.py:160
def __len__(self)
Definition: table.py:151
def __setattr__(self, col_name, val)
Definition: table.py:165
def __init__(self, row_data, tab)
Definition: table.py:135
def __getitem__(self, col_name)
Definition: table.py:139
def MakeTitle(col_name)
Definition: table.py:11
def GuessColumnType(iterator)
Definition: table.py:38
def IsStringLike(value)
Definition: table.py:14
def IsScalar(value)
Definition: table.py:27
def Merge(table1, table2, by, only_matching=False)
Definition: table.py:3130
def IsNullString(value)
Definition: table.py:23