src/third_party/chromite/lib/table.py

   1 #!/usr/bin/python
   2 # Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
   3 # Use of this source code is governed by a BSD-style license that can be
   4 # found in the LICENSE file.
   5
   6 """Support generic spreadsheet-like table information."""
   7
   8 import inspect
   9 import re
  10 import sys
  11
  12 from chromite.lib import cros_build_lib
  13
  14
  15 class Table(object):
  16   """Class to represent column headers and rows of data."""
  17
  18   __slots__ = ['_column_set',  # Set of column headers (for faster lookup)
  19                '_columns',     # List of column headers in order
  20                '_name',        # Name to associate with table
  21                '_rows',        # List of row dicts
  22                ]
  23
  24   EMPTY_CELL = ''
  25
  26   CSV_BQ = '__BEGINQUOTE__'
  27   CSV_EQ = '__ENDQUOTE__'
  28
  29   @staticmethod
  30   def _SplitCSVLine(line):
  31     '''Split a single CSV line into separate values.
  32
  33     Behavior illustrated by the following examples, with all but
  34     the last example taken from Google Docs spreadsheet behavior:
  35     'a,b,c,d':           ==> ['a', 'b', 'c', 'd'],
  36     'a, b, c, d':        ==> ['a', ' b', ' c', ' d'],
  37     'a,b,c,':            ==> ['a', 'b', 'c', ''],
  38     'a,"b c",d':         ==> ['a', 'b c', 'd'],
  39     'a,"b, c",d':        ==> ['a', 'b, c', 'd'],
  40     'a,"b, c, d",e':     ==> ['a', 'b, c, d', 'e'],
  41     'a,"""b, c""",d':    ==> ['a', '"b, c"', 'd'],
  42     'a,"""b, c"", d",e': ==> ['a', '"b, c", d', 'e'],
  43     'a,b\,c,d':          ==> ['a', 'b,c', 'd'],
  44
  45     Return a list of values.
  46     '''
  47     # Split on commas, handling two special cases:
  48     # 1) Escaped commas are not separators.
  49     # 2) A quoted value can have non-separator commas in it.  Quotes
  50     #    should be removed.
  51     vals = []
  52     for val in re.split(r'(?<!\\),', line):
  53       if not val:
  54         vals.append(val)
  55         continue
  56
  57       # Handle regular double quotes at beginning/end specially.
  58       if val[0] == '"':
  59         val = Table.CSV_BQ + val[1:]
  60       if val[-1] == '"' and (val[-2] != '"' or val[-3] == '"'):
  61         val = val[0:-1] + Table.CSV_EQ
  62
  63       # Remove escape characters now.
  64       val = val.replace(r'\,', ',')  # \ before ,
  65       val = val.replace('""', '"')   # " before " (Google Spreadsheet syntax)
  66
  67       prevval = vals[-1] if vals else None
  68
  69       # If previous value started with quote and ended without one, then
  70       # the current value is just a continuation of the previous value.
  71       if prevval and prevval.startswith(Table.CSV_BQ):
  72         val = prevval + "," + val
  73         # Once entire value is read, strip surrounding quotes
  74         if val.endswith(Table.CSV_EQ):
  75           vals[-1] = val[len(Table.CSV_BQ):-len(Table.CSV_EQ)]
  76         else:
  77           vals[-1] = val
  78       elif val.endswith(Table.CSV_EQ):
  79         vals.append(val[len(Table.CSV_BQ):-len(Table.CSV_EQ)])
  80       else:
  81         vals.append(val)
  82
  83     # If an unpaired Table.CSV_BQ is still in vals, then replace with ".
  84     vals = [val.replace(Table.CSV_BQ, '"') for val in vals]
  85
  86     return vals
  87
  88   @staticmethod
  89   def LoadFromCSV(csv_file, name=None):
  90     """Create a new Table object by loading contents of |csv_file|."""
  91     if type(csv_file) is file:
  92       file_handle = csv_file
  93     else:
  94       file_handle = open(csv_file, 'r')
  95     table = None
  96
  97     for line in file_handle:
  98       if line[-1] == '\n':
  99         line = line[0:-1]
 100
 101       vals = Table._SplitCSVLine(line)
 102
 103       if not table:
 104         # Read headers
 105         table = Table(vals, name=name)
 106
 107       else:
 108         # Read data row
 109         table.AppendRow(vals)
 110
 111     return table
 112
 113   def __init__(self, columns, name=None):
 114     self._columns = columns
 115     self._column_set = set(columns)
 116     self._rows = []
 117     self._name = name
 118
 119   def __str__(self):
 120     """Return a table-like string representation of this table."""
 121     cols = ['%10s' % col for col in self._columns]
 122     text = 'Columns: %s\n' % ', '.join(cols)
 123
 124     ix = 0
 125     for row in self._rows:
 126       vals = ['%10s' % row[col] for col in self._columns]
 127       text += 'Row %3d: %s\n' % (ix, ', '.join(vals))
 128       ix += 1
 129     return text
 130
 131   def __nonzero__(self):
 132     """Define boolean equivalent for this table."""
 133     return bool(self._columns)
 134
 135   def __len__(self):
 136     """Length of table equals the number of rows."""
 137     return self.GetNumRows()
 138
 139   def __eq__(self, other):
 140     """Return true if two tables are equal."""
 141     # pylint: disable=W0212
 142     return self._columns == other._columns and self._rows == other._rows
 143
 144   def __ne__(self, other):
 145     """Return true if two tables are not equal."""
 146     return not self == other
 147
 148   def __getitem__(self, index):
 149     """Access one or more rows by index or slice."""
 150     return self.GetRowByIndex(index)
 151
 152   def __delitem__(self, index):
 153     """Delete one or more rows by index or slice."""
 154     self.RemoveRowByIndex(index)
 155
 156   def __iter__(self):
 157     """Declare that this class supports iteration (over rows)."""
 158     return self._rows.__iter__()
 159
 160   def GetName(self):
 161     """Return name associated with table, None if not available."""
 162     return self._name
 163
 164   def SetName(self, name):
 165     """Set the name associated with table."""
 166     self._name = name
 167
 168   def Clear(self):
 169     """Remove all row data."""
 170     self._rows = []
 171
 172   def GetNumRows(self):
 173     """Return the number of rows in the table."""
 174     return len(self._rows)
 175
 176   def GetNumColumns(self):
 177     """Return the number of columns in the table."""
 178     return len(self._columns)
 179
 180   def GetColumns(self):
 181     """Return list of column names in order."""
 182     return list(self._columns)
 183
 184   def GetRowByIndex(self, index):
 185     """Access one or more rows by index or slice.
 186
 187     If more than one row is returned they will be contained in a list.
 188     """
 189     return self._rows[index]
 190
 191   def _GenRowFilter(self, id_values):
 192     """Return a method that returns true for rows matching |id_values|."""
 193     def Grep(row):
 194       """Filter function for rows with id_values."""
 195       for key in id_values:
 196         if id_values[key] != row.get(key, None):
 197           return False
 198       return True
 199     return Grep
 200
 201   def GetRowsByValue(self, id_values):
 202     """Return list of rows matching key/value pairs in |id_values|."""
 203     # If row retrieval by value is heavily used for larger tables, then
 204     # the implementation should change to be more efficient, at the
 205     # expense of some pre-processing and extra storage.
 206     grep = self._GenRowFilter(id_values)
 207     return [r for r in self._rows if grep(r)]
 208
 209   def GetRowIndicesByValue(self, id_values):
 210     """Return list of indices for rows matching k/v pairs in |id_values|."""
 211     grep = self._GenRowFilter(id_values)
 212     indices = []
 213     for ix, row in enumerate(self._rows):
 214       if grep(row):
 215         indices.append(ix)
 216
 217     return indices
 218
 219   def _PrepareValuesForAdd(self, values):
 220     """Prepare a |values| dict/list to be added as a row.
 221
 222     If |values| is a dict, verify that only supported column
 223     values are included. Add empty string values for columns
 224     not seen in the row.  The original dict may be altered.
 225
 226     If |values| is a list, translate it to a dict using known
 227     column order.  Append empty values as needed to match number
 228     of expected columns.
 229
 230     Return prepared dict.
 231     """
 232     if isinstance(values, dict):
 233       for col in values:
 234         if not col in self._column_set:
 235           raise LookupError("Tried adding data to unknown column '%s'" % col)
 236
 237       for col in self._columns:
 238         if not col in values:
 239           values[col] = self.EMPTY_CELL
 240
 241     elif isinstance(values, list):
 242       if len(values) > len(self._columns):
 243         raise LookupError("Tried adding row with too many columns")
 244       if len(values) < len(self._columns):
 245         shortage = len(self._columns) - len(values)
 246         values.extend([self.EMPTY_CELL] * shortage)
 247
 248       values = dict(zip(self._columns, values))
 249
 250     return values
 251
 252   def AppendRow(self, values):
 253     """Add a single row of data to the table, according to |values|.
 254
 255     The |values| argument can be either a dict or list.
 256     """
 257     row = self._PrepareValuesForAdd(values)
 258     self._rows.append(row)
 259
 260   def SetRowByIndex(self, index, values):
 261     """Replace the row at |index| with values from |values| dict."""
 262     row = self._PrepareValuesForAdd(values)
 263     self._rows[index] = row
 264
 265   def RemoveRowByIndex(self, index):
 266     """Remove the row at |index|."""
 267     del self._rows[index]
 268
 269   def HasColumn(self, name):
 270     """Return True if column |name| is in this table, False otherwise."""
 271     return name in self._column_set
 272
 273   def GetColumnIndex(self, name):
 274     """Return the column index for column |name|, -1 if not found."""
 275     for ix, col in enumerate(self._columns):
 276       if name == col:
 277         return ix
 278     return -1
 279
 280   def GetColumnByIndex(self, index):
 281     """Return the column name at |index|"""
 282     return self._columns[index]
 283
 284   def InsertColumn(self, index, name, value=None):
 285     """Insert a new column |name| into table at index |index|.
 286
 287     If |value| is specified, all rows will have |value| in the new column.
 288     Otherwise, they will have the EMPTY_CELL value.
 289     """
 290     if self.HasColumn(name):
 291       raise LookupError("Column %s already exists in table." % name)
 292
 293     self._columns.insert(index, name)
 294     self._column_set.add(name)
 295
 296     for row in self._rows:
 297       row[name] = value if value is not None else self.EMPTY_CELL
 298
 299   def AppendColumn(self, name, value=None):
 300     """Same as InsertColumn, but new column is appended after existing ones."""
 301     self.InsertColumn(self.GetNumColumns(), name, value)
 302
 303   def ProcessRows(self, row_processor):
 304     """Invoke |row_processor| on each row in sequence."""
 305     for row in self._rows:
 306       row_processor(row)
 307
 308   def MergeTable(self, other_table, id_columns, merge_rules=None,
 309                  allow_new_columns=False, key=None, reverse=False,
 310                  new_name=None):
 311     """Merge |other_table| into this table, identifying rows by |id_columns|.
 312
 313     The |id_columns| argument can either be a list of identifying columns names
 314     or a single column name (string).  The values in these columns will be used
 315     to identify the existing row that each row in |other_table| should be
 316     merged into.
 317
 318     The |merge_rules| specify what to do when there is a merge conflict.  Every
 319     column where a conflict is anticipated should have an entry in the
 320     |merge_rules| dict.  The value should be one of:
 321     'join_with:<text>| = Join the two conflicting values with <text>
 322     'accept_this_val' = Keep value in 'this' table and discard 'other' value.
 323     'accept_other_val' = Keep value in 'other' table and discard 'this' value.
 324     function = Keep return value from function(col_name, this_val, other_val)
 325
 326     A default merge rule can be specified with the key '__DEFAULT__' in
 327     |merge_rules|.
 328
 329     By default, the |other_table| must not have any columns that don't already
 330     exist in this table.  To allow new columns to be creating by virtue of their
 331     presence in |other_table| set |allow_new_columns| to true.
 332
 333     To sort the final merged table, supply |key| and |reverse| arguments exactly
 334     as they work with the Sort method.
 335     """
 336     # If requested, allow columns in other_table to create new columns
 337     # in this table if this table does not already have them.
 338     if allow_new_columns:
 339       # pylint: disable=W0212
 340       for ix, col in enumerate(other_table._columns):
 341         if not self.HasColumn(col):
 342           # Create a merge_rule on the fly for this new column.
 343           if not merge_rules:
 344             merge_rules = {}
 345           merge_rules[col] = 'accept_other_val'
 346
 347           if ix == 0:
 348             self.InsertColumn(0, col)
 349           else:
 350             prevcol = other_table._columns[ix - 1]
 351             previx = self.GetColumnIndex(prevcol)
 352             self.InsertColumn(previx + 1, col)
 353
 354     for other_row in other_table:
 355       self._MergeRow(other_row, id_columns, merge_rules=merge_rules)
 356
 357     # Optionally re-sort the merged table.
 358     if key:
 359       self.Sort(key, reverse=reverse)
 360
 361     if new_name:
 362       self.SetName(new_name)
 363     elif self.GetName() and other_table.GetName():
 364       self.SetName(self.GetName() + ' + ' + other_table.GetName())
 365
 366   def _GetIdValuesForRow(self, row, id_columns):
 367     """Return a dict with values from |row| in |id_columns|."""
 368     id_values = dict((col, row[col]) for col in
 369                      cros_build_lib.iflatten_instance(id_columns))
 370     return id_values
 371
 372   def _MergeRow(self, other_row, id_columns, merge_rules=None):
 373     """Merge |other_row| into this table.
 374
 375     See MergeTables for description of |id_columns| and |merge_rules|.
 376     """
 377     id_values = self._GetIdValuesForRow(other_row, id_columns)
 378
 379     row_indices = self.GetRowIndicesByValue(id_values)
 380     if row_indices:
 381       row_index = row_indices[0]
 382       row = self.GetRowByIndex(row_index)
 383       for col in other_row:
 384         if col in row:
 385           # Find the merge rule that applies to this column, if there is one.
 386           merge_rule = None
 387           if merge_rules:
 388             merge_rule = merge_rules.get(col, None)
 389             if not merge_rule and merge_rules:
 390               merge_rule = merge_rules.get('__DEFAULT__', None)
 391
 392           try:
 393             val = self._MergeColValue(col, row[col], other_row[col],
 394                                       merge_rule=merge_rule)
 395           except ValueError:
 396             msg = "Failed to merge '%s' value in row %r" % (col, id_values)
 397             print >> sys.stderr, msg
 398             raise
 399
 400           if val != row[col]:
 401             row[col] = val
 402         else:
 403           # Cannot add new columns to row this way.
 404           raise LookupError("Tried merging data to unknown column '%s'" % col)
 405       self.SetRowByIndex(row_index, row)
 406     else:
 407       self.AppendRow(other_row)
 408
 409   def _MergeColValue(self, col, val, other_val, merge_rule):
 410     """Merge |col| values |val| and |other_val| according to |merge_rule|.
 411
 412     See MergeTable method for explanation of option |merge_rule|.
 413     """
 414     if val == other_val:
 415       return val
 416
 417     if not merge_rule:
 418       raise ValueError("Cannot merge column values without rule: '%s' vs '%s'" %
 419                        (val, other_val))
 420     elif inspect.isfunction(merge_rule):
 421       try:
 422         return merge_rule(col, val, other_val)
 423       except ValueError:
 424         pass # Fall through to exception at end
 425     elif merge_rule == 'accept_this_val':
 426       return val
 427     elif merge_rule == 'accept_other_val':
 428       return other_val
 429     else:
 430       match = re.match(r'join_with:(.+)$', merge_rule)
 431       if match:
 432         return match.group(1).join(v for v in (val, other_val) if v)
 433
 434     raise ValueError("Invalid merge rule (%s) for values '%s' and '%s'." %
 435                      (merge_rule, val, other_val))
 436
 437   def Sort(self, key, reverse=False):
 438     """Sort the rows using the given |key| function."""
 439     self._rows.sort(key=key, reverse=reverse)
 440
 441   def WriteCSV(self, filehandle, hiddencols=None):
 442     """Write this table out as comma-separated values to |filehandle|.
 443
 444     To skip certain columns during the write, use the |hiddencols| set.
 445     """
 446     def ColFilter(col):
 447       """Filter function for columns not in hiddencols."""
 448       return not hiddencols or col not in hiddencols
 449
 450     cols = [col for col in self._columns if ColFilter(col)]
 451     filehandle.write(','.join(cols) + '\n')
 452     for row in self._rows:
 453       vals = [row.get(col, self.EMPTY_CELL) for col in cols]
 454       filehandle.write(','.join(vals) + '\n')