Created parse_log.py, competitor to parse_log.sh
author: Daniel Golden <dgolden1@gmail.com>
Thu, 30 Oct 2014 22:51:14 +0000 (15:51 -0700)
committer: Daniel Golden <dgolden1@gmail.com>
Mon, 8 Dec 2014 20:06:19 +0000 (12:06 -0800)
tools/extra/extract_seconds.py
tools/extra/parse_log.py [new file with mode: 0755]

index f791afa..591a51f 100755 (executable)
@@ -18,18 +18,39 @@ def extract_datetime_from_line(line, year):
     dt = datetime.datetime(year, month, day, hour, minute, second, microsecond)
     return dt
 
+
def get_log_created_year(input_file):
    """Return the year (as an int) of the log file's filesystem creation time."""
    ctime = os.path.getctime(input_file)
    return datetime.datetime.fromtimestamp(ctime).year
+
+
def get_start_time(line_iterable, year):
    """Scan lines for the first one mentioning 'Solving' and return its
    datetime (via extract_datetime_from_line), or None if no such line exists.
    """
    for raw_line in line_iterable:
        stripped = raw_line.strip()
        if 'Solving' in stripped:
            return extract_datetime_from_line(stripped, year)
    return None
+
+
 def extract_seconds(input_file, output_file):
     with open(input_file, 'r') as f:
         lines = f.readlines()
-    log_created_time = os.path.getctime(input_file)
-    log_created_year = datetime.datetime.fromtimestamp(log_created_time).year
-    start_time_found = False
+    log_created_year = get_log_created_year(input_file)
+    start_datetime = get_start_time(lines, log_created_year)
+    assert start_datetime, 'Start time not found'
+
     out = open(output_file, 'w')
     for line in lines:
         line = line.strip()
-        if not start_time_found and line.find('Solving') != -1:
-            start_time_found = True
-            start_datetime = extract_datetime_from_line(line, log_created_year)
         if line.find('Iteration') != -1:
             dt = extract_datetime_from_line(line, log_created_year)
             elapsed_seconds = (dt - start_datetime).total_seconds()
diff --git a/tools/extra/parse_log.py b/tools/extra/parse_log.py
new file mode 100755 (executable)
index 0000000..dca7999
--- /dev/null
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+
+"""
+Parse training log
+
+Competitor to parse_log.sh
+"""
+
+import os
+import re
+import extract_seconds
+import argparse
+
+
def get_line_type(line):
    """Return either 'train' or 'test' depending on line type, or None.

    A line containing 'Train' is a training line; otherwise a line
    containing 'Test' is a testing line.  Any other line yields None.
    (Original docstring had an unbalanced quote: "'test' or 'train depending".)
    """

    line_type = None
    if line.find('Train') != -1:
        line_type = 'train'
    elif line.find('Test') != -1:
        line_type = 'test'
    return line_type
+
+
def parse_log(path_to_log):
    """Parse a Caffe training log file.

    Returns a pair (train_list, test_list) where
      train_list rows are (iteration, seconds, training_loss, learning_rate)
      test_list rows are (iteration, seconds, test_accuracy, test_loss)

    seconds is the elapsed time since the log's 'Solving' line.
    """

    # Raw strings so '\d' and '\.' are regex escapes, not (invalid)
    # Python string escape sequences.
    re_iteration = re.compile(r'Iteration (\d+)')
    re_accuracy = re.compile(r'output #\d+: accuracy = ([\.\d]+)')
    re_loss = re.compile(r'output #\d+: loss = ([\.\d]+)')
    re_lr = re.compile(r'lr = ([\.\d]+)')

    # Pick out lines of interest; sentinels mark "not seen yet".
    iteration = -1
    test_accuracy = -1
    learning_rate = float('NaN')
    train_list = []
    test_list = []

    logfile_year = extract_seconds.get_log_created_year(path_to_log)
    with open(path_to_log) as f:
        # Consumes lines up to and including the 'Solving' line.
        start_time = extract_seconds.get_start_time(f, logfile_year)

        for line in f:
            iteration_match = re_iteration.search(line)
            if iteration_match:
                iteration = float(iteration_match.group(1))
            if iteration == -1:
                # Only look for other stuff if we've found the first iteration
                continue

            time = extract_seconds.extract_datetime_from_line(line,
                                                              logfile_year)
            seconds = (time - start_time).total_seconds()

            lr_match = re_lr.search(line)
            if lr_match:
                learning_rate = float(lr_match.group(1))

            # Remember the most recent accuracy; it is paired with the test
            # loss reported on a later line (see NOTE below).
            accuracy_match = re_accuracy.search(line)
            if accuracy_match:
                test_accuracy = float(accuracy_match.group(1))

            loss_match = re_loss.search(line)
            if loss_match:
                loss = float(loss_match.group(1))
                line_type = get_line_type(line)
                assert line_type, ('Failed to determine line type for line: ' +
                                   line)
                if line_type == 'test':
                    # NOTE: we assume that accuracy always comes right before
                    # loss for test data
                    test_list.append((iteration, seconds, test_accuracy, loss))
                elif line_type == 'train':
                    train_list.append((iteration, seconds, loss, learning_rate))

    return train_list, test_list
+
+
def save_csv_files(logfile_path, output_dir, train_list, test_list,
                   verbose=False):
    """Write train_list and test_list as CSV files into output_dir.

    Output names come from the log file's basename: for an input log
    named caffe.INFO, the files are caffe.INFO.train and caffe.INFO.test.
    """

    log_basename = os.path.basename(logfile_path)
    jobs = (
        ('.train', train_list,
         'NumIters,Seconds,TrainingLoss,LearningRate'),
        ('.test', test_list,
         'NumIters,Seconds,TestAccuracy,TestLoss'),
    )
    for suffix, rows, header in jobs:
        out_path = os.path.join(output_dir, log_basename + suffix)
        write_csv(out_path, rows, '%d,%f,%f,%f', header, verbose)
+
+
def write_csv(output_filename, list_of_tuples, format_string, header,
              verbose=False):
    """Write a CSV file

    output_filename -- path of the file to (over)write
    list_of_tuples  -- data rows; each tuple must match format_string
    format_string   -- '%'-style format applied to each row tuple
    header          -- header line, written first (newline appended)
    verbose         -- if True, report the output filename when done
    """
    with open(output_filename, 'w') as f:
        f.write(header + '\n')
        for row in list_of_tuples:
            line = format_string % row
            f.write(line + '\n')
    if verbose:
        # Parenthesized form prints identically under Python 2 (expression
        # statement) and Python 3 (function call), unlike the bare
        # Python-2-only print statement it replaces.
        print('Wrote %s' % output_filename)
+
+
def parse_args():
    """Build the command-line interface and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        description=('Parse a Caffe training log into two CSV files '
                     'representing training and testing information'))

    parser.add_argument('logfile_path', help='Path to log file')
    parser.add_argument('output_dir',
                        help='Directory in which to place output CSV files')
    parser.add_argument('--verbose', action='store_true',
                        help='Print some extra info (e.g., output filenames)')

    return parser.parse_args()
+
+
def main():
    """CLI entry point: parse the log and save the train/test CSV files."""
    args = parse_args()
    train_list, test_list = parse_log(args.logfile_path)
    # Forward --verbose so save_csv_files can report output filenames;
    # previously the flag was parsed but never used anywhere.
    save_csv_files(args.logfile_path, args.output_dir, train_list, test_list,
                   verbose=args.verbose)
+
+
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()