From 02ecf1dacf24e99e9914f173e3ad2cb83637a3b2 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Mon, 19 May 2014 23:50:15 -0700 Subject: [PATCH] windowed detection in python --- python/caffe/__init__.py | 1 + python/caffe/detection/__init__.py | 0 python/caffe/detection/detector.py | 462 ------------------------------------- python/caffe/detector.py | 110 +++++++++ python/detect.py | 151 ++++++++++++ 5 files changed, 262 insertions(+), 462 deletions(-) delete mode 100644 python/caffe/detection/__init__.py delete mode 100644 python/caffe/detection/detector.py create mode 100644 python/caffe/detector.py create mode 100755 python/detect.py diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py index e07b013..430bfce 100644 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -1,3 +1,4 @@ from .pycaffe import Net, SGDSolver from .classifier import Classifier +from .detector import Detector import io diff --git a/python/caffe/detection/__init__.py b/python/caffe/detection/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/python/caffe/detection/detector.py b/python/caffe/detection/detector.py deleted file mode 100644 index 9355274..0000000 --- a/python/caffe/detection/detector.py +++ /dev/null @@ -1,462 +0,0 @@ -#!/usr/bin/env python -""" -Do windowed detection by classifying a number of images/crops at once, -optionally using the selective search window proposal method. - -This implementation follows - Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. - Rich feature hierarchies for accurate object detection and semantic - segmentation. - http://arxiv.org/abs/1311.2524 - -The selective_search_ijcv_with_python code is available at - https://github.com/sergeyk/selective_search_ijcv_with_python - -TODO: -- batch up image filenames as well: don't want to load all of them into memory -- refactor into class (without globals) -- get rid of imagenet mean file and just use mean pixel value -""" -import numpy as np -import pandas as pd -import os -import sys -import argparse -import time -import skimage.io -import skimage.transform -import selective_search_ijcv_with_python as selective_search -import caffe - -NET = None - -IMAGE_DIM = None -CROPPED_DIM = None -IMAGE_CENTER = None - -IMAGE_MEAN = None -CROPPED_IMAGE_MEAN = None - -BATCH_SIZE = None -NUM_OUTPUT = None - -CROP_MODES = ['list', 'center_only', 'corners', 'selective_search'] - - -def load_image(filename): - """ - Input: - filename: string - - Output: - image: an image of size (H x W x 3) of type uint8. - """ - img = skimage.io.imread(filename) - if img.ndim == 2: - img = np.tile(img[:, :, np.newaxis], (1, 1, 3)) - elif img.shape[2] == 4: - img = img[:, :, :3] - return img - - -def format_image(image, window=None, cropped_size=False): - """ - Input: - image: (H x W x 3) ndarray - window: (4) ndarray - (ymin, xmin, ymax, xmax) coordinates, 0-indexed - cropped_size: bool - Whether to output cropped size image or full size image. - - Output: - image: (3 x H x W) ndarray - Resized to either IMAGE_DIM or CROPPED_DIM. - dims: (H, W) of the original image - """ - dims = image.shape[:2] - - # Crop a subimage if window is provided. - if window is not None: - image = image[window[0]:window[2], window[1]:window[3]] - - # Resize to input size, subtract mean, convert to BGR - image = image[:, :, ::-1] - if cropped_size: - image = skimage.transform.resize(image, (CROPPED_DIM, CROPPED_DIM)) * 255 - image -= CROPPED_IMAGE_MEAN - else: - image = skimage.transform.resize(image, (IMAGE_DIM, IMAGE_DIM)) * 255 - image -= IMAGE_MEAN - - image = image.swapaxes(1, 2).swapaxes(0, 1) - return image, dims - - -def _image_coordinates(dims, window): - """ - Calculate the original image coordinates of a - window in the canonical (IMAGE_DIM x IMAGE_DIM) coordinates - - Input: - dims: (H, W) of the original image - window: (ymin, xmin, ymax, xmax) in the (IMAGE_DIM x IMAGE_DIM) frame - - Output: - image_window: (ymin, xmin, ymax, xmax) in the original image frame - """ - h, w = dims - max_dim = float(IMAGE_DIM) - h_scale, w_scale = h / max_dim, w / max_dim - image_window = window * np.array((1. / h_scale, 1. / w_scale, - h_scale, w_scale)) - return image_window.round().astype(int) - - -def _assemble_images_list(input_df): - """ - For each image, collect the crops for the given windows. - - Input: - input_df: pandas.DataFrame - with 'filename', 'ymin', 'xmin', 'ymax', 'xmax' columns - - Output: - images_df: pandas.DataFrame - with 'image', 'window', 'filename' columns - """ - # unpack sequence of (image filename, windows) - coords = ['ymin', 'xmin', 'ymax', 'xmax'] - image_windows = ( - (ix, input_df.iloc[np.where(input_df.index == ix)][coords].values) - for ix in input_df.index.unique() - ) - - # extract windows - data = [] - for image_fname, windows in image_windows: - image = load_image(image_fname) - for window in windows: - window_image, _ = format_image(image, window, cropped_size=True) - data.append({ - 'image': window_image[np.newaxis, :], - 'window': window, - 'filename': image_fname - }) - - images_df = pd.DataFrame(data) - return images_df - - -def _assemble_images_center_only(image_fnames): - """ - For each image, square the image and crop its center. - - Input: - image_fnames: list - - Output: - images_df: pandas.DataFrame - With 'image', 'window', 'filename' columns. - """ - crop_start, crop_end = IMAGE_CENTER, IMAGE_CENTER + CROPPED_DIM - crop_window = np.array((crop_start, crop_start, crop_end, crop_end)) - - data = [] - for image_fname in image_fnames: - image, dims = format_image(load_image(image_fname)) - data.append({ - 'image': image[np.newaxis, :, - crop_start:crop_end, - crop_start:crop_end], - 'window': _image_coordinates(dims, crop_window), - 'filename': image_fname - }) - - images_df = pd.DataFrame(data) - return images_df - - -def _assemble_images_corners(image_fnames): - """ - For each image, square the image and crop its center, four corners, - and mirrored version of the above. - - Input: - image_fnames: list - - Output: - images_df: pandas.DataFrame - With 'image', 'window', 'filename' columns. - """ - # make crops - indices = [0, IMAGE_DIM - CROPPED_DIM] - crops = np.empty((5, 4), dtype=int) - curr = 0 - for i in indices: - for j in indices: - crops[curr] = (i, j, i + CROPPED_DIM, j + CROPPED_DIM) - curr += 1 - crops[4] = (IMAGE_CENTER, IMAGE_CENTER, - IMAGE_CENTER + CROPPED_DIM, IMAGE_CENTER + CROPPED_DIM) - all_crops = np.tile(crops, (2, 1)) - - data = [] - for image_fname in image_fnames: - image, dims = format_image(load_image(image_fname)) - image_crops = np.empty((10, 3, CROPPED_DIM, CROPPED_DIM), dtype=np.float32) - curr = 0 - for crop in crops: - image_crops[curr] = image[:, crop[0]:crop[2], crop[1]:crop[3]] - curr += 1 - image_crops[5:] = image_crops[:5, :, :, ::-1] # flip for mirrors - for i in range(len(all_crops)): - data.append({ - 'image': image_crops[i][np.newaxis, :], - 'window': _image_coordinates(dims, all_crops[i]), - 'filename': image_fname - }) - - images_df = pd.DataFrame(data) - return images_df - - -def _assemble_images_selective_search(image_fnames): - """ - Run Selective Search window proposals on all images, then for each - image-window pair, extract a square crop. - - Input: - image_fnames: list - - Output: - images_df: pandas.DataFrame - With 'image', 'window', 'filename' columns. - """ - windows_list = selective_search.get_windows(image_fnames) - - data = [] - for image_fname, windows in zip(image_fnames, windows_list): - image = load_image(image_fname) - for window in windows: - window_image, _ = format_image(image, window, cropped_size=True) - data.append({ - 'image': window_image[np.newaxis, :], - 'window': window, - 'filename': image_fname - }) - - images_df = pd.DataFrame(data) - return images_df - - -def assemble_batches(inputs, crop_mode='center_only'): - """ - Assemble DataFrame of image crops for feature computation. - - Input: - inputs: list of filenames (center_only, corners, and selective_search mode) - OR input DataFrame (list mode) - mode: string - 'list': take the image windows from the input as-is - 'center_only': take the CROPPED_DIM middle of the image windows - 'corners': take CROPPED_DIM-sized boxes at 4 corners and center of - the image windows, as well as their flipped versions: a total of 10. - 'selective_search': run Selective Search region proposal on the - image windows, and take each enclosing subwindow. - - Output: - df_batches: list of DataFrames, each one of BATCH_SIZE rows. - Each row has 'image', 'filename', and 'window' info. - Column 'image' contains (X x 3 x CROPPED_DIM x CROPPED_IM) ndarrays. - Column 'filename' contains source filenames. - Column 'window' contains [ymin, xmin, ymax, xmax] ndarrays. - If 'filename' is None, then the row is just for padding. - - Note: for increased efficiency, increase the batch size (to the limit of gpu - memory) to avoid the communication cost - """ - if crop_mode == 'list': - images_df = _assemble_images_list(inputs) - - elif crop_mode == 'center_only': - images_df = _assemble_images_center_only(inputs) - - elif crop_mode == 'corners': - images_df = _assemble_images_corners(inputs) - - elif crop_mode == 'selective_search': - images_df = _assemble_images_selective_search(inputs) - - else: - raise Exception("Unknown mode: not in {}".format(CROP_MODES)) - - # Make sure the DataFrame has a multiple of BATCH_SIZE rows: - # just fill the extra rows with NaN filenames and all-zero images. - N = images_df.shape[0] - remainder = N % BATCH_SIZE - if remainder > 0: - zero_image = np.zeros_like(images_df['image'].iloc[0]) - zero_window = np.zeros((1, 4), dtype=int) - remainder_df = pd.DataFrame([{ - 'filename': None, - 'image': zero_image, - 'window': zero_window - }] * (BATCH_SIZE - remainder)) - images_df = images_df.append(remainder_df) - N = images_df.shape[0] - - # Split into batches of BATCH_SIZE. - ind = np.arange(N) / BATCH_SIZE - df_batches = [images_df[ind == i] for i in range(N / BATCH_SIZE)] - return df_batches - - -def compute_feats(images_df): - input_blobs = [np.ascontiguousarray( - np.concatenate(images_df['image'].values), dtype='float32')] - output_blobs = [np.empty((BATCH_SIZE, NUM_OUTPUT, 1, 1), dtype=np.float32)] - - NET.Forward(input_blobs, output_blobs) - feats = [output_blobs[0][i].flatten() for i in range(len(output_blobs[0]))] - - # Add the features and delete the images. - del images_df['image'] - images_df['feat'] = feats - return images_df - - -def config(model_def, pretrained_model, gpu, image_dim, image_mean_file): - global IMAGE_DIM, CROPPED_DIM, IMAGE_CENTER, IMAGE_MEAN, CROPPED_IMAGE_MEAN - global NET, BATCH_SIZE, NUM_OUTPUT - - # Initialize network by loading model definition and weights. - t = time.time() - print("Loading Caffe model.") - NET = caffe.Net(model_def, pretrained_model) - NET.set_phase_test() - if gpu: - NET.set_mode_gpu() - print("Caffe model loaded in {:.3f} s".format(time.time() - t)) - - # Configure for input/output data - IMAGE_DIM = image_dim - CROPPED_DIM = NET.blobs.values()[0].width - IMAGE_CENTER = int((IMAGE_DIM - CROPPED_DIM) / 2) - - # Load the data set mean file - IMAGE_MEAN = np.load(image_mean_file) - - CROPPED_IMAGE_MEAN = IMAGE_MEAN[IMAGE_CENTER:IMAGE_CENTER + CROPPED_DIM, - IMAGE_CENTER:IMAGE_CENTER + CROPPED_DIM, - :] - BATCH_SIZE = NET.blobs.values()[0].num # network batch size - NUM_OUTPUT = NET.blobs.values()[-1].channels # number of output classes - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Required arguments: input and output. - parser.add_argument( - "input_file", - help="Input txt/csv filename. If .txt, must be list of filenames.\ - If .csv, must be comma-separated file with header\ - 'filename, xmin, ymin, xmax, ymax'" - ) - parser.add_argument( - "output_file", - help="Output h5/csv filename. Format depends on extension." - ) - - # Optional arguments. - parser.add_argument( - "--model_def", - default="../../../examples/imagenet/imagenet_deploy.prototxt", - help="Model definition file." - ) - parser.add_argument( - "--pretrained_model", - default="../../../examples/imagenet/caffe_reference_imagenet_model", - help="Trained model weights file." - ) - parser.add_argument( - "--gpu", - default=False, - help="Switch for gpu computation." - ) - parser.add_argument( - "--crop_mode", - default="center_only", - choices=CROP_MODES, - help="Image crop mode" - ) - parser.add_argument( - "--images_dim", - default=256, - help="Canonical dimension of (square) images." - ) - parser.add_argument( - "--images_mean_file", - default=os.path.join( - os.path.dirname(__file__), '../imagenet/ilsvrc_2012_mean.npy'), - help="Data set image mean (numpy array).") - - args = parser.parse_args() - - # Configure network, input, output. - config(args.model_def, args.pretrained_model, args.gpu, args.images_dim, - args.images_mean_file) - - # Load input. - t = time.time() - print('Loading input and assembling batches...') - if args.input_file.lower().endswith('txt'): - with open(args.input_file) as f: - inputs = [_.strip() for _ in f.readlines()] - elif args.input_file.lower().endswith('csv'): - inputs = pd.read_csv(args.input_file, sep=',', dtype={'filename': str}) - inputs.set_index('filename', inplace=True) - else: - raise Exception("Uknown input file type: not in txt or csv") - - # Assemble into batches - image_batches = assemble_batches(inputs, args.crop_mode) - print('{} batches assembled in {:.3f} s'.format(len(image_batches), - time.time() - t)) - - # Process the batches. - t = time.time() - print 'Processing {} files in {} batches'.format(len(inputs), - len(image_batches)) - dfs_with_feats = [] - for i in range(len(image_batches)): - if i % 10 == 0: - print('...on batch {}/{}, elapsed time: {:.3f} s'.format( - i, len(image_batches), time.time() - t)) - dfs_with_feats.append(compute_feats(image_batches[i])) - - # Concatenate, droppping the padding rows. - df = pd.concat(dfs_with_feats).dropna(subset=['filename']) - df.set_index('filename', inplace=True) - print("Processing complete after {:.3f} s.".format(time.time() - t)) - - # Label coordinates - coord_cols = ['ymin', 'xmin', 'ymax', 'xmax'] - df[coord_cols] = pd.DataFrame( - data=np.vstack(df['window']), index=df.index, columns=coord_cols) - del(df['window']) - - # Write out the results. - t = time.time() - if args.output_file.lower().endswith('csv'): - # enumerate the class probabilities - class_cols = ['class{}'.format(x) for x in range(NUM_OUTPUT)] - df[class_cols] = pd.DataFrame( - data=np.vstack(df['feat']), index=df.index, columns=class_cols) - df.to_csv(args.output_file, cols=coord_cols + class_cols) - else: - df.to_hdf(args.output_file, 'df', mode='w') - print("Done. Saving to {} took {:.3f} s.".format( - args.output_file, time.time() - t)) - - sys.exit() diff --git a/python/caffe/detector.py b/python/caffe/detector.py new file mode 100644 index 0000000..db9b1aa --- /dev/null +++ b/python/caffe/detector.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python +""" +Do windowed detection by classifying a number of images/crops at once, +optionally using the selective search window proposal method. + +This implementation follows ideas in + Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. + Rich feature hierarchies for accurate object detection and semantic + segmentation. + http://arxiv.org/abs/1311.2524 + +The selective_search_ijcv_with_python code required for the selective search +proposal mode is available at + https://github.com/sergeyk/selective_search_ijcv_with_python + +TODO +- R-CNN crop mode / crop with context. +- Bundle with R-CNN model for example. +""" +import numpy as np +import os + +import caffe + + +class Detector(caffe.Net): + """ + Detector extends Net for windowed detection by a list of crops or + selective search proposals. + """ + def __init__(self, model_file, pretrained_file, gpu=False, mean_file=None, + input_scale=None, channel_swap=None): + """ + Take + gpu, mean_file, input_scale, channel_swap: convenience params for + setting mode, mean, input scale, and channel order. + """ + caffe.Net.__init__(self, model_file, pretrained_file) + self.set_phase_test() + + if gpu: + self.set_mode_gpu() + else: + self.set_mode_cpu() + + if mean_file: + self.set_mean(self.inputs[0], mean_file) + if input_scale: + self.set_input_scale(self.inputs[0], input_scale) + if channel_swap: + self.set_channel_swap(self.inputs[0], channel_swap) + + + def detect_windows(self, images_windows): + """ + Do windowed detection over given images and windows. Windows are + extracted then warped to the input dimensions of the net. + + Take + images_windows: (image filename, window list) iterable. + + Give + detections: list of {filename: image filename, window: crop coordinates, + predictions: prediction vector} dicts. + """ + # Extract windows. + window_inputs = [] + for image_fname, windows in images_windows: + image = caffe.io.load_image(image_fname).astype(np.float32) + for window in windows: + window_inputs.append(image[window[0]:window[2], + window[1]:window[3]]) + + # Run through the net (warping windows to input dimensions). + caffe_in = self.preprocess(self.inputs[0], window_inputs) + out = self.forward_all(**{self.inputs[0]: caffe_in}) + predictions = out[self.outputs[0]].squeeze(axis=(2,3)) + + # Package predictions with images and windows. + detections = [] + ix = 0 + for image_fname, windows in images_windows: + for window in windows: + detections.append({ + 'window': window, + 'prediction': predictions[ix], + 'filename': image_fname + }) + ix += 1 + return detections + + + def detect_selective_search(self, image_fnames): + """ + Do windowed detection over Selective Search proposals by extracting + the crop and warping to the input dimensions of the net. + + Take + image_fnames: list + + Give + detections: list of {filename: image filename, window: crop coordinates, + predictions: prediction vector} dicts. + """ + import selective_search_ijcv_with_python as selective_search + # Make absolute paths so MATLAB can find the files. + image_fnames = [os.path.abspath(f) for f in image_fnames] + windows_list = selective_search.get_windows(image_fnames) + # Run windowed detection on the selective search list. + return self.detect_windows(zip(image_fnames, windows_list)) diff --git a/python/detect.py b/python/detect.py new file mode 100755 index 0000000..15418bb --- /dev/null +++ b/python/detect.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python +""" +detector.py is an out-of-the-box windowed detector +callable from the command line. + +By default it configures and runs the Caffe reference ImageNet model. +Note that this model was trained for image classification and not detection, +and finetuning for detection can be expected to improve results. + +The selective_search_ijcv_with_python code required for the selective search +proposal mode is available at + https://github.com/sergeyk/selective_search_ijcv_with_python + +TODO: +- batch up image filenames as well: don't want to load all of them into memory +- come up with a batching scheme that preserved order / keeps a unique ID +""" +import numpy as np +import pandas as pd +import os +import argparse +import time + +import caffe + +CROP_MODES = ['list', 'selective_search'] +COORD_COLS = ['ymin', 'xmin', 'ymax', 'xmax'] + + +def main(argv): + pycaffe_dir = os.path.dirname(__file__) + + parser = argparse.ArgumentParser() + # Required arguments: input and output. + parser.add_argument( + "input_file", + help="Input txt/csv filename. If .txt, must be list of filenames.\ + If .csv, must be comma-separated file with header\ + 'filename, xmin, ymin, xmax, ymax'" + ) + parser.add_argument( + "output_file", + help="Output h5/csv filename. Format depends on extension." + ) + # Optional arguments. + parser.add_argument( + "--model_def", + default=os.path.join(pycaffe_dir, + "../examples/imagenet/imagenet_deploy.prototxt"), + help="Model definition file." + ) + parser.add_argument( + "--pretrained_model", + default=os.path.join(pycaffe_dir, + "../examples/imagenet/caffe_reference_imagenet_model"), + help="Trained model weights file." + ) + parser.add_argument( + "--crop_mode", + default="center_only", + choices=CROP_MODES, + help="Image crop mode" + ) + parser.add_argument( + "--gpu", + action='store_true', + help="Switch for gpu computation." + ) + parser.add_argument( + "--mean_file", + default=os.path.join(pycaffe_dir, + 'caffe/imagenet/ilsvrc_2012_mean.npy'), + help="Data set image mean of H x W x K dimensions (numpy array). " + + "Set to '' for no mean subtraction." + ) + parser.add_argument( + "--input_scale", + type=float, + default=255, + help="Multiply input features by this scale before input to net" + ) + parser.add_argument( + "--channel_swap", + default='2,1,0', + help="Order to permute input channels. The default converts " + + "RGB -> BGR since BGR is the Caffe default by way of OpenCV." + + ) + args = parser.parse_args() + + channel_swap = [int(s) for s in args.channel_swap.split(',')] + + # Make detector. + detector = caffe.Detector(args.model_def, args.pretrained_model, + gpu=args.gpu, mean_file=args.mean_file, + input_scale=args.input_scale, channel_swap=channel_swap) + + if args.gpu: + print 'GPU mode' + + # Load input. + t = time.time() + print('Loading input...') + if args.input_file.lower().endswith('txt'): + with open(args.input_file) as f: + inputs = [_.strip() for _ in f.readlines()] + elif args.input_file.lower().endswith('csv'): + inputs = pd.read_csv(args.input_file, sep=',', dtype={'filename': str}) + inputs.set_index('filename', inplace=True) + else: + raise Exception("Unknown input file type: not in txt or csv.") + + # Detect. + if args.crop_mode == 'list': + # Unpack sequence of (image filename, windows). + images_windows = ( + (ix, inputs.iloc[np.where(inputs.index == ix)][COORD_COLS].values) + for ix in inputs.index.unique() + ) + detections = detector.detect_windows(images_windows) + else: + detections = detector.detect_selective_search(inputs) + print("Processed {} windows in {:.3f} s.".format(len(detections), + time.time() - t)) + + # Collect into dataframe with labeled fields. + df = pd.DataFrame(detections) + df.set_index('filename', inplace=True) + df[COORD_COLS] = pd.DataFrame( + data=np.vstack(df['window']), index=df.index, columns=COORD_COLS) + del(df['window']) + + # Save results. + t = time.time() + if args.output_file.lower().endswith('csv'): + # csv + # Enumerate the class probabilities. + class_cols = ['class{}'.format(x) for x in range(NUM_OUTPUT)] + df[class_cols] = pd.DataFrame( + data=np.vstack(df['feat']), index=df.index, columns=class_cols) + df.to_csv(args.output_file, cols=COORD_COLS + class_cols) + else: + # h5 + df.to_hdf(args.output_file, 'df', mode='w') + print("Saved to {} in {:.3f} s.".format(args.output_file, + time.time() - t)) + + +if __name__ == "__main__": + import sys + main(sys.argv) -- 2.7.4