From 0584ae8f98fdedd8271124a2c2366233b4763b83 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Tim-Philipp=20M=C3=BCller?= Date: Wed, 30 Nov 2011 19:00:42 +0000 Subject: [PATCH] splitfilesrc: specify filenames via normal wildcards instead of regular expressions Less cracktastic in the end. --- gst/multifile/Makefile.am | 5 +- gst/multifile/gstsplitfilesrc.c | 74 +++++---- gst/multifile/patternspec.c | 334 ++++++++++++++++++++++++++++++++++++++++ gst/multifile/patternspec.h | 47 ++++++ 4 files changed, 424 insertions(+), 36 deletions(-) create mode 100644 gst/multifile/patternspec.c create mode 100644 gst/multifile/patternspec.h diff --git a/gst/multifile/Makefile.am b/gst/multifile/Makefile.am index e7dea03..519bd1b 100644 --- a/gst/multifile/Makefile.am +++ b/gst/multifile/Makefile.am @@ -5,13 +5,14 @@ libgstmultifile_la_SOURCES = \ gstmultifilesink.c \ gstmultifilesrc.c \ gstmultifile.c \ - gstsplitfilesrc.c + gstsplitfilesrc.c \ + patternspec.c libgstmultifile_la_CFLAGS = $(GST_BASE_CFLAGS) $(GST_CFLAGS) $(GIO_CFLAGS) libgstmultifile_la_LIBADD = $(GST_BASE_LIBS) $(GST_LIBS) $(GIO_LIBS) libgstmultifile_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS) libgstmultifile_la_LIBTOOLFLAGS = --tag=disable-static -noinst_HEADERS = gstmultifilesrc.h gstmultifilesink.h gstsplitfilesrc.h +noinst_HEADERS = gstmultifilesrc.h gstmultifilesink.h gstsplitfilesrc.h patternspec.h Android.mk: Makefile.am $(BUILT_SOURCES) diff --git a/gst/multifile/gstsplitfilesrc.c b/gst/multifile/gstsplitfilesrc.c index 3fcdf07..3125997 100644 --- a/gst/multifile/gstsplitfilesrc.c +++ b/gst/multifile/gstsplitfilesrc.c @@ -25,17 +25,14 @@ * had to be split into multiple parts due to filesystem file size limitations, * for example. * - * The files to select are chosen via the location property, which takes a - * regular expression (note: shell-style wildcards will not work). If the - * location is an absolute path or contains directory components, only the - * base file name part will be considered a regular expression. 
The results - * will be sorted. The location may include directory components, but the - * regular expression to select the files can only be in the filename part. + * The files to select are chosen via the location property, which supports + * (and expects) shell-style wildcards (but only for the filename, not for + * directories). The results will be sorted. * * * Example launch line * |[ - * gst-launch splitfilesrc location="/path/to/part-.*.mpg" ! decodebin ! ... \ + * gst-launch splitfilesrc location="/path/to/part-*.mpg" ! decodebin ! ... \ * ]| Plays the different parts as if they were one single MPEG file. * * @@ -51,9 +48,16 @@ #endif #include "gstsplitfilesrc.h" +#include "patternspec.h" #include +#ifdef G_OS_WIN32 +#define DEFAULT_PATTERN_MATCH_MODE MATCH_MODE_UTF8 +#else +#define DEFAULT_PATTERN_MATCH_MODE MATCH_MODE_AUTO +#endif + enum { PROP_LOCATION = 1 @@ -105,6 +109,12 @@ gst_split_file_src_base_init (gpointer g_class) "Tim-Philipp Müller "); } +#ifdef G_OS_WIN32 +#define WIN32_BLURB " Location string must be in UTF-8 encoding (on Windows)." +#else +#define WIN32_BLURB /* nothing */ +#endif + static void gst_split_file_src_class_init (GstSplitFileSrcClass * klass) { @@ -115,16 +125,12 @@ gst_split_file_src_class_init (GstSplitFileSrcClass * klass) gobject_class->get_property = gst_split_file_src_get_property; gobject_class->finalize = gst_split_file_src_finalize; - /* We're using a regular expression here instead of wildcards, because - * GPatternSpec can only handle UTF-8 and filenames on unix tend to be - * just bytes and are often ISO-8859-X, and we don't feel like - * re-inventing GPatternSpec */ g_object_class_install_property (gobject_class, PROP_LOCATION, g_param_spec_string ("location", "File Location", - "Regular expression to create file names of the input files. If " + "Wildcard pattern to match file names of the input files. 
If " "the location is an absolute path or contains directory components, " - "only the base file name part will be considered a regular " - "expression. The results will be sorted.", + "only the base file name part will be considered for pattern " + "matching. The results will be sorted." WIN32_BLURB, DEFAULT_LOCATION, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)); gstbasesrc_class->start = GST_DEBUG_FUNCPTR (gst_split_file_src_start); @@ -203,6 +209,12 @@ gst_split_file_src_set_property (GObject * object, guint prop_id, GST_OBJECT_LOCK (src); g_free (src->location); src->location = g_value_dup_string (value); +#ifdef G_OS_WIN32 + if (!g_utf8_validate (src->location, -1, NULL)) { + g_warning ("splitfilesrc 'location' property must be in UTF-8 " + "encoding on Windows"); + } +#endif GST_OBJECT_UNLOCK (src); break; default: @@ -239,10 +251,9 @@ static gchar ** gst_split_file_src_find_files (GstSplitFileSrc * src, const gchar * dirname, const gchar * basename, GError ** err) { + PatternSpec *pspec; GPtrArray *files; - GRegex *regex; const gchar *name; - gchar *regex_string; GDir *dir; if (dirname == NULL || basename == NULL) @@ -255,25 +266,20 @@ gst_split_file_src_find_files (GstSplitFileSrc * src, const gchar * dirname, if (dir == NULL) return NULL; - /* we want the filename to be the whole filename, not just some match - * in the middle of the filename */ - if (g_str_has_suffix (basename, "$")) - regex_string = g_strdup (basename); - else - regex_string = g_strconcat (basename, "$", NULL); - - regex = g_regex_new (regex_string, G_REGEX_RAW, (GRegexMatchFlags) 0, err); - g_free (regex_string); + if (DEFAULT_PATTERN_MATCH_MODE == MATCH_MODE_UTF8 && + !g_utf8_validate (basename, -1, NULL)) { + goto not_utf8; + } - if (regex == NULL) - goto regex_fail; + /* mode will be AUTO on linux/unix and UTF8 on win32 */ + pspec = pattern_spec_new (basename, DEFAULT_PATTERN_MATCH_MODE); files = g_ptr_array_new (); while ((name = g_dir_read_name (dir))) { GST_TRACE_OBJECT (src, 
"check: %s", name); - if (g_regex_match (regex, name, (GRegexMatchFlags) 0, NULL)) { - GST_LOG_OBJECT (src, "match: %s", name); + if (pattern_match_string (pspec, name)) { + GST_DEBUG_OBJECT (src, "match: %s", name); g_ptr_array_add (files, g_build_filename (dirname, name, NULL)); } } @@ -284,7 +290,7 @@ gst_split_file_src_find_files (GstSplitFileSrc * src, const gchar * dirname, g_ptr_array_sort (files, (GCompareFunc) gst_split_file_src_array_sortfunc); g_ptr_array_add (files, NULL); - g_regex_unref (regex); + pattern_spec_free (pspec); g_dir_close (dir); return (gchar **) g_ptr_array_free (files, FALSE); @@ -296,21 +302,21 @@ invalid_location: "No filename specified."); return NULL; } -regex_fail: +not_utf8: { - GST_WARNING_OBJECT (src, "g_regex_new() failed: %s", (*err)->message); g_dir_close (dir); + g_set_error_literal (err, G_FILE_ERROR, G_FILE_ERROR_INVAL, + "Filename pattern must be UTF-8 on Windows."); return NULL; } no_matches: { - g_regex_unref (regex); + pattern_spec_free (pspec); g_dir_close (dir); g_set_error_literal (err, G_FILE_ERROR, G_FILE_ERROR_NOENT, "Found no files matching the pattern."); return NULL; } - } static gboolean diff --git a/gst/multifile/patternspec.c b/gst/multifile/patternspec.c new file mode 100644 index 0000000..59de8d1 --- /dev/null +++ b/gst/multifile/patternspec.c @@ -0,0 +1,334 @@ +/* GPattern copy that supports raw (non-utf8) matching + * based on: GLIB - Library of useful routines for C programming + * Copyright (C) 1995-1997, 1999 Peter Mattis, Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "patternspec.h" +#include <string.h> + +typedef enum +{ + MATCH_ALL, /* "*A?A*" */ + MATCH_ALL_TAIL, /* "*A?AA" */ + MATCH_HEAD, /* "AAAA*" */ + MATCH_TAIL, /* "*AAAA" */ + MATCH_EXACT, /* "AAAAA" */ + MATCH_LAST +} MatchType; + +struct _PatternSpec +{ + MatchMode match_mode; + MatchType match_type; + guint pattern_length; + guint min_length; + guint max_length; + gchar *pattern; +}; + +static inline gchar * +raw_strreverse (const gchar * str, gssize size) +{ + g_assert (size > 0); + return g_strreverse (g_strndup (str, size)); +} + +static inline gboolean +pattern_ph_match (const gchar * match_pattern, MatchMode match_mode, + const gchar * match_string, gboolean * wildcard_reached_p) +{ + register const gchar *pattern, *string; + register gchar ch; + + pattern = match_pattern; + string = match_string; + + ch = *pattern; + pattern++; + while (ch) { + switch (ch) { + case '?': + if (!*string) + return FALSE; + if (match_mode == MATCH_MODE_UTF8) + string = g_utf8_next_char (string); + else + ++string; + break; + + case '*': + *wildcard_reached_p = TRUE; + do { + ch = *pattern; + pattern++; + if (ch == '?') { + if (!*string) + return FALSE; + if (match_mode == MATCH_MODE_UTF8) + string = g_utf8_next_char (string); + else + ++string; + } + } + while (ch == '*' || ch == '?'); + if (!ch) + return TRUE; + do { + gboolean next_wildcard_reached = FALSE; + while (ch != *string) { + if (!*string) + return FALSE; + if (match_mode == MATCH_MODE_UTF8) + string = g_utf8_next_char (string); + else + ++string; + } + string++; + if (pattern_ph_match (pattern, match_mode, string, + &next_wildcard_reached)) + return TRUE; + if 
(next_wildcard_reached) + /* the forthcoming pattern substring up to the next wildcard has + * been matched, but a mismatch occurred for the rest of the + * pattern, following the next wildcard. + * there's no need to advance the current match position any + * further if the rest pattern will not match. + */ + return FALSE; + } + while (*string); + break; + + default: + if (ch == *string) + string++; + else + return FALSE; + break; + } + + ch = *pattern; + pattern++; + } + + return *string == 0; +} + +static gboolean +pattern_match (PatternSpec * pspec, guint string_length, + const gchar * string, const gchar * string_reversed) +{ + MatchMode match_mode; + + g_assert (pspec != NULL); + g_assert (string != NULL); + + if (string_length < pspec->min_length || string_length > pspec->max_length) + return FALSE; + + match_mode = pspec->match_mode; + if (match_mode == MATCH_MODE_AUTO) { + if (!g_utf8_validate (string, string_length, NULL)) + match_mode = MATCH_MODE_RAW; + else + match_mode = MATCH_MODE_UTF8; + } + + switch (pspec->match_type) { + gboolean dummy; + case MATCH_ALL: + return pattern_ph_match (pspec->pattern, match_mode, string, &dummy); + case MATCH_ALL_TAIL: + if (string_reversed) + return pattern_ph_match (pspec->pattern, match_mode, string_reversed, + &dummy); + else { + gboolean result; + gchar *tmp; + if (match_mode == MATCH_MODE_UTF8) { + tmp = g_utf8_strreverse (string, string_length); + } else { + tmp = raw_strreverse (string, string_length); + } + result = pattern_ph_match (pspec->pattern, match_mode, tmp, &dummy); + g_free (tmp); + return result; + } + case MATCH_HEAD: + if (pspec->pattern_length == string_length) + return memcmp (pspec->pattern, string, string_length) == 0; + else if (pspec->pattern_length) + return memcmp (pspec->pattern, string, pspec->pattern_length) == 0; + else + return TRUE; + case MATCH_TAIL: + if (pspec->pattern_length) + /* compare incl. 
NUL terminator */ + return memcmp (pspec->pattern, + string + (string_length - pspec->pattern_length), + pspec->pattern_length + 1) == 0; + else + return TRUE; + case MATCH_EXACT: + if (pspec->pattern_length != string_length) + return FALSE; + else + return memcmp (pspec->pattern, string, string_length) == 0; + default: + g_return_val_if_fail (pspec->match_type < MATCH_LAST, FALSE); + return FALSE; + } +} + +PatternSpec * +pattern_spec_new (const gchar * pattern, MatchMode match_mode) +{ + PatternSpec *pspec; + gboolean seen_joker = FALSE, seen_wildcard = FALSE, more_wildcards = FALSE; + gint hw_pos = -1, tw_pos = -1, hj_pos = -1, tj_pos = -1; + gboolean follows_wildcard = FALSE; + guint pending_jokers = 0; + const gchar *s; + gchar *d; + guint i; + + g_assert (pattern != NULL); + g_assert (match_mode != MATCH_MODE_UTF8 + || g_utf8_validate (pattern, -1, NULL)); + + /* canonicalize pattern and collect necessary stats */ + pspec = g_new (PatternSpec, 1); + pspec->match_mode = match_mode; + pspec->pattern_length = strlen (pattern); + pspec->min_length = 0; + pspec->max_length = 0; + pspec->pattern = g_new (gchar, pspec->pattern_length + 1); + + if (pspec->match_mode == MATCH_MODE_AUTO) { + if (!g_utf8_validate (pattern, -1, NULL)) + pspec->match_mode = MATCH_MODE_RAW; + } + + d = pspec->pattern; + for (i = 0, s = pattern; *s != 0; s++) { + switch (*s) { + case '*': + if (follows_wildcard) { /* compress multiple wildcards */ + pspec->pattern_length--; + continue; + } + follows_wildcard = TRUE; + if (hw_pos < 0) + hw_pos = i; + tw_pos = i; + break; + case '?': + pending_jokers++; + pspec->min_length++; + if (pspec->match_mode == MATCH_MODE_RAW) { + pspec->max_length += 1; + } else { + pspec->max_length += 4; /* maximum UTF-8 character length */ + } + continue; + default: + for (; pending_jokers; pending_jokers--, i++) { + *d++ = '?'; + if (hj_pos < 0) + hj_pos = i; + tj_pos = i; + } + follows_wildcard = FALSE; + pspec->min_length++; + pspec->max_length++; + break; + } 
+ *d++ = *s; + i++; + } + for (; pending_jokers; pending_jokers--) { + *d++ = '?'; + if (hj_pos < 0) + hj_pos = i; + tj_pos = i; + } + *d++ = 0; + seen_joker = hj_pos >= 0; + seen_wildcard = hw_pos >= 0; + more_wildcards = seen_wildcard && hw_pos != tw_pos; + if (seen_wildcard) + pspec->max_length = G_MAXUINT; + + /* special case sole head/tail wildcard or exact matches */ + if (!seen_joker && !more_wildcards) { + if (pspec->pattern[0] == '*') { + pspec->match_type = MATCH_TAIL; + memmove (pspec->pattern, pspec->pattern + 1, --pspec->pattern_length); + pspec->pattern[pspec->pattern_length] = 0; + return pspec; + } + if (pspec->pattern_length > 0 && + pspec->pattern[pspec->pattern_length - 1] == '*') { + pspec->match_type = MATCH_HEAD; + pspec->pattern[--pspec->pattern_length] = 0; + return pspec; + } + if (!seen_wildcard) { + pspec->match_type = MATCH_EXACT; + return pspec; + } + } + + /* now just need to distinguish between head or tail match start */ + tw_pos = pspec->pattern_length - 1 - tw_pos; /* last pos to tail distance */ + tj_pos = pspec->pattern_length - 1 - tj_pos; /* last pos to tail distance */ + if (seen_wildcard) + pspec->match_type = tw_pos > hw_pos ? MATCH_ALL_TAIL : MATCH_ALL; + else /* seen_joker */ + pspec->match_type = tj_pos > hj_pos ? 
MATCH_ALL_TAIL : MATCH_ALL; + if (pspec->match_type == MATCH_ALL_TAIL) { + gchar *tmp = pspec->pattern; + + if (pspec->match_mode == MATCH_MODE_RAW) { + pspec->pattern = raw_strreverse (pspec->pattern, pspec->pattern_length); + } else { + pspec->pattern = + g_utf8_strreverse (pspec->pattern, pspec->pattern_length); + } + g_free (tmp); + } + return pspec; +} + +void +pattern_spec_free (PatternSpec * pspec) +{ + g_assert (pspec != NULL); + + g_free (pspec->pattern); + g_free (pspec); +} + +gboolean +pattern_match_string (PatternSpec * pspec, const gchar * string) +{ + return pattern_match (pspec, strlen (string), string, NULL); +} diff --git a/gst/multifile/patternspec.h b/gst/multifile/patternspec.h new file mode 100644 index 0000000..c3e9436 --- /dev/null +++ b/gst/multifile/patternspec.h @@ -0,0 +1,47 @@ +/* GPattern copy that supports raw (non-utf8) matching + * based on: GLIB - Library of useful routines for C programming + * Copyright (C) 1995-1997, 1999 Peter Mattis, Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ */ + +#ifndef __PATTERN_SPEC_H__ +#define __PATTERN_SPEC_H__ + +#include <glib.h> + +G_BEGIN_DECLS + +typedef enum +{ + MATCH_MODE_AUTO = 0, + MATCH_MODE_UTF8, + MATCH_MODE_RAW +} MatchMode; + +typedef struct _PatternSpec PatternSpec; + +PatternSpec * pattern_spec_new (const gchar * pattern, + MatchMode match_mode); + +void pattern_spec_free (PatternSpec * pspec); + +gboolean pattern_match_string (PatternSpec * pspec, + const gchar * string); + +G_END_DECLS + +#endif /* __PATTERN_SPEC_H__ */ -- 2.7.4