From b47878f90385acd42588658a6dcf2c0edfae419f Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Mon, 23 May 2011 11:38:14 +0800
Subject: [PATCH] write validate k mixture model tool

---
 utils/training/Makefile.am                  |   9 +-
 utils/training/export_k_mixture_model.cpp   |   5 +-
 utils/training/validate_k_mixture_model.cpp | 186 ++++++++++++++++++++++++++++
 3 files changed, 197 insertions(+), 3 deletions(-)
 create mode 100644 utils/training/validate_k_mixture_model.cpp

diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am
index 0915479..a8b5478 100644
--- a/utils/training/Makefile.am
+++ b/utils/training/Makefile.am
@@ -35,7 +35,8 @@ noinst_PROGRAMS = gen_ngram \
 			  prune_k_mixture_model \
 			  import_k_mixture_model \
 			  export_k_mixture_model \
-			  k_mixture_model_to_interpolation
+			  k_mixture_model_to_interpolation \
+			  validate_k_mixture_model
 
 gen_ngram_SOURCES = gen_ngram.cpp
 
@@ -79,4 +80,8 @@ export_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
 
 k_mixture_model_to_interpolation_SOURCES = k_mixture_model_to_interpolation.cpp
 
-k_mixture_model_to_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
\ No newline at end of file
+k_mixture_model_to_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
+
+validate_k_mixture_model_SOURCES = validate_k_mixture_model.cpp
+
+validate_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp
index f42df96..b10ee43 100644
--- a/utils/training/export_k_mixture_model.cpp
+++ b/utils/training/export_k_mixture_model.cpp
@@ -48,7 +48,10 @@ bool print_k_mixture_model_array_headers(FILE * output,
     for (size_t i = 0; i < items->len; ++i) {
         phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
         KMixtureModelArrayHeader array_header;
-        bigram->get_array_header(*token, array_header);
+        /* Call outside assert() so the lookup also runs with NDEBUG. */
+        bool has_header = bigram->get_array_header(*token, array_header);
+        assert(has_header);
+        (void) has_header;
         char * phrase = taglib_token_to_string(phrase_index, *token);
         if ( phrase )
             fprintf(output, "\\item %s count %d\n", phrase, array_header.m_WC);
diff --git a/utils/training/validate_k_mixture_model.cpp b/utils/training/validate_k_mixture_model.cpp
new file mode 100644
index 0000000..3e5458d
--- /dev/null
+++ b/utils/training/validate_k_mixture_model.cpp
@@ -0,0 +1,186 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "pinyin.h"
+#include "k_mixture_model.h"
+
+/* Print the command line usage of this tool to stdout. */
+void print_help(){
+    printf("Usage: validate_k_mixture_model <k mixture model file>\n");
+}
+
+/*
+ * Validate the unigram level of the model: the word count stored in the
+ * magic header must be non-zero and equal to the sum of the word counts
+ * stored in the array headers of all items.
+ *
+ * Every problem found is reported on stderr.
+ * Returns true when the model passes, false otherwise.
+ */
+bool validate_unigram(KMixtureModelBigram * bigram){
+    KMixtureModelMagicHeader magic_header;
+    if ( !bigram->get_magic_header(magic_header) ){
+        fprintf(stderr, "no magic header in k mixture model.\n");
+        return false;
+    }
+
+    guint32 expected_sum = magic_header.m_WC;
+    if ( 0 == expected_sum ){
+        fprintf(stderr, "word count in magic header is unexpected zero.\n");
+        return false;
+    }
+
+    bool result = true;
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram->get_all_items(items);
+
+    guint32 sum = 0;
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+        KMixtureModelArrayHeader array_header;
+        /* Call outside assert() so the lookup also runs with NDEBUG. */
+        if ( !bigram->get_array_header(*token, array_header) ){
+            fprintf(stderr, "no array header for token %d.\n", *token);
+            result = false;
+            continue;
+        }
+        sum += array_header.m_WC;
+    }
+
+    if ( sum != expected_sum ){
+        fprintf(stderr, "word count in magic header:%u\n", expected_sum);
+        fprintf(stderr, "sum of word count in array headers:%u\n", sum);
+        fprintf(stderr, "the sum differs from word count.\n");
+        result = false;
+    }
+
+    /* Release the item list on the failure path as well. */
+    g_array_free(items, TRUE);
+    return result;
+}
+
+/*
+ * Validate the bigram level of the model: for every item, the word count
+ * in its array header must be non-zero and equal to the sum of the word
+ * counts of the entries stored in its array.
+ *
+ * All items are checked and every problem found is reported on stderr.
+ * Returns true when all items pass, false otherwise.
+ */
+bool validate_bigram(KMixtureModelBigram * bigram){
+    bool result = true;
+
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram->get_all_items(items);
+
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+        KMixtureModelSingleGram * single_gram = NULL;
+        /* Call outside assert() so the load also runs with NDEBUG. */
+        if ( !bigram->load(*token, single_gram) ){
+            fprintf(stderr, "failed to load single gram of token %d.\n", *token);
+            result = false;
+            continue;
+        }
+
+        FlexibleBigramPhraseArray array = g_array_new
+            (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        single_gram->retrieve_all(array);
+
+        KMixtureModelArrayHeader array_header;
+        if ( !single_gram->get_array_header(array_header) ){
+            fprintf(stderr, "no array header in single gram of token %d.\n", *token);
+            result = false;
+            g_array_free(array, TRUE);
+            delete single_gram;
+            continue;
+        }
+
+        guint32 expected_sum = array_header.m_WC;
+        if ( 0 == expected_sum ){
+            fprintf(stderr, "in the array header of token %d:\n", *token);
+            fprintf(stderr, "word count is unexpected zero.\n");
+            result = false;
+        }
+
+        guint32 sum = 0;
+        for (size_t m = 0; m < array->len; ++m){
+            KMixtureModelArrayItemWithToken * item = &g_array_index
+                (array, KMixtureModelArrayItemWithToken, m);
+            sum += item->m_item.m_WC;
+        }
+
+        if ( sum != expected_sum ){
+            fprintf(stderr, "word count in array header:%u\n", expected_sum);
+            fprintf(stderr, "sum of word count in array items:%u\n", sum);
+            fprintf(stderr, "the sum differs from word count.\n");
+            result = false;
+        }
+
+        /* Per-item temporaries: the array was allocated above, and the
+         * single gram handed out by load() is assumed to be caller-owned. */
+        g_array_free(array, TRUE);
+        delete single_gram;
+    }
+
+    g_array_free(items, TRUE);
+    return result;
+}
+
+/*
+ * Usage: validate_k_mixture_model <k mixture model file>
+ *
+ * Exits with 0 when the model is valid, EINVAL when no model file is
+ * given and ENODATA when validation fails.
+ */
+int main(int argc, char * argv[]){
+    const char * k_mixture_model_filename = NULL;
+
+    /* Advance the index on every round so each argument is consumed once. */
+    for (int i = 1; i < argc; ++i) {
+        if ( strcmp ("--help", argv[i]) == 0 ){
+            print_help();
+            exit(0);
+        } else {
+            k_mixture_model_filename = argv[i];
+        }
+    }
+
+    if ( NULL == k_mixture_model_filename ){
+        print_help();
+        exit(EINVAL);
+    }
+
+    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    bigram.attach(k_mixture_model_filename, ATTACH_READONLY);
+
+    if (!validate_unigram(&bigram)) {
+        fprintf(stderr, "k mixture model validation failed.\n");
+        exit(ENODATA);
+    }
+
+    if (!validate_bigram(&bigram)) {
+        fprintf(stderr, "k mixture model validation failed.\n");
+        exit(ENODATA);
+    }
+
+    return 0;
+}
-- 
2.7.4