improve prune k mixture model
author Peng Wu <alexepico@gmail.com>
Wed, 27 Jul 2011 15:23:57 +0000 (23:23 +0800)
committer Peng Wu <alexepico@gmail.com>
Wed, 27 Jul 2011 15:25:51 +0000 (23:25 +0800)
utils/training/k_mixture_model.h
utils/training/prune_k_mixture_model.cpp

index 50218c2d519a5c932b17e94c052f250b00e2aa83..2d9816b9f584ce64c668230c345ea7527c9d3460 100644 (file)
@@ -54,6 +54,7 @@ static inline parameter_t compute_B(corpus_count_t N,
                                     corpus_count_t T,
                                     corpus_count_t n_0,
                                     corpus_count_t n_1){
+    /* Note: re-check this to see whether the if statement can be removed. */
     /* Please consider B_2 is no less than 2 in paper. */
     if ( 0 == T - n_1 && 0 == N - n_0 - n_1 )
         return 2;
index 8ae69928c0b3364ae190647304ee613904c16dce..f4bad652b70f2a49df93b6cf423d9d2fd875cfbe 100644 (file)
@@ -52,7 +52,16 @@ bool prune_k_mixture_model(KMixtureModelMagicHeader * magic_header,
                  item->m_item.m_n_1);
         }
 
-        assert(remained_poss >= 0);
+        /* remained_poss must not be negative: report the values and exit. */
+        if (remained_poss < 0) {
+            fprintf(stderr, "wrong remained possibility is found.\n");
+            fprintf(stderr, "k:%d N:%d WC:%d n_0:%d n_1:%d\n",
+                    g_prune_k, magic_header->m_N, item->m_item.m_WC,
+                    magic_header->m_N - item->m_item.m_N_n_0,
+                    item->m_item.m_n_1);
+            exit(EDOM);
+        }
+
         if ( remained_poss < g_prune_poss ) {
             /* prune this word or phrase. */
             KMixtureModelArrayItem removed_item;
@@ -114,7 +123,14 @@ int main(int argc, char * argv[]){
     GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
     bigram.get_all_items(items);
 
+    /* print prune progress */
+    size_t progress = 0; size_t onestep = items->len / 20;
     for ( size_t i = 0; i < items->len; ++i ){
+        if ( progress >= onestep ) {
+            progress = 0; printf("*");
+        }
+        progress ++;
+
         phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
         KMixtureModelSingleGram * single_gram = NULL;
         bigram.load(*token, single_gram);
@@ -142,6 +158,8 @@ int main(int argc, char * argv[]){
         removed_array = NULL;
     }
 
+    printf("\n");
+
     bigram.set_magic_header(magic_header);
 
     /* post processing clean up zero items */