src/lookup/phrase_lookup.h

   1 /*
   2  *  libpinyin
   3  *  Library to deal with pinyin.
   4  *
   5  *  Copyright (C) 2006-2007 Peng Wu
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20  */
  21
  22 #ifndef PHRASE_LOOKUP_H
  23 #define PHRASE_LOOKUP_H
  24
  25 #include "novel_types.h"
  26 #include "ngram.h"
  27 #include "lookup.h"
  28
  29 /**
  30  * phrase_lookup.h
  31  *
  32  * The definitions of phrase lookup related classes and structs.
  33  *
  34  */
  35
  36 namespace pinyin{
  37
   38 /**
   39  * PhraseLookup:
   40  *
   41  * The phrase lookup class to convert the sentence to phrase tokens.
   42  * get_best_match() segments a ucs4 sentence; convert_to_utf8() renders the tokens as utf8.
   43  */
   44 class PhraseLookup{
   45 private:
   46     const gfloat bigram_lambda;   /* NOTE(review): presumably the bi-gram weight derived from the constructor's lambda -- confirm in the .cpp */
   47     const gfloat unigram_lambda;  /* NOTE(review): presumably the complementary uni-gram weight -- confirm in the .cpp */
   48
   49     PhraseItem m_cache_phrase_item;     /* NOTE(review): looks like a scratch item reused between lookups -- confirm */
   50     SingleGram m_merged_single_gram;    /* NOTE(review): looks like a scratch single gram reused between lookups -- confirm */
   51 protected:
   52     /* Saved variables; same types as the constructor arguments (ownership is not visible in this header). */
   53     FacadePhraseTable2 * m_phrase_table;
   54     FacadePhraseIndex * m_phrase_index;
   55     Bigram * m_system_bigram;
   56     Bigram * m_user_bigram;
   57
   58     /* Internal per-step data structures. */
   59     GPtrArray * m_steps_index;
   60     /* Array of LookupStepIndex */
   61     GPtrArray * m_steps_content;
   62     /* Array of LookupStepContent */
   63
   64     /* Saved sentence (NOTE(review): presumably the arguments of get_best_match -- confirm) */
   65     int m_sentence_length;
   66     ucs4_t * m_sentence;
   67
   68 protected:
   69     /* Explicitly search the next phrase,
   70      *  to avoid double phrase lookup as the next token has only one.
   71      */
   72     bool search_unigram2(int nstep, PhraseTokens tokens);
   73     bool search_bigram2(int nstep, PhraseTokens tokens);
   74     /* NOTE(review): the helpers below are defined in the .cpp; their exact contracts are not visible here. */
   75     bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token);
   76     bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss);
   77
   78     bool save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_step);
   79     /* NOTE(review): judging by name and signature, fills @results after the last step -- confirm. */
   80     bool final_step(MatchResults & results);
   81 public:
   82     /**
   83      * PhraseLookup::PhraseLookup:
   84      * @lambda: the lambda parameter for interpolation model.
   85      * @phrase_table: the phrase table.
   86      * @phrase_index: the phrase index.
   87      * @system_bigram: the system bi-gram.
   88      * @user_bigram: the user bi-gram.
   89      *
   90      * The constructor of the PhraseLookup.
   91      * NOTE(review): whether the passed pointers are owned or only borrowed is not visible here.
   92      */
   93     PhraseLookup(const gfloat lambda,
   94                  FacadePhraseTable2 * phrase_table,
   95                  FacadePhraseIndex * phrase_index,
   96                  Bigram * system_bigram,
   97                  Bigram * user_bigram);
   98
   99     /**
  100      * PhraseLookup::~PhraseLookup:
  101      *
  102      * The destructor of the PhraseLookup.
  103      * NOTE(review): the class holds raw pointers and declares no copy constructor; check copy safety against the .cpp.
  104      */
  105     ~PhraseLookup();
  106
  107     /**
  108      * PhraseLookup::get_best_match:
  109      * @sentence_length: the length of the sentence in ucs4 characters.
  110      * @sentence: the ucs4 characters of the sentence.
  111      * @results: the segmented sentence in the form of phrase tokens.
  112      * @returns: whether the segment operation is successful.
  113      *
  114      * Segment the sentence into phrase tokens.
  115      *
  116      * Note: this method only accepts the characters in the phrase large table.
  117      *
  118      */
  119     bool get_best_match(int sentence_length, ucs4_t sentence[], MatchResults & results);
  120
  121     /**
  122      * PhraseLookup::convert_to_utf8:
  123      * @results: the guessed sentence in the form of phrase tokens.
  124      * @result_string: the converted sentence in utf8 string.
  125      * @returns: whether the convert operation is successful.
  126      *
  127      * Convert the sentence from phrase tokens to the utf8 string.
  128      * Implemented by forwarding to pinyin::convert_to_utf8() with m_phrase_index.
  129      * Note: free the result_string by g_free.
  130      *
  131      */
  132     bool convert_to_utf8(MatchResults results,
  133                          /* out */ char * & result_string)
  134     {
  135         return pinyin::convert_to_utf8(m_phrase_index, results,
  136                                        "\n", true, result_string);
  137     }
  138 };
 139
 140 };
 141
 142 #endif