import from pinyin.
author Peng Wu <alexepico@gmail.com>
Tue, 3 Aug 2010 02:42:47 +0000 (10:42 +0800)
committer Peng Wu <alexepico@gmail.com>
Tue, 3 Aug 2010 02:42:47 +0000 (10:42 +0800)
46 files changed:
AUTHORS [new file with mode: 0644]
COPYING [new file with mode: 0644]
Makefile.am [new file with mode: 0644]
autogen.sh [new file with mode: 0755]
configure.ac [new file with mode: 0644]
src/Makefile.am [new file with mode: 0644]
src/include/Makefile.am [new file with mode: 0644]
src/include/memory_chunk.h [new file with mode: 0755]
src/include/novel_types.h [new file with mode: 0755]
src/include/stl_lite.h [new file with mode: 0644]
src/lookup/Makefile.am [new file with mode: 0644]
src/lookup/lookup.h [new file with mode: 0644]
src/lookup/pinyin_lookup.cpp [new file with mode: 0644]
src/lookup/winner_tree.cpp [new file with mode: 0644]
src/lookup/winner_tree.h [new file with mode: 0644]
src/segment/Makefile.am [new file with mode: 0644]
src/segment/mmseg.cpp [new file with mode: 0644]
src/storage/Makefile.am [new file with mode: 0644]
src/storage/ngram.cpp [new file with mode: 0644]
src/storage/ngram.h [new file with mode: 0644]
src/storage/phrase_index.cpp [new file with mode: 0644]
src/storage/phrase_index.h [new file with mode: 0755]
src/storage/pinyin_base.cpp [new file with mode: 0644]
src/storage/pinyin_base.h [new file with mode: 0644]
src/storage/pinyin_large_table.cpp [new file with mode: 0644]
src/storage/pinyin_large_table.h [new file with mode: 0755]
src/storage/pinyin_phrase.h [new file with mode: 0644]
src/storage/pinyin_zhuyin_map_data.h [new file with mode: 0644]
src/training/Makefile.am [new file with mode: 0644]
src/training/estimate_interpolation.cpp [new file with mode: 0644]
src/training/gen_ngram.cpp [new file with mode: 0644]
src/training/gen_unigram.cpp [new file with mode: 0644]
tests/Makefile.am [new file with mode: 0644]
tests/include/Makefile.am [new file with mode: 0644]
tests/include/test_memory_chunk.cpp [new file with mode: 0755]
tests/lookup/Makefile.am [new file with mode: 0644]
tests/lookup/test_simple_lookup.cpp [new file with mode: 0644]
tests/storage/Makefile.am [new file with mode: 0644]
tests/storage/test_ngram.cpp [new file with mode: 0644]
tests/storage/test_parser.cpp [new file with mode: 0644]
tests/storage/test_phrase_index.cpp [new file with mode: 0644]
tests/storage/test_pinyin_index.cpp [new file with mode: 0644]
utils/Makefile.am [new file with mode: 0644]
utils/storage/Makefile.am [new file with mode: 0644]
utils/storage/gen_binary_files.cpp [new file with mode: 0644]
utils/storage/gen_pinyin_table.cpp [new file with mode: 0644]

diff --git a/AUTHORS b/AUTHORS
new file mode 100644 (file)
index 0000000..4d1c44e
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,2 @@
+James Su 2002,2003,2006 <suzhe@tsinghua.edu.cn>
+Peng Wu 2006-2007 <alexepico@gmail.com>
diff --git a/COPYING b/COPYING
new file mode 100644 (file)
index 0000000..d511905
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,339 @@
+                   GNU GENERAL PUBLIC LICENSE
+                      Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                           Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                   GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                           NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                    END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/Makefile.am b/Makefile.am
new file mode 100644 (file)
index 0000000..0dd9b9e
--- /dev/null
@@ -0,0 +1,35 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+EXTRA_DIST             = config.rpath m4/ChangeLog  COPYING \
+                         intltool-extract.in \
+                         intltool-update.in \
+                         intltool-merge.in 
+
+AUTOMAKE_OPTIONS       = gnu
+SUBDIRS                = po intl src utils modules tests data
+
+MAINTAINERCLEANFILES   = Makefile.in 
+
+CLEANFILES             = *.bak \
+                          intltool-extract \
+                          intltool-merge \
+                          intltool-update
+
+
+ACLOCAL                        = aclocal -I .
+
diff --git a/autogen.sh b/autogen.sh
new file mode 100755 (executable)
index 0000000..cb8d4ae
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+aclocal
+
+libtoolize --force
+
+autoheader
+
+automake -a
+
+autoconf
+
+./configure --enable-tests
diff --git a/configure.ac b/configure.ac
new file mode 100644 (file)
index 0000000..1464c43
--- /dev/null
@@ -0,0 +1,143 @@
+#                                               -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+
+AC_PREREQ(2.60)
+AC_INIT(novel-pinyin, 0.2.5, alexepico@gmail.com)
+AM_INIT_AUTOMAKE
+AC_CONFIG_SRCDIR([config.h.in])
+AC_CONFIG_HEADER([config.h])
+
+SCIM_VERSION=1.2.0
+AC_SUBST(SCIM_VERSION)
+
+# Checks for programs.
+AC_PROG_CXX
+AC_PROG_CC
+AC_PROG_CPP
+AC_PROG_INSTALL
+AC_PROG_LN_S
+AC_PROG_MAKE_SET
+
+AC_GNU_SOURCE
+
+# Init libtool
+AC_LIBTOOL_WIN32_DLL
+AC_LIBTOOL_DLOPEN
+AC_PROG_LIBTOOL
+AC_LIB_LTDL
+AC_SUBST(LIBTOOL_DEPS)
+
+# Init gettext (keep the space before the '#': without it the shell appends the disabled locales to ALL_LINGUAS)
+ALL_LINGUAS="zh_CN zh_TW" #"ko ja de fr it"
+AM_GNU_GETTEXT
+
+# Check intltool
+#AC_PROG_INTLTOOL
+IT_PROG_INTLTOOL([0.33], [no-xml])
+
+GETTEXT_PACKAGE="novel-pinyin"
+AC_SUBST(GETTEXT_PACKAGE)
+
+# libtool option to control which symbols are exported
+# right now, symbols starting with _ are not exported
+LIBTOOL_EXPORT_OPTIONS='-export-symbols-regex "^[[^_]].*"'
+AC_SUBST(LIBTOOL_EXPORT_OPTIONS)
+
+# Checks for libraries.
+PKG_CHECK_MODULES(GLIB2, [glib-2.0 >= 2.4.0])
+
+GLIB2_CPPFLAGS=`$PKG_CONFIG --cflags glib-2.0`
+
+AC_SUBST(GLIB2_CPPFLAGS)
+
+GLIB2_LDFLAGS=`$PKG_CONFIG --libs glib-2.0`
+
+AC_SUBST(GLIB2_LDFLAGS)
+
+# Check SCIM
+PKG_CHECK_MODULES(SCIM,[scim >= $SCIM_VERSION])
+
+# Check if we should build setup module
+PKG_CHECK_MODULES(SCIM_GTKUTILS,[scim-gtkutils >= $SCIM_VERSION],
+                                [SCIM_HAS_GTKUTILS=yes],
+                                [SCIM_HAS_GTKUTILS=no])
+
+if test "$SCIM_HAS_GTKUTILS" = "yes"; then
+  SCIM_BUILD_SETUP=1
+else
+  SCIM_BUILD_SETUP=0
+fi
+
+AM_CONDITIONAL(SCIM_BUILD_SETUP, [test "$SCIM_HAS_GTKUTILS" = "yes"])
+AC_SUBST(SCIM_BUILD_SETUP)
+
+
+SCIM_ICONDIR=`$PKG_CONFIG --variable=icondir scim`
+SCIM_MODULEDIR=`$PKG_CONFIG --variable=moduledir scim`
+SCIM_DATADIR=`$PKG_CONFIG --variable=scimdatadir scim`
+
+if test "x$SCIM_ICONDIR" = "x"; then
+  SCIM_ICONDIR=${datadir}/scim/icons
+fi
+
+if test "x$SCIM_MODULEDIR" = "x"; then
+  SCIM_MODULEDIR=${libdir}/scim-1.0
+fi
+
+if test "x$SCIM_DATADIR" = "x"; then
+  SCIM_DATADIR=${datadir}/scim
+fi
+
+AC_SUBST(SCIM_ICONDIR)
+AC_SUBST(SCIM_MODULEDIR)
+AC_SUBST(SCIM_DATADIR)
+
+NOVEL_PINYIN_DATADIR=$SCIM_DATADIR/novel-pinyin
+
+AC_SUBST(NOVEL_PINYIN_DATADIR)
+
+# Checks for header files.
+AC_HEADER_STDC
+AC_CHECK_HEADERS([locale.h stdlib.h string.h sys/time.h unistd.h])
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_HEADER_STDBOOL
+AC_C_CONST
+AC_C_INLINE
+AC_TYPE_SIZE_T
+AC_HEADER_TIME
+
+# Checks for library functions.
+AC_FUNC_MALLOC
+AC_FUNC_MEMCMP
+AC_FUNC_REALLOC
+AC_FUNC_STAT
+AC_CHECK_FUNCS([gettimeofday memmove memset setlocale])
+
+AC_CHECK_HEADERS([libintl.h string.h])
+
+AC_CHECK_HEADER([db.h], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 4]))
+
+AC_SEARCH_LIBS([db_create], [db], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 4]))
+
+
+AC_CONFIG_FILES([Makefile 
+                po/Makefile.in
+                intl/Makefile
+                data/Makefile
+                src/Makefile
+                 src/include/Makefile
+                 src/storage/Makefile
+                src/segment/Makefile
+                src/training/Makefile
+                src/lookup/Makefile
+                modules/Makefile
+                modules/scim/Makefile
+                tests/Makefile
+                 tests/include/Makefile
+                 tests/storage/Makefile
+                tests/lookup/Makefile
+                utils/Makefile
+                 utils/storage/Makefile])
+AC_OUTPUT
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644 (file)
index 0000000..59e009f
--- /dev/null
@@ -0,0 +1,25 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+AUTOMAKE_OPTIONS       = gnu
+SUBDIRS                = include storage segment training lookup
+
+MAINTAINERCLEANFILES   = Makefile.in 
+
+CLEANFILES             = *.bak 
+
+ACLOCAL                        = aclocal -I $(ac_aux_dir)
diff --git a/src/include/Makefile.am b/src/include/Makefile.am
new file mode 100644 (file)
index 0000000..bb605ee
--- /dev/null
@@ -0,0 +1,22 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+MAINTAINERCLEANFILES    = Makefile.in
+
+noinst_HEADERS          = memory_chunk.h \
+                         novel_types.h \
+                         stl_lite.h
diff --git a/src/include/memory_chunk.h b/src/include/memory_chunk.h
new file mode 100755 (executable)
index 0000000..3571256
--- /dev/null
@@ -0,0 +1,264 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef MEMORY_CHUNK_H
+#define MEMORY_CHUNK_H
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "stl_lite.h"
+
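+/*  Ownership modes, selected by m_free_func:
+ *  m_free_func == free  : the memory was allocated by malloc and is
+ *                         managed (resized and freed) by the chunk.
+ *  m_free_func == NULL  : unmanaged mode; the memory is a small portion
+ *                         of an area allocated elsewhere and is never
+ *                         freed by the chunk.
+ *  m_free_func == other : the memory is released with that function; when
+ *                         the chunk has to grow, the content is copied
+ *                         into a malloc'ed area and the original is freed.
+ */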
+
+class MemoryChunk{
+    typedef   void (* free_func_t)(void *);
+private:
+    char * m_data_begin;
+    char * m_data_end; //one data pass the end.
+    char * m_allocated; //one data pass the end.
+    free_func_t m_free_func;
+    
+private:
+    void reset(){
+       if ( m_free_func )
+           (*m_free_func)(m_data_begin);
+       m_data_begin = NULL;
+       m_data_end = NULL;
+       m_allocated = NULL;
+       m_free_func = NULL;
+    }
+    
+    void ensure_has_space(size_t new_size){
+       int delta_size = m_data_begin + new_size - m_data_end;
+       if ( delta_size <= 0 ) return;
+       ensure_has_more_space ( delta_size );
+    }
+    
+    /* enlarge function */
+    void ensure_has_more_space(size_t extra_size){
+       if ( 0 == extra_size ) return;
+       size_t newsize;
+       size_t cursize = size();
+       if ( m_free_func != free ) {
+           /* copy on resize */
+           newsize = cursize + extra_size;
+           /* do the copy */
+           char * tmp = (char *) malloc(newsize);
+           assert(tmp);
+           memset(tmp, 0, newsize);
+           memmove(tmp, m_data_begin, cursize);
+           /* free the origin memory */
+           if ( m_free_func){
+               (*m_free_func)(m_data_begin);
+           }
+           
+           /* change varibles */
+           m_data_begin = tmp;
+           m_data_end = m_data_begin + cursize;
+           m_allocated = m_data_begin + newsize;
+           m_free_func = free;
+           return;
+       }
+       /* the memory area is managed by this memory chunk */
+       if ( extra_size <= (size_t) (m_allocated - m_data_end))
+           return;
+       newsize = std_lite::max( capacity()<<1, cursize + extra_size);
+       m_data_begin = (char *) realloc(m_data_begin, newsize);
+       assert(m_data_begin);
+       memset(m_data_begin + cursize, 0, newsize - cursize);
+       m_data_end = m_data_begin + cursize;
+       m_allocated = m_data_begin + newsize;
+       return;
+    }
+    
+public:
+    /* constructors */
+    MemoryChunk(){
+       m_data_begin = NULL;
+       m_data_end = NULL;
+       m_allocated = NULL;
+       m_free_func = NULL;
+    }
+    
+    /* destructors */
+    ~MemoryChunk(){
+       reset();
+    }
+
+    /* read access method */
+    void* begin() const{
+       return m_data_begin;
+    }
+    
+    void* end() const{
+      return m_data_end;
+    }
+
+    size_t size(){
+       return m_data_end - m_data_begin;
+    }
+    
+    void set_size(size_t newsize){
+       ensure_has_space(newsize);
+       m_data_end = m_data_begin + newsize;
+    }
+    
+    size_t capacity(){
+       return m_allocated - m_data_begin;
+    }
+  
+    /* 
+     *  Transfer management of a memory chunk allocated by other part system
+     *  to the memory chunk.
+     */
+    void set_chunk(void* begin, size_t length, free_func_t free_func){
+       if ( m_free_func )
+           m_free_func( m_data_begin );
+       
+       m_data_begin = (char *) begin;
+       m_data_end = (char *) m_data_begin + length;
+       m_allocated = (char *) m_data_begin + length;
+       m_free_func = free_func;
+    }
+  
+    /* subchunk
+     * use set_buffer internally.
+     * new chunk need to be deleted.
+     */
+    MemoryChunk * get_sub_chunk(size_t offset, size_t length){
+       MemoryChunk * retval = new MemoryChunk();
+       char * begin_pos = m_data_begin + offset;
+       retval->set_chunk(begin_pos, length, NULL);
+       return retval;
+    }
+    /* write function
+     * Data are written directly to the memory area.
+     */
+    bool set_content(size_t offset, const void * data, size_t len){
+       size_t cursize = std_lite::max(size(), offset + len);
+       ensure_has_space(offset + len);
+       memmove(m_data_begin + offset, data, len);
+       m_data_end = m_data_begin + cursize;
+       return true;
+    }
+    /* insert function
+     * Data are written to the memory area,
+     * the original content are moved towards the rear.
+     * parameter offset start from zero.
+     */
+    bool insert_content(size_t offset, const void * data, size_t length){
+       ensure_has_more_space(length);
+       size_t move_size = size() - offset;
+       memmove(m_data_begin + offset + length, m_data_begin + offset, move_size);
+       memmove(m_data_begin + offset, data, length);
+       m_data_end += length;
+       return true;
+    }
+    /* remove function
+     * Data are removed directly,
+     * the following content are moved towards the front.
+     */
+    bool remove_content(size_t offset, size_t length){
+       size_t move_size = size() - offset - length;
+       memmove(m_data_begin + offset, m_data_begin + offset + length, move_size);
+       m_data_end -= length;
+       return true;
+    }
+
+    /* get_content function
+     * Get the binary data
+     */
+    bool get_content(size_t offset, void * buffer, size_t length){
+       if ( size() < offset + length )
+           return false;
+       memcpy( buffer, m_data_begin + offset, length);
+       return true;
+    }
+
+    /* compact memory, reduce the size */
+    void compact_memory(){
+       if ( m_free_func != free )
+           return;
+       size_t newsize = size();
+       m_data_begin = (char *) realloc(m_data_begin, newsize);
+       m_allocated = m_data_begin + newsize;
+    }
+  
+    /* file storage  functions */
+    bool load(const char * filename){
+       /* free old data */
+       reset();
+
+       struct stat stat_buf;
+
+       int retval = stat(filename, &stat_buf);
+    
+       if ( retval )
+           return false;
+       
+       FILE* file = fopen(filename, "r");
+       if ( !file )
+           return false;
+       int data_len = stat_buf.st_size;
+       void* data = malloc(data_len);
+       if ( !data ){
+           fclose(file);
+           return false;      
+       }
+       
+       data_len = fread(data, 1, data_len, file);
+       set_chunk(data, data_len, free);
+        //Fixes memory chunk end. 
+        if ( stat_buf.st_size > data_len )
+          m_allocated = (char *) m_data_begin + stat_buf.st_size;
+       fclose(file);
+       return true;
+    }
+
+    bool save(const char * filename){
+       FILE* file = fopen(filename, "w");
+       if ( !file )
+           return false;
+
+       size_t data_len = fwrite(begin(), 1, size(), file);
+       if ( data_len != size()){
+           fclose(file);
+           return false;
+       }
+
+       fsync(fileno(file));
+       fclose(file);
+       return true;
+    }
+};
+
+#endif
diff --git a/src/include/novel_types.h b/src/include/novel_types.h
new file mode 100755 (executable)
index 0000000..a992e8e
--- /dev/null
@@ -0,0 +1,117 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef NOVEL_TYPES_H
+#define NOVEL_TYPES_H
+
+#include <limits.h>
+#include <glib.h>
+
+typedef guint32 phrase_token_t; /* 32-bit phrase identifier; its bit layout is given by the PHRASE_* masks */
+typedef gunichar2 utf16_t;      /* one UTF-16 code unit (glib gunichar2) */
+
+/*
+ *  Phrase Index Library Definition
+ *
+ *  Layout of a phrase token:
+ *      bits 28-31 : reserved for future usage (4 bits)
+ *      bits 24-27 : index of the phrase library the phrase belongs to
+ *      bits  0-23 : number of the phrase inside that library
+ *
+ *  PHRASE_INDEX_LIBRARY_INDEX(token) extracts the library index,
+ *  PHRASE_INDEX_MAKE_TOKEN(phrase_index, token) combines a library index
+ *  with the low 24 bits of token.  Every macro argument is parenthesized,
+ *  so expressions such as "a | b" can be passed safely.
+ */
+
+#define PHRASE_MASK  0x00FFFFFF
+#define PHRASE_INDEX_LIBRARY_MASK 0x0F000000
+#define PHRASE_INDEX_LIBRARY_COUNT (1<<4)
+#define PHRASE_INDEX_LIBRARY_INDEX(token) (((token)&PHRASE_INDEX_LIBRARY_MASK)>>24)
+#define PHRASE_INDEX_MAKE_TOKEN(phrase_index, token) \
+    ( ( ((phrase_index)<<24) & PHRASE_INDEX_LIBRARY_MASK)|((token) & PHRASE_MASK))
+
+
+/* 
+ *  PhraseIndexRanges definitions
+ *
+ *  PhraseIndexRange is a half-open interval [m_range_begin, m_range_end)
+ *  of phrase tokens.  PhraseIndexRanges holds one GArray of such ranges
+ *  for each of the PHRASE_INDEX_LIBRARY_COUNT phrase libraries
+ *  (NOTE(review): presumably indexed by the library index -- confirm).
+ */
+
+struct PhraseIndexRange{
+      phrase_token_t m_range_begin; /* first token of the range */
+      phrase_token_t m_range_end; /* one past the last token, like an STL end iterator */
+};
+
+/* Array of PhraseIndexRange */
+typedef GArray * PhraseIndexRanges[PHRASE_INDEX_LIBRARY_COUNT];
+
+/* 
+ *  PinYin Table Definition
+ *  MemoryChunk is only forward declared here; the class itself is
+ *  defined in memory_chunk.h.
+ */
+class MemoryChunk;
+
+
+/* Result of a search, used for both the PinYin Table and the Phrase Table.
+ * NOTE(review): SEARCH_OK and SEARCH_CONTINUED are distinct bits, so they
+ * can presumably be OR-ed together -- confirm against the callers. */
+enum SearchResult{
+    SEARCH_NONE = 0x00,           /* found nothing */
+    SEARCH_OK = 0x01 ,            /* found items */
+    SEARCH_CONTINUED = 0x02       /* the storage has longer words left to search */
+};
+
+enum AddIndexResult{           /* outcome of adding an item to an index */
+    INSERT_OK = 0 ,            /* the item was inserted */
+    INSERT_ITEM_EXISTS         /* the item already exists */
+};
+
+enum RemoveIndexResult{        /* outcome of removing an item from an index */
+    REMOVE_OK = 0,             /* the item was removed */
+    REMOVE_ITEM_DONOT_EXISTS   /* the item does not exist */
+};
+/*
+ *  n-gram Definition
+ *  There is no B parameter (there are duplicated items in the uni-gram
+ *  and the bi-gram).
+ *  Used in both the system n-gram and the user n-gram.
+ *  Uses the delta technique.
+ *
+ *  BigramPhraseItem pairs a phrase token with a frequency value.
+ */
+
+struct BigramPhraseItem{
+  phrase_token_t m_token; /* the phrase (W2) */
+  gfloat         m_freq; /* P(W2|W1) */
+};
+
+typedef GArray * BigramPhraseArray; /* Array of BigramPhraseItem -- NOTE(review): this comment used to name HighLevelPhraseItem, which is not defined here; confirm */
+
+/* 
+ *  n-gram Definition
+ *  n-gram library
+ *
+ *  AttachOption: going by the names, the mode in which an n-gram library
+ *  file is attached (new file, read, read and write).
+ *  NOTE(review): ATTACH_READ_WRITE (3) equals
+ *  ATTACH_NEW_FILE (1) | ATTACH_READ (2); whether the values are meant to
+ *  be combined as bit flags is not visible here -- confirm.
+ */
+
+enum AttachOption{
+  ATTACH_NEW_FILE = 1,
+  ATTACH_READ = 2,
+  ATTACH_READ_WRITE = 3
+};
+
+#define MAX_PHRASE_LENGTH 16 /* NOTE(review): presumably counted in characters -- confirm */
+
+const phrase_token_t sentence_start = 1; /* token value 1; by its name it marks the start of a sentence */
+const phrase_token_t token_min = 0; /* smallest phrase_token_t value */
+const phrase_token_t token_max = UINT_MAX; /* largest value; matches guint32 only where unsigned int is 32 bits wide */
+
+const char c_separate = '#'; /* NOTE(review): separator character; what it separates is not visible here */
+typedef guint32 table_offset_t; /* 32-bit offset type */
+
+typedef double parameter_t;
+
+#define LAMBDA_PARAMETER 0.588792 /* PinyinLookup uses this as bigram_lambda and 1 - this as unigram_lambda */
+
+#endif
diff --git a/src/include/stl_lite.h b/src/include/stl_lite.h
new file mode 100644 (file)
index 0000000..0612782
--- /dev/null
@@ -0,0 +1,285 @@
+#ifndef STL_LITE_H
+#define STL_LITE_H
+
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace std_lite{
+
+  /**
+   *  @brief Picks the smaller of two values.
+   *  @param  a  The first candidate.
+   *  @param  b  The second candidate.
+   *  @return   A reference to @a b when @a b compares less than @a a,
+   *            otherwise a reference to @a a (so @a a wins ties).
+   *
+   *  Only @c operator< is required of the type.  Being a function and not
+   *  a preprocessor macro, each argument expression is evaluated once.
+  */
+  template<typename _Tp>
+    inline const _Tp&
+    min(const _Tp& __a, const _Tp& __b)
+    {
+      return (__b < __a) ? __b : __a;
+    }
+
+
+  /**
+   *  @brief Picks the larger of two values.
+   *  @param  a  The first candidate.
+   *  @param  b  The second candidate.
+   *  @return   A reference to @a b when @a a compares less than @a b,
+   *            otherwise a reference to @a a (so @a a wins ties).
+   *
+   *  Only @c operator< is required of the type.  Being a function and not
+   *  a preprocessor macro, each argument expression is evaluated once.
+  */
+  template<typename _Tp>
+    inline const _Tp&
+    max(const _Tp& __a, const _Tp& __b)
+    {
+      return (__a < __b) ? __b : __a;
+    }
+
+  /**
+   *  Base class for binary functors.  It has no data and no behaviour; it
+   *  only publishes the two argument types and the result type as nested
+   *  typedefs, in the manner of the standard library's binary_function.
+   */
+  template <class _Arg1, class _Arg2, class _Result>
+    struct binary_function
+    {
+      typedef _Arg1 first_argument_type;   ///< the type of the first argument
+                                           ///  (no surprises here)
+
+      typedef _Arg2 second_argument_type;  ///< the type of the second argument
+      typedef _Result result_type;         ///< the type of the return value
+    };
+  /** @}  */
+
+  /// pair holds two objects of arbitrary type as the public members first and second.
+  template<class _T1, class _T2>
+    struct pair
+    {
+      typedef _T1 first_type;    ///<  @c first_type is the first bound type
+      typedef _T2 second_type;   ///<  @c second_type is the second bound type
+
+      _T1 first;                 ///< @c first is a copy of the first object
+      _T2 second;                ///< @c second is a copy of the second object
+
+      // _GLIBCXX_RESOLVE_LIB_DEFECTS
+      // 265.  std::pair::pair() effects overly restrictive
+      /** The default constructor value-initialises @c first and @c second
+       *  (default constructors for classes, zero for built-in types).  */
+      pair()
+      : first(), second() { }
+
+      /** Two objects may be passed to a @c pair constructor to be copied.  */
+      pair(const _T1& __a, const _T2& __b)
+      : first(__a), second(__b) { }
+
+      /** Converting constructor: builds the pair from a pair of other member types, converting each member.  */
+      template<class _U1, class _U2>
+        pair(const pair<_U1, _U2>& __p)
+       : first(__p.first), second(__p.second) { }
+    };
+
+  /// Equality of two pairs of the same type: both @c first members and both
+  /// @c second members have to compare equal (the @c second members are
+  /// only compared when the @c first members are equal).
+  template<class _T1, class _T2>
+    inline bool
+    operator==(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
+    {
+      if (!(__x.first == __y.first))
+        return false;
+      return (__x.second == __y.second) ? true : false;
+    }
+
+  /// Lexicographical ordering of two pairs: @c first decides, @c second
+  /// breaks ties.  Only @c operator< of the member types is used.
+  /// <http://gcc.gnu.org/onlinedocs/libstdc++/20_util/howto.html#pairlt>
+  template<class _T1, class _T2>
+    inline bool
+    operator<(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
+    {
+      if (__x.first < __y.first)
+        return true;
+      if (__y.first < __x.first)
+        return false;
+      // the first members are equivalent, the second members decide
+      return (__x.second < __y.second) ? true : false;
+    }
+
+  /// Inequality of two pairs, defined as the negation of @c operator==.
+  template<class _T1, class _T2>
+    inline bool
+    operator!=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
+    {
+      if (__x == __y)
+        return false;
+      return true;
+    }
+
+  /// Greater-than of two pairs, defined as @c operator< with the
+  /// arguments swapped.
+  template<class _T1, class _T2>
+    inline bool
+    operator>(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
+    {
+      if (__y < __x)
+        return true;
+      return false;
+    }
+
+  /// Less-or-equal of two pairs: true unless the right pair is less than
+  /// the left one (computed with @c operator< only).
+  template<class _T1, class _T2>
+    inline bool
+    operator<=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
+    {
+      if (__y < __x)
+        return false;
+      return true;
+    }
+
+  /// Greater-or-equal of two pairs: true unless the left pair is less than
+  /// the right one (computed with @c operator< only).
+  template<class _T1, class _T2>
+    inline bool
+    operator>=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
+    {
+      if (__x < __y)
+        return false;
+      return true;
+    }
+
+  /**
+   *  @brief Builds a pair from two objects, deducing the member types.
+   *  @param  x  The object copied into @c first.
+   *  @param  y  The object copied into @c second.
+   *  @return   A pair<> whose member types are the argument types.
+   *
+   *  The arguments are taken by value, as resolved by LWG issue #181,
+   *  and not by reference-to-const as the original standard text required.
+   */
+  // _GLIBCXX_RESOLVE_LIB_DEFECTS
+  // 181.  make_pair() unintended behavior
+  template<class _T1, class _T2>
+    inline pair<_T1, _T2>
+    make_pair(_T1 __x, _T2 __y)
+    {
+      typedef pair<_T1, _T2> __result_type;
+      return __result_type(__x, __y);
+    }
+
+  /**
+   *  @brief Finds the first position in which @a val could be inserted
+   *         without changing the ordering.
+   *  @param  first   An iterator.
+   *  @param  last    Another iterator.
+   *  @param  val     The search term.
+   *  @param  comp    A functor to use for comparisons.
+   *  @return  An iterator pointing to the first element "not less than" @a val,
+   *           or end() if every element is less than @a val.
+   *  @ingroup binarysearch
+   *
+   *  The comparison function should have the same effects on ordering as
+   *  the function used for the initial sort.
+  */
+  template<typename _ForwardIterator, typename _Tp, typename _Compare>
+    _ForwardIterator
+    lower_bound(_ForwardIterator __first, _ForwardIterator __last,
+               const _Tp& __val, _Compare __comp)
+    {
+      typedef size_t _DistanceType;
+
+      _DistanceType __len = __last - __first;
+      _DistanceType __half;
+      _ForwardIterator __middle;
+
+      while (__len > 0)
+       {
+         __half = __len >> 1;
+         __middle = __first;
+         __middle += __half;
+         if (__comp(*__middle, __val))
+           {
+             __first = __middle;
+             ++__first;
+             __len = __len - __half - 1;
+           }
+         else
+           __len = __half;
+       }
+      return __first;
+    }
+
+  /**
+   *  @brief Finds the last position in which @a val could be inserted
+   *         without changing the ordering.
+   *  @param  first   An iterator.
+   *  @param  last    Another iterator.
+   *  @param  val     The search term.
+   *  @param  comp    A functor to use for comparisons.
+   *  @return  An iterator pointing to the first element greater than @a val,
+   *           or end() if no elements are greater than @a val.
+   *  @ingroup binarysearch
+   *
+   *  The comparison function should have the same effects on ordering as
+   *  the function used for the initial sort.
+  */
+  template<typename _ForwardIterator, typename _Tp, typename _Compare>
+    _ForwardIterator
+    upper_bound(_ForwardIterator __first, _ForwardIterator __last,
+               const _Tp& __val, _Compare __comp)
+    {
+      typedef size_t _DistanceType;
+      _DistanceType __len = __last - __first;
+      _DistanceType __half;
+      _ForwardIterator __middle;
+
+      while (__len > 0)
+       {
+         __half = __len >> 1;
+         __middle = __first;
+         __middle += __half;
+         if (__comp(__val, *__middle))
+           __len = __half;
+         else
+           {
+             __first = __middle;
+             ++__first;
+             __len = __len - __half - 1;
+           }
+       }
+      return __first;
+    }
+
+  /**
+   *  @brief Finds the largest subrange in which @a val could be inserted
+   *         at any place in it without changing the ordering.
+   *  @param  first   An iterator.
+   *  @param  last    Another iterator.
+   *  @param  val     The search term.
+   *  @param  comp    A functor to use for comparisons.
+   *  @return  An pair of iterators defining the subrange.
+   *  @ingroup binarysearch
+   *
+   *  This is equivalent to
+   *  @code
+   *    std::make_pair(lower_bound(first, last, val, comp),
+   *                   upper_bound(first, last, val, comp))
+   *  @endcode
+   *  but does not actually call those functions.
+  */
+  template<typename _ForwardIterator, typename _Tp, typename _Compare>
+    pair<_ForwardIterator, _ForwardIterator>
+    equal_range(_ForwardIterator __first, _ForwardIterator __last,
+               const _Tp& __val,
+               _Compare __comp)
+    {
+
+      typedef size_t _DistanceType;
+
+      _DistanceType __len = __last - __first;
+      _DistanceType __half;
+      _ForwardIterator __middle, __left, __right;
+
+      while (__len > 0)
+       {
+         __half = __len >> 1;
+         __middle = __first;
+         __middle += __half;
+         if (__comp(*__middle, __val))
+           {
+             __first = __middle;
+             ++__first;
+             __len = __len - __half - 1;
+           }
+         else if (__comp(__val, *__middle))
+           __len = __half;
+         else
+           {
+             __left = lower_bound(__first, __middle, __val, __comp);
+             __first += __len;
+             __right = upper_bound(++__middle, __first, __val, __comp);
+             return pair<_ForwardIterator, _ForwardIterator>(__left, __right);
+           }
+       }
+      return pair<_ForwardIterator, _ForwardIterator>(__first, __first);
+    }
+
+
+}
+#endif
diff --git a/src/lookup/Makefile.am b/src/lookup/Makefile.am
new file mode 100644 (file)
index 0000000..2b7d21f
--- /dev/null
@@ -0,0 +1,30 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+## Makefile.in is generated by automake; only "make maintainer-clean" removes it.
+MAINTAINERCLEANFILES    = Makefile.in
+## Preprocessor flags; INCLUDES is the older, deprecated spelling of AM_CPPFLAGS.
+INCLUDES               = -I$(top_srcdir)/src/include \
+                         -I$(top_srcdir)/src/storage \
+                         @GLIB2_CPPFLAGS@
+## Headers of this directory: distributed but never installed (noinst_).
+noinst_HEADERS         = lookup.h winner_tree.h
+## No programs are built in this directory.
+noinst_PROGRAMS                = 
+## liblookup.la is a libtool convenience library that is not installed.
+noinst_LTLIBRARIES     = liblookup.la
+## Sources compiled into liblookup.la.
+liblookup_la_SOURCES   = pinyin_lookup.cpp winner_tree.cpp
diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h
new file mode 100644 (file)
index 0000000..676c6ea
--- /dev/null
@@ -0,0 +1,144 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef LOOKUP_H
+#define LOOKUP_H
+
+#include <float.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+
+class WinnerTree;
+
+/** @file lookup.h
+ *  @brief the definitions of lookup related classes and structs.
+ *         Currently only contains pinyin lookup.
+ */
+
+typedef phrase_token_t lookup_key_t; /* key type of a LookupStepIndex hash table */
+
+/* One entry of a lookup step (the element type of LookupStepContent).
+ *   m_handles   : two phrase tokens, both 0 after construction
+ *   m_poss      : score of the entry, FLT_MAX unless a value is passed
+ *   m_last_step : -1 after construction
+ * NOTE(review): presumably m_handles holds the previous and the current
+ * phrase of a bigram step and m_last_step the index of the step this entry
+ * came from -- confirm in pinyin_lookup.cpp.
+ */
+struct lookup_value_t{
+    phrase_token_t m_handles[2];
+    gfloat m_poss;
+    gint32 m_last_step;
+    lookup_value_t(gfloat poss = FLT_MAX){
+        /* phrase_token_t is an integer type: clear the tokens with 0, not
+         * with the pointer constant NULL */
+        m_handles[0] = 0; m_handles[1] = 0;
+        m_poss = poss;
+        m_last_step = -1;
+    }
+};
+
+enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH }; /* type of lookup_constraint_t::m_type */
+
+struct lookup_constraint_t{ /* element type of CandidateConstraints */
+    constraint_type m_type;
+    union{ /* anonymous union: both members share storage. NOTE(review): which member is valid for which m_type is not visible here */
+       phrase_token_t m_token;
+       guint32 m_constraint_step; /* index of m_token */
+    };
+};
+
+typedef GArray * CandidateConstraints; /* Array of lookup_constraint_t */
+typedef GArray * MatchResults;         /* Array of phrase_token_t */
+
+namespace novel{ /* forward declarations; this header only uses pointers to these classes */
+class PinyinLargeTable;
+class FacadePhraseIndex;
+class Bigram;
+}; /* NOTE(review): the ';' after the namespace is a redundant empty declaration */
+
+typedef GHashTable * LookupStepIndex;
+/* Key: lookup_key_t, Value: int m, index to m_steps_content[i][m] */
+typedef GArray * LookupStepContent; /* Array of lookup_value_t */
+
+/* Abstract interface; all three operations are pure virtual.
+ * NOTE(review): going by the names, has_next()/next() walk a sequence of
+ * lookup_value_t and max() yields the best of them -- confirm with an
+ * implementation. */
+class IBranchIterator{
+public:
+  virtual ~IBranchIterator(){}
+  virtual bool has_next() = 0;
+  virtual lookup_value_t next() = 0;
+  virtual lookup_value_t max() = 0;
+};  
+/* Pinyin lookup class (declaration only; the constructor and destructor
+ * are defined in pinyin_lookup.cpp).
+ *
+ * The constructor stores the four pointers it is given and creates the
+ * winner tree, the two step arrays and the table cache itself.
+ *
+ * NOTE(review): an in-class initializer on a static const member of a
+ * floating point type (bigram_lambda, unigram_lambda) is a compiler
+ * extension; standard C++ before C++11 only allows it for integral and
+ * enumeration types, and C++11 asks for constexpr.
+ * NOTE(review): what the individual methods do can not be told from their
+ * declarations and is therefore not documented here.
+ */
+class PinyinLookup{
+private:
+    static const gfloat bigram_lambda = LAMBDA_PARAMETER;
+    static const gfloat unigram_lambda = 1 - LAMBDA_PARAMETER;
+    
+    PhraseItem m_cache_phrase_item;
+protected:
+    //saved variables
+    CandidateConstraints m_constraints;
+    PinyinKeyVector m_keys;
+    
+    novel::PinyinLargeTable * m_pinyin_table; /* stored as passed to the constructor */
+    novel::FacadePhraseIndex * m_phrase_index; /* stored as passed to the constructor */
+    novel::PinyinCustomSettings * m_custom; /* stored as passed to the constructor */
+    novel::Bigram * m_bigram; /* stored as passed to the constructor */
+    
+    //internal step data structure
+    GPtrArray * m_steps_index;  
+    /* Array of LookupStepIndex */
+    GPtrArray * m_steps_content;
+    /* Array of LookupStepContent */
+
+    GArray * m_table_cache;
+    /* Array of PhraseIndexRanges */
+    
+    WinnerTree * m_winner_tree; /* created by the constructor, deleted by the destructor */
+
+    size_t prepare_table_cache(int nstep, int total_pinyin);
+    
+    bool search_unigram(IBranchIterator * iter,  int nstep, int npinyin);
+    bool search_bigram(IBranchIterator * iter,  int nstep, int npinyin);
+    
+    bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token);
+    bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss);
+        
+    bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step);
+    
+    bool final_step(MatchResults & results);
+public:
+    PinyinLookup( PinyinCustomSettings * custom, PinyinLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * bigram);
+
+    ~PinyinLookup();
+
+    bool get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results);
+    
+    bool train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results);
+
+    bool convert_to_utf8(MatchResults results, /* out */ char * & result_string);
+
+    bool add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token);
+
+    bool clear_constraint(CandidateConstraints constraints, size_t index);
+
+    bool validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys);
+
+    /* init pinyin table lookup array */
+    bool prepare_pinyin_lookup(PhraseIndexRanges ranges);
+    /* destroy pinyin table lookup array */
+    bool destroy_pinyin_lookup(PhraseIndexRanges ranges);
+};
+
+#endif
diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp
new file mode 100644 (file)
index 0000000..c335453
--- /dev/null
@@ -0,0 +1,587 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <math.h>
+#include <assert.h>
+#include <iostream>
+#include "stl_lite.h"
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+#include "phrase_index.h"
+#include "ngram.h"
+#include "lookup.h"
+#include "winner_tree.h"
+
+/* Out-of-class definitions of the two static interpolation weights that
+ * bigram_gen_next_step() and unigram_gen_next_step() multiply into the
+ * bigram and unigram probabilities.
+ * NOTE(review): no initializer is given here, so the values presumably come
+ * from the in-class declarations in lookup.h -- confirm. */
+const gfloat PinyinLookup::bigram_lambda;
+const gfloat PinyinLookup::unigram_lambda;
+
+PinyinLookup::PinyinLookup(PinyinCustomSettings * custom, PinyinLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * bigram){
+    m_custom = custom;
+    m_pinyin_table = pinyin_table;
+    m_phrase_index = phrase_index;
+    m_bigram = bigram;
+    m_winner_tree = new WinnerTree;
+    m_steps_index = g_ptr_array_new();
+    m_steps_content = g_ptr_array_new();
+    m_table_cache = g_array_new(FALSE, TRUE, sizeof(PhraseIndexRanges));
+    g_array_set_size(m_table_cache, 1);
+}
+
+PinyinLookup::~PinyinLookup(){
+    if ( m_winner_tree )
+       delete m_winner_tree;
+    m_winner_tree = NULL;
+    //free resources
+    for ( size_t i = 0; i < m_table_cache->len; ++i){
+       PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i);
+       destroy_pinyin_lookup(*ranges);
+    }
+    //g_array_set_size(m_table_cache, 1);
+    g_array_free(m_table_cache, TRUE);
+
+    //free m_steps_index
+    for ( size_t i = 0; i < m_steps_index->len; ++i){
+       GHashTable * table = (GHashTable *) g_ptr_array_index(m_steps_index, i);
+       g_hash_table_destroy(table);
+       g_ptr_array_index(m_steps_index, i) = NULL;
+    }
+    g_ptr_array_free(m_steps_index, TRUE);
+
+    //free m_steps_content
+    for ( size_t i = 0; i < m_steps_content->len; ++i){
+       GArray * array = (GArray *) g_ptr_array_index(m_steps_content, i);
+       g_array_free(array, TRUE);
+       g_ptr_array_index(m_steps_content, i) = NULL;
+    }
+    g_ptr_array_free(m_steps_content, TRUE);
+        
+}
+
+bool PinyinLookup::prepare_pinyin_lookup(PhraseIndexRanges ranges){
+    //memset(ranges, 0, sizeof(ranges));
+    for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){
+       GArray * & array = ranges[i];
+       assert(NULL == array);
+       if (m_phrase_index->m_sub_phrase_indices[i]){
+           array = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange));
+       }
+    }
+       return true;
+}
+
+/*
+ * Free every range array stored in ranges and reset the slots to NULL,
+ * so the same ranges can be handed to prepare_pinyin_lookup() again.
+ * Always returns true.
+ */
+bool PinyinLookup::destroy_pinyin_lookup(PhraseIndexRanges ranges){
+    size_t lib = 0;
+    while ( lib < PHRASE_INDEX_LIBRARY_COUNT ){
+        if ( NULL != ranges[lib] ){
+            g_array_free(ranges[lib], TRUE);
+            ranges[lib] = NULL;
+        }
+        ++lib;
+    }
+    return true;
+}
+
+size_t PinyinLookup::prepare_table_cache(int nstep, int total_pinyin){
+    //free resources
+    for ( size_t i = 0; i < m_table_cache->len; ++i){
+       PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i);
+       destroy_pinyin_lookup(*ranges);
+    }
+    //g_array_set_size(m_table_cache, 1);
+    PinyinKey * pinyin_keys = (PinyinKey *)m_keys->data;
+    pinyin_keys += nstep;
+    //init resources
+    g_array_set_size(m_table_cache, MAX_PHRASE_LENGTH + 1);
+    size_t len;
+    for ( len = 1; len <= total_pinyin && len <= MAX_PHRASE_LENGTH; ++len){
+       PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, len);
+       prepare_pinyin_lookup(*ranges);
+       int result = m_pinyin_table->search(len, pinyin_keys, *ranges);
+       if (!( result & SEARCH_CONTINUED)){
+           ++len;
+           break;
+       }
+    }
+    g_array_set_size(m_table_cache, std_lite::min(len, (size_t) MAX_PHRASE_LENGTH + 1));
+    return m_table_cache->len - 1;
+}
+
+bool PinyinLookup::get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){
+    //g_array_set_size(results, 0);
+
+    m_constraints = constraints;
+    m_keys = keys;
+    int nstep = keys->len + 1;
+
+    //free m_steps_index
+    for ( size_t i = 0; i < m_steps_index->len; ++i){
+       GHashTable * table = (GHashTable *) g_ptr_array_index(m_steps_index, i);
+       g_hash_table_destroy(table);
+       g_ptr_array_index(m_steps_index, i) = NULL;
+    }
+
+    //free m_steps_content
+    for ( size_t i = 0; i < m_steps_content->len; ++i){
+       GArray * array = (GArray *) g_ptr_array_index(m_steps_content, i);
+       g_array_free(array, TRUE);
+       g_ptr_array_index(m_steps_content, i) = NULL;
+    }    
+    
+    //add null start step
+    g_ptr_array_set_size(m_steps_index, nstep);
+    g_ptr_array_set_size(m_steps_content, nstep);
+
+    for ( size_t i = 0 ; i < nstep; ++i ){
+       //initialize m_steps_index
+       g_ptr_array_index(m_steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal);
+       //initialize m_steps_content
+       g_ptr_array_index(m_steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t));
+    }
+    
+    lookup_key_t initial_key = sentence_start;
+    lookup_value_t initial_value(log(1));
+    initial_value.m_handles[1] = sentence_start;
+    GArray * initial_step_content = (GArray *) g_ptr_array_index(m_steps_content, 0);
+    initial_step_content = g_array_append_val(initial_step_content, initial_value);
+    GHashTable * initial_step_index = (GHashTable *) g_ptr_array_index(m_steps_index, 0);
+    g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key), GUINT_TO_POINTER(initial_step_content->len - 1));
+
+#if 0
+    LookupStepContent tmp_step = (LookupStepContent) g_ptr_array_index(m_steps_content, 0);
+    IBranchIterator * iter = m_winner_tree->get_iterator(tmp_step);
+    size_t npinyin = prepare_table_cache(0, keys->len);
+    search_unigram(iter, 0, npinyin);
+    delete iter;
+#endif
+
+    for ( size_t i = 0 ; i < nstep - 1 ; ++i ){
+       LookupStepContent tmp_step = (LookupStepContent) g_ptr_array_index(m_steps_content, i);
+       IBranchIterator * iter = m_winner_tree->get_iterator(tmp_step);
+       size_t npinyin = prepare_table_cache(i, keys->len - i);
+       search_bigram(iter, i, npinyin),
+           search_unigram(iter, i, npinyin);
+       delete iter;
+    }
+    return final_step(results);
+}
+
+bool PinyinLookup::search_unigram(IBranchIterator * iter, int nstep, int npinyin){
+    lookup_constraint_t* constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep);
+    if ( CONSTRAINT_NOSEARCH == constraint->m_type )
+       return false;
+    GArray * lookup_content = (GArray *) g_ptr_array_index(m_steps_content, nstep);
+    if ( 0 == lookup_content->len )
+       return false;
+    lookup_value_t max_step = iter->max();
+    if ( CONSTRAINT_ONESTEP == constraint->m_type){
+           return unigram_gen_next_step(nstep, &max_step, constraint->m_token);
+    }
+    if ( NO_CONSTRAINT == constraint->m_type ){
+       bool found = false;
+       for ( size_t i = 1; i < m_table_cache->len && i <= MAX_PHRASE_LENGTH; ++i){
+       lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1);
+       if ( constraint->m_type != NO_CONSTRAINT )
+           continue;
+           PhraseIndexRanges * ranges = &g_array_index(m_table_cache,PhraseIndexRanges, i);
+           for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
+               GArray * array = (*ranges)[m];
+               if ( !array ) continue;
+               for ( size_t n = 0; n < array->len; ++n){
+                   PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
+                   for ( phrase_token_t token = range->m_range_begin; 
+                         token != range->m_range_end; ++token){
+                       found = unigram_gen_next_step(nstep, &max_step, token)|| found;
+                   }  
+               }
+           }
+       }
+       return found;
+    }
+    return false;
+}
+
+
+bool PinyinLookup::search_bigram(IBranchIterator * iter, 
+                                int nstep, int npinyin){
+    lookup_constraint_t* constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep);
+    if ( CONSTRAINT_NOSEARCH == constraint->m_type )
+       return false;
+    GArray * lookup_content = (GArray *) g_ptr_array_index(m_steps_content, nstep);
+
+    bool found = false;
+    BigramPhraseArray bigram_phrase_items = g_array_new(FALSE, FALSE, 
+                                              sizeof(BigramPhraseItem));
+    while ( iter->has_next() ){
+       lookup_value_t cur_step = iter->next();
+       //printf("token:%d\t%d\n", cur_step.m_handles[0], cur_step.m_handles[1]);
+       phrase_token_t index_token = cur_step.m_handles[1];
+       SingleGram * system, * user;
+       m_bigram->load(index_token, system, user);
+       if ( system && user ){
+           guint32 total_freq;
+           assert(user->get_total_freq(total_freq));
+           assert(system->set_total_freq(total_freq));
+       }
+       if ( CONSTRAINT_ONESTEP == constraint->m_type ){
+           phrase_token_t token = constraint->m_token;
+           if ( system ){
+               guint32 freq;
+               if( system->get_freq(token, freq) ){
+                   guint32 total_freq;
+                   system->get_total_freq(total_freq);
+                   gfloat bigram_poss = freq / (gfloat) total_freq;
+                   found =  bigram_gen_next_step(nstep, &cur_step, token, bigram_poss) || found;
+               }
+           }
+           if ( user ){
+               guint32 freq;
+               if( user->get_freq(token, freq)){
+                   guint32 total_freq;
+                   user->get_total_freq(total_freq);
+                   gfloat bigram_poss = freq / (gfloat) total_freq;
+                   found = bigram_gen_next_step(nstep, &cur_step, token, bigram_poss) || found;
+               }
+           }
+       }
+
+       if ( NO_CONSTRAINT == constraint->m_type ){
+           for ( size_t i = 1; i < m_table_cache->len 
+                     && i <= MAX_PHRASE_LENGTH;++i ){
+                lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1);
+                if ( constraint->m_type != NO_CONSTRAINT )
+                     continue;
+
+               PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i);
+               for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
+                   GArray * array = (*ranges)[m];
+                   if ( !array ) continue;
+                   for ( size_t n = 0; n < array->len; ++n){
+                       PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
+                       if (system){
+                           g_array_set_size(bigram_phrase_items, 0);
+                           system->search(range, bigram_phrase_items);
+                           for( size_t k = 0; k < bigram_phrase_items->len; 
+                                ++k){
+                               BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
+                               found = bigram_gen_next_step(nstep, &cur_step, item->m_token, item->m_freq) || found;
+                           }
+                       }
+                       if (user){
+                           g_array_set_size(bigram_phrase_items, 0);
+                           user->search(range, bigram_phrase_items);
+                           for( size_t k  = 0; k < bigram_phrase_items->len;
+                                ++k){
+                               BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
+                               found = bigram_gen_next_step(nstep, &cur_step, item->m_token, item->m_freq) || found;
+                           }
+                       }
+                   }
+               }
+           }
+       }
+       if (system)
+           delete system;
+       if (user)
+           delete user;
+    }
+    g_array_free(bigram_phrase_items, TRUE);
+    return found;
+}
+
+
+bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token){
+    PinyinKey * pinyinkeys = ((PinyinKey *)m_keys->data) + nstep;
+    if (!m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+       return false;
+    size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+    gfloat elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gfloat)
+       m_phrase_index->get_phrase_index_total_freq();
+    if ( elem_poss < FLT_EPSILON )
+       return false;
+    gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys);
+    if (pinyin_poss < FLT_EPSILON )
+       return false;
+    lookup_value_t next_step;
+    next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
+    next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda);
+    next_step.m_last_step = nstep;
+    
+    return save_next_step(nstep + phrase_length, cur_step, &next_step);
+}
+
+bool PinyinLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss){
+    PinyinKey * pinyinkeys = ((PinyinKey *)m_keys->data) + nstep;
+    if (!m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+       return false;
+    size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+    gfloat unigram_poss = m_cache_phrase_item.get_unigram_frequency() / (gfloat)
+       m_phrase_index->get_phrase_index_total_freq();
+    if ( bigram_poss < FLT_EPSILON && unigram_poss < FLT_EPSILON )
+       return false;
+    gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys);
+    if ( pinyin_poss < FLT_EPSILON )
+       return false;
+    lookup_value_t next_step;
+    next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
+    next_step.m_poss = cur_step->m_poss + 
+       log(( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) *pinyin_poss);
+    next_step.m_last_step = nstep;
+    
+    return save_next_step(nstep + phrase_length, cur_step, &next_step);
+}
+
+bool PinyinLookup::save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step){
+    lookup_key_t next_key = next_step->m_handles[1];
+    GHashTable * next_lookup_index = (GHashTable *) g_ptr_array_index(m_steps_index, next_step_pos);
+    GArray * next_lookup_content = (GArray *) g_ptr_array_index(m_steps_content, next_step_pos);
+    
+    gpointer key, value;
+    gboolean lookup_result = g_hash_table_lookup_extended(next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
+    size_t step_index = GPOINTER_TO_UINT(value);
+    if ( !lookup_result ){
+       g_array_append_val(next_lookup_content, *next_step);
+       g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1));
+       return true;
+    }else{
+       lookup_value_t * orig_next_value = &g_array_index(next_lookup_content, lookup_value_t,step_index);
+       if ( orig_next_value->m_poss < next_step->m_poss) {
+           orig_next_value->m_handles[0] = next_step->m_handles[0];
+           assert(orig_next_value->m_handles[1] == next_step->m_handles[1]);
+           orig_next_value->m_poss = next_step->m_poss;
+           orig_next_value->m_last_step = next_step->m_last_step;
+           return true;
+       }
+       return false;
+    }
+}
+
+bool PinyinLookup::final_step(MatchResults & results){
+    //reset results
+    g_array_set_size(results, m_steps_content->len);
+    for ( size_t i = 0 ; i < m_steps_content->len ; ++i){
+       phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+       *token = NULL;
+    }
+    //find max element
+    size_t last_step_pos = m_steps_content->len - 1;
+    
+    GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos);
+    if ( last_step_array->len == 0 )
+       return false;
+    lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0);
+    for ( size_t i = 1; i < last_step_array->len; ++i){
+       lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i);
+       if ( cur_value->m_poss > max_value->m_poss )
+           max_value = cur_value;
+    }
+
+    //backtracing
+    while( true ){
+       int cur_step_pos = max_value->m_last_step;
+       if ( -1 == cur_step_pos )
+           break;
+
+       phrase_token_t * token = &g_array_index(results, phrase_token_t, cur_step_pos);
+       *token = max_value->m_handles[1];
+
+       phrase_token_t last_token = max_value->m_handles[0];
+       
+       
+       GHashTable * lookup_step_index = (GHashTable *)g_ptr_array_index(m_steps_index, cur_step_pos);
+       gpointer key, value;
+       gboolean result = g_hash_table_lookup_extended(lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
+       if (!result)
+           return false;
+       GArray * lookup_step_content = (GArray *)g_ptr_array_index(m_steps_content, cur_step_pos);
+
+       max_value = &g_array_index(lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
+    }
+    
+    //no need to reverse the result
+    
+    return true;
+}
+
+bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){
+    bool train_next = false;
+    PinyinKey * pinyin_keys = (PinyinKey *)keys->data;
+    //TODO: verify the new training method.
+    phrase_token_t last_token = sentence_start;
+    // constraints->len + 1 == results->len
+    guint32 train_factor = 23;
+    for ( size_t i = 0; i < constraints->len; ++i){
+       phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+       if ( *token == NULL )
+           continue;
+       lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+       if (train_next || CONSTRAINT_ONESTEP == constraint->m_type ){
+           if (CONSTRAINT_ONESTEP == constraint->m_type){
+               assert(*token == constraint->m_token);
+               train_next = true;
+           }else{
+               train_next = false;
+           }
+           //add pi-gram frequency
+           //std::cout<<"i:"<<i<<"last_token:"<<last_token<<"\ttoken:"<<*token<<std::endl;
+           m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
+           m_cache_phrase_item.increase_pinyin_possibility(*m_custom, pinyin_keys + i, train_factor);
+           m_phrase_index->add_unigram_frequency(*token, train_factor);
+           if ( last_token ){
+               SingleGram * system, *user;
+               m_bigram->load(last_token, system, user);
+               guint32 total_freq;
+               if ( !user ){
+                   total_freq = 0;
+                   if ( system )
+                       assert(system->get_total_freq(total_freq));
+                   user = new SingleGram;
+                   user->set_total_freq(total_freq);
+               }
+               guint32 freq = 0;
+               if ( !user->get_freq(*token, freq)){
+                   if (system) system->get_freq(*token, freq);
+                   user->set_freq(*token, freq);
+               }
+               assert(user->get_total_freq(total_freq));
+               //protect against total_freq overflow.
+               if ( train_factor > 0 && total_freq > total_freq + train_factor)
+                   goto next;
+               assert(user->set_total_freq(total_freq + train_factor));
+               assert(user->get_freq(*token, freq));
+               //if total_freq is not overflow, then freq won't overflow.
+               assert(user->set_freq(*token, freq + train_factor));
+               assert(m_bigram->store(last_token, user));
+           next:
+               if (system) delete system;
+               if (user) delete user;
+           }
+       }
+       last_token = *token;
+    }
+    return true;
+}
+
+bool PinyinLookup::convert_to_utf8(MatchResults results, /* out */ char * & result_string){
+    result_string = g_strdup("");
+    for ( size_t i = 0; i < results->len; ++i){
+       phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+       if ( NULL == *token )
+           continue;
+       m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
+       utf16_t buffer[MAX_PHRASE_LENGTH];
+       m_cache_phrase_item.get_phrase_string(buffer);
+       guint8 length = m_cache_phrase_item.get_phrase_length();
+       gchar * phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
+       char * tmp = result_string;
+       result_string = g_strconcat(result_string, phrase, NULL);
+       g_free(tmp); g_free(phrase);
+    }
+    return true;
+}
+
+bool PinyinLookup::add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token){
+    if ( !m_phrase_index->get_phrase_item(token, m_cache_phrase_item) )
+       return false;
+
+    size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+    if ( index + phrase_length > constraints->len )
+       return false;
+
+    for ( size_t i = index; i < index + phrase_length ; ++i ){
+       clear_constraint(constraints, i);
+    }
+
+    lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index);
+    constraint->m_type = CONSTRAINT_ONESTEP;
+    constraint->m_token = token;
+    
+    for (size_t i = 1; i < phrase_length; ++i){
+       constraint = &g_array_index(constraints, lookup_constraint_t, index + i);
+       constraint->m_type = CONSTRAINT_NOSEARCH;
+       constraint->m_constraint_step = index;
+    }
+       return true;
+}
+
+bool PinyinLookup::clear_constraint(CandidateConstraints constraints, size_t index){
+    if ( index < 0 || index >= constraints->len )
+       return false;
+    lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index);
+    if (constraint->m_type == NO_CONSTRAINT)
+       return false;
+    if (constraint->m_type == CONSTRAINT_NOSEARCH){
+       index = constraint->m_constraint_step;
+       constraint = &g_array_index(constraints, lookup_constraint_t, index);
+    }
+    
+    assert(constraint->m_type == CONSTRAINT_ONESTEP);    
+
+    phrase_token_t token = constraint->m_token;
+    if (!m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+       return false;
+
+    size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+    for ( size_t i = 0; i < phrase_length; ++i){
+       if ( index + i >= constraints->len )
+           continue;
+       constraint = &g_array_index(constraints, lookup_constraint_t, index + i);
+       constraint->m_type = NO_CONSTRAINT;
+    }
+       return true;
+}
+
+bool PinyinLookup::validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys){
+    //resize constraints array
+    size_t constraints_length = constraints->len;
+    if ( m_parsed_keys->len > constraints_length ){
+       g_array_set_size(constraints, m_parsed_keys->len);
+       //initialize new element
+       for( size_t i = constraints_length; i < m_parsed_keys->len; ++i){
+           lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+           constraint->m_type = NO_CONSTRAINT;
+       }
+    }else if (m_parsed_keys->len < constraints_length ){
+       g_array_set_size(constraints, m_parsed_keys->len);
+    }
+    
+    PinyinKey * pinyin_keys = (PinyinKey *)m_parsed_keys->data;
+    
+    for ( size_t i = 0; i < constraints->len; ++i){
+       lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+       if ( constraint->m_type == CONSTRAINT_ONESTEP ){
+           phrase_token_t token = constraint->m_token;
+           m_phrase_index->get_phrase_item(token, m_cache_phrase_item);
+           size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+           //clear too long constraint
+           if ( i + phrase_length > constraints->len ){
+               clear_constraint(constraints, i);
+               continue;
+           }
+           //clear invalidated pinyin
+           gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyin_keys + i);
+           if ( pinyin_poss < FLT_EPSILON ){
+               clear_constraint(constraints, i);
+           }
+       }
+    }
+    return true;
+}
diff --git a/src/lookup/winner_tree.cpp b/src/lookup/winner_tree.cpp
new file mode 100644 (file)
index 0000000..248a749
--- /dev/null
@@ -0,0 +1,141 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <float.h>
+#include <limits.h>
+#include <stdio.h>
+#include "memory_chunk.h"
+#include "phrase_index.h"
+#include "lookup.h"
+#include "winner_tree.h"
+
+/*
+ * Bind the iterator to an initialized tree and remember the current winner
+ * as the value later reported by max(); next() changes the tree, so the
+ * winner has to be captured up front.
+ */
+WinnerTreeBranchIterator::WinnerTreeBranchIterator(WinnerTree & tree)
+    :m_tree(tree), m_counter(0){
+    const int top = m_tree.get_winner();
+    m_max_value = m_tree.m_items[top];
+}
+
+/*
+ * True while fewer values than both the tree size and nbranch have been
+ * handed out.
+ */
+bool WinnerTreeBranchIterator::has_next(){
+    return m_counter < m_tree.m_tree_size && m_counter < nbranch;
+}
+
+/*
+ * Return the current winner and take it out of the competition: its
+ * m_poss inside the tree is set to -FLT_MAX and the matches on its path
+ * are replayed, so the following call yields the next best value.
+ */
+lookup_value_t WinnerTreeBranchIterator::next(){
+    const int top = m_tree.get_winner();
+    lookup_value_t best = m_tree.m_items[top];
+
+    m_tree.m_items[top].m_poss = - FLT_MAX;
+    m_tree.replay(top);
+
+    ++m_counter;
+    return best;
+}
+
+/*
+ * Store the winner of items lc and rc in node p, then keep moving up for
+ * as long as p is an odd node other than the root: each time the parent
+ * receives the winner of node p - 1 and node p.
+ */
+void WinnerTree::play(int p, int lc, int rc){
+    m_tree[p] = winner(lc, rc);
+    //continue competition
+    for ( ; p > 1 && (p % 2) != 0; p /= 2 )
+        m_tree[p / 2] = winner(m_tree[p - 1], m_tree[p]);
+}
+
+
+/*
+ * Build the winner tree over a copy of the values in cur_step.
+ *
+ * The buffers are re-created through init() when cur_step holds more
+ * values than the current capacity. The values are copied to
+ * m_items[1..n]; m_tree[] then receives, node by node, the index of the
+ * item that wins the match below that node (see winner()).
+ * The step must hold more than nbranch values (asserted).
+ * Always returns true.
+ */
+bool WinnerTree::initialize(LookupStepContent cur_step){
+    size_t size = cur_step->len;
+    if ( size > m_max_tree_size ){
+       init(size);
+    }
+    assert(size > nbranch);
+    m_tree_size = size;
+    
+    //copy the values into m_items, starting at index 1
+    int nindex = 1;
+    
+    for( size_t i = 0; i < cur_step->len ; ++i){
+       lookup_value_t * cur_value = &g_array_index(cur_step, lookup_value_t, i);
+       m_items[nindex++] = *cur_value;
+    }
+    
+    //s = largest power of two that does not exceed n - 1
+    int i, s;
+    for( s = 1; 2 * s <= m_tree_size - 1; s += s);
+  
+    m_low_ext = 2 * (m_tree_size - s);
+    m_offset = 2 * s - 1;
+  
+    //items 1..m_low_ext are paired off first; their node is (m_offset + i)/2
+    for( i = 2; i <= m_low_ext; i += 2)
+       play((m_offset + i)/2, i - 1, i);
+    //odd n: item m_low_ext + 1 plays against the winner stored in node n - 1
+    if ( m_tree_size % 2){
+       play( m_tree_size / 2, m_tree[m_tree_size - 1], m_low_ext +1);
+       i = m_low_ext + 3;
+    }else i = m_low_ext + 2;
+  
+    //pair off the remaining items
+    for( ; i <= m_tree_size; i += 2)
+    play( (i - m_low_ext + m_tree_size - 1) / 2, i - 1, i);
+    return true;
+}
+
+/*
+ * Replay the matches on the path from item i up to the root after the
+ * m_poss of m_items[i] has changed. i must be in 1..m_tree_size (asserted).
+ */
+void WinnerTree::replay(int i){
+    assert( 1 <= i && i <= m_tree_size);
+    
+    int p; //node where item i plays its first match
+    int lc; //p's left child
+    int rc; //p's right child
+    
+    //first match: same node numbering as used in initialize()
+    if ( i <= m_low_ext){
+       p = (m_offset + i) / 2;
+       lc = 2 * p - m_offset;
+       rc = lc + 1;
+    }else{
+       p = (i - m_low_ext + m_tree_size -1) / 2;
+       if ( 2 * p == m_tree_size - 1 ){
+           //the opponent is the winner stored in node 2p, not an item
+           lc = m_tree[2*p];
+           rc = i;
+       }else{
+           lc = 2 * p - m_tree_size + 1 + m_low_ext;
+           rc = lc + 1;
+       }
+    }
+    
+    m_tree[p] = winner(lc, rc);
+    
+    //added by wupeng: when (p | 1) equals the tree size, the parent's match
+    //is between the winner in node 2p and item m_low_ext + 1.
+    //NOTE(review): looks like the counterpart of the odd-size match played
+    //in initialize() -- confirm.
+    if ( ( p | 0x01 )  == m_tree_size ){
+        p /= 2;
+       m_tree[p] = winner( m_tree[2 * p], m_low_ext + 1 );
+    }
+    
+    //remaining matches up to the root, each between two internal nodes
+    p /= 2;
+    for( ; p >= 1 ; p /= 2)
+       m_tree[p] = winner( m_tree[2 * p], m_tree[2 * p + 1]);
+}
+
+/*
+ * Return the index of the item with the strictly greater m_poss; on a tie
+ * the right one (rc) wins.
+ */
+int WinnerTree::winner(int lc, int rc){
+    if ( m_items[lc].m_poss > m_items[rc].m_poss )
+        return lc;
+    return rc;
+}
diff --git a/src/lookup/winner_tree.h b/src/lookup/winner_tree.h
new file mode 100644 (file)
index 0000000..262f196
--- /dev/null
@@ -0,0 +1,148 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef LOOKUP_WINNER_TREE_H
+#define LOOKUP_WINNER_TREE_H
+
+#include <assert.h>
+#include "lookup.h"
+
+const int nbranch = 32;
+
+class DirectBranchIterator: public IBranchIterator{//for nitem <= nbranch
+    LookupStepContent m_step_content;
+    size_t m_iter_pos;
+public:
+    //Constructor
+    DirectBranchIterator(LookupStepContent step_content)
+       :m_step_content(step_content)
+    { m_iter_pos = 0; }
+    
+    //Destructor
+    virtual ~DirectBranchIterator(){}
+    
+    //Member Function
+    bool has_next(){
+       return m_iter_pos != m_step_content->len;
+    }
+    
+    lookup_value_t next(){
+       lookup_value_t * tmp = &g_array_index(m_step_content, 
+                                             lookup_value_t, m_iter_pos);
+       ++m_iter_pos;
+       return *tmp;
+    }
+    
+    lookup_value_t max(){
+       lookup_value_t * max_value = &g_array_index(m_step_content, lookup_value_t, 0);
+       for ( size_t i = 1 ; i < m_step_content->len; ++i){
+           lookup_value_t * cur_value = &g_array_index(m_step_content, lookup_value_t, i);
+           if ( cur_value->m_poss > max_value->m_poss )
+               max_value = cur_value;
+       }
+       return *max_value;
+    }
+};
+
+class WinnerTree;
+
+/* Iterator handed out by WinnerTree::get_iterator() for steps with more
+ * than nbranch values: yields values in decreasing m_poss order, at most
+ * nbranch of them (see has_next() and next() in winner_tree.cpp). */
+class WinnerTreeBranchIterator: public IBranchIterator{//for nitem > nbranch
+    WinnerTree& m_tree;
+    int m_counter;             //number of values returned by next() so far
+    lookup_value_t m_max_value; //winner captured at construction time
+public:
+    //Constructor
+    WinnerTreeBranchIterator(WinnerTree & tree);
+    
+    //Destructor
+    virtual ~WinnerTreeBranchIterator(){}
+  
+    //Member Function
+    bool has_next();
+    
+    lookup_value_t next();
+    
+    //best value of the step; unaffected by calls to next()
+    lookup_value_t max(){
+       return m_max_value;
+    }
+    
+};
+
+class WinnerTree{
+    friend class WinnerTreeBranchIterator;
+private:
+    size_t m_max_tree_size; // maxsize
+    int m_tree_size; // n
+    int m_low_ext;
+    int m_offset;
+    int * m_tree; 
+    MemoryChunk m_buffer;
+    MemoryChunk m_tree_buffer;
+    lookup_value_t * m_items;
+
+    int winner(int lc, int rc);
+    
+    void play(int p, int lc, int rc);
+    
+    void init(int tree_size){
+       m_max_tree_size = tree_size;
+       //data buffer
+       m_buffer.set_size( sizeof(lookup_value_t) * (tree_size + 1) );
+       m_items = (lookup_value_t *) m_buffer.begin();
+       
+       //tree item buffer
+       m_tree_buffer.set_size( sizeof(int) * m_max_tree_size);
+       m_tree = (int * ) m_tree_buffer.begin();
+       m_tree_size = 0;
+    }
+    
+public:
+    
+    //Constructor
+    WinnerTree(int tree_size = 10){
+       init(tree_size);
+    }
+    
+    //Destructor
+    ~WinnerTree() { }
+
+    //need delete this
+    IBranchIterator* get_iterator(LookupStepContent step){
+       if ( step->len <= nbranch )
+           return new DirectBranchIterator(step);
+       //TODO:another situation > nbranch
+       assert(initialize(step));
+       return new WinnerTreeBranchIterator(*this);
+    }
+    
+protected:
+    
+    int get_winner() const {
+       return (m_tree_size)? m_tree[1] : 0;
+    }
+    
+    //Member Function
+    bool initialize(LookupStepContent cur_step);
+    void replay(int i);
+};
+
+#endif
diff --git a/src/segment/Makefile.am b/src/segment/Makefile.am
new file mode 100644 (file)
index 0000000..0e58ddf
--- /dev/null
@@ -0,0 +1,28 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+MAINTAINERCLEANFILES    = Makefile.in
+
+INCLUDES               = -I$(top_srcdir)/src/include \
+                         -I$(top_srcdir)/src/storage \
+                         @GLIB2_CPPFLAGS@
+
+noinst_PROGRAMS                = mmseg
+
+mmseg_SOURCES          = mmseg.cpp
+
+mmseg_LDADD            = @GLIB2_LDFLAGS@
diff --git a/src/segment/mmseg.cpp b/src/segment/mmseg.cpp
new file mode 100644 (file)
index 0000000..6a3d7f7
--- /dev/null
@@ -0,0 +1,212 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <limits.h>
+#include <locale.h>
+#include <glib.h>
+#include "novel_types.h"
+
+static GHashTable * g_phrases;
+
+//One cell of the dynamic-programming table built by segment(): the best
+//phrase found so far that ends at a given character position.
+struct SegmentStep{
+    //value found in the phrase table for m_phrase; segment() stores 0 when
+    //the phrase was not found.
+    phrase_token_t  m_handle;
+    //g_strndup'ed text of the phrase ending here; freed by segment().
+    char * m_phrase;
+    //use formula W = No. of words. Zero handle means one word.
+    size_t m_nword;
+    //backtracing information: segment() stores the phrase length in
+    //characters (k - i) and subtracts it to move backward.
+    gint8 m_backward_nstep;
+};
+
+//read gb_char.table and gbk_char.table
+bool init_phrases(FILE * infile){
+    char pinyin[256];
+    char phrase[256];
+    phrase_token_t token;
+    size_t freq;
+    while (!feof(infile)){
+        fscanf(infile, "%s", pinyin);
+        fscanf(infile, "%s", phrase);
+        fscanf(infile, "%d", &token);
+        fscanf(infile, "%ld", &freq);
+        if ( feof(infile) )
+            break;
+       g_hash_table_insert(g_phrases, g_strdup(phrase), 
+                           GUINT_TO_POINTER(token));   
+    }
+       return true;
+}
+
+bool segment(GHashTable * phrases, // Lookup Phrases
+            const char * phrase,
+            GArray * strings /* Array of const char * */){
+    GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+    GArray * offsets = g_array_new(TRUE, TRUE, sizeof(size_t));
+    //construct dynamic programming.
+    size_t phrase_length = g_utf8_strlen(phrase, -1);
+    const char * p = phrase;
+    size_t offset = p - phrase;
+    g_array_append_val(offsets, offset);
+    g_array_set_size(steps, phrase_length + 1);
+    for ( size_t i = 0 ; i < phrase_length; ++i){
+       p = g_utf8_next_char(p);
+       offset = p - phrase;
+       g_array_append_val(offsets, offset);
+    }
+    assert( *p == '\0' );
+
+    //initialize segment steps values.
+    for ( size_t i = 0; i < phrase_length + 1; ++i){
+       SegmentStep* step = &g_array_index(steps, SegmentStep, i);
+       step->m_nword = UINT_MAX;
+    }
+    
+    for ( size_t i = 0 ; i < phrase_length + 1; ++i){
+       size_t* offset_begin = &g_array_index(offsets, size_t, i);
+       const char * phrase_begin = phrase + *offset_begin;
+       SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i);
+       size_t nword = step_begin->m_nword;
+       for ( size_t k = i + 1; k < phrase_length + 1; ++k){
+           size_t* offset_end = &g_array_index(offsets, size_t, k);
+           size_t len = *offset_end - *offset_begin;
+           char * cur_phrase = g_strndup(phrase_begin, len);
+           phrase_token_t token; 
+           gpointer orig_key, value;
+           gboolean result = g_hash_table_lookup_extended
+               (phrases, cur_phrase, &orig_key, &value);
+           if ( result ){
+               token = GPOINTER_TO_UINT(value);
+           }else{
+               token = 0;
+               if ( 1 != k - i ){ //skip non-phrase
+                   g_free(cur_phrase);
+                   continue;
+               }
+           }
+           ++nword;
+           SegmentStep * step_end = &g_array_index(steps, SegmentStep, k);
+           if ( nword < step_end->m_nword){
+               if ( step_end->m_phrase ){
+                   g_free(step_end->m_phrase);
+                   step_end->m_phrase = NULL;
+               }
+               step_end->m_handle = token;
+               step_end->m_phrase = cur_phrase;
+               step_end->m_nword = nword;
+               step_end->m_backward_nstep = k - i;
+           }else{
+               g_free(cur_phrase);
+           }
+       }
+    }
+    //backtracing to get the result.
+    size_t cur_step = phrase_length;
+    g_array_set_size(strings, 0);
+    while ( cur_step ){
+       SegmentStep* step_end = &g_array_index(steps, SegmentStep, cur_step);
+       char * str_dup = g_strdup(step_end->m_phrase);
+       g_array_append_val(strings, str_dup);
+       cur_step = cur_step - step_end->m_backward_nstep;
+    }
+    
+    for ( size_t i = 0; i < strings->len / 2; ++i){
+       char ** phrase_head = &g_array_index(strings, char * , i);
+       char ** phrase_tail = &g_array_index(strings, char * , strings->len -1 - i);
+       char * phrase_tmp;
+       phrase_tmp = * phrase_head; 
+       * phrase_head = * phrase_tail; 
+       * phrase_tail = phrase_tmp;
+    }
+
+    //free strndup memory
+    for ( size_t i = 0; i < steps->len; ++i){
+       SegmentStep* step = &g_array_index(steps, SegmentStep, i);
+       if ( step->m_phrase ){
+           g_free(step->m_phrase);
+           step->m_phrase = NULL;
+       }
+    }
+
+    g_array_free(offsets, TRUE);
+    g_array_free(steps, TRUE);
+       return true;
+}
+
+//Write the usage line to standard output and terminate with status 1.
+void print_help(){
+    fputs("Usage: mmseg [--generate-extra-enter]\n", stdout);
+    exit(1);
+}
+
+int main(int argc, char * argv[]){
+    int i = 1;
+    bool gen_extra_enter = false;
+
+    setlocale(LC_ALL,"");
+    while ( i < argc ){
+       if ( strcmp("--help", argv[i] ) == 0) {
+           print_help();
+       }else if ( strcmp("--generate-extra-enter", argv[i]) == 0) {
+           gen_extra_enter = true;
+       }
+       ++i;
+    }
+    
+    g_phrases = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
+    //init phrase lookup
+    FILE * gb_file = fopen("../../data/gb_char.table", "r");
+    if ( gb_file == NULL ){
+       fprintf(stderr, "can't open gb_char.table!\n");
+       exit(1);
+    }
+    init_phrases(gb_file);
+    fclose(gb_file);
+    FILE * gbk_file = fopen("../../data/gbk_char.table", "r");
+    if ( gbk_file == NULL ){
+       fprintf(stderr, "can't open gbk_char.table!\n");
+       exit(1);
+    }
+    init_phrases(gbk_file);
+    fclose(gbk_file);
+    
+    char* linebuf = (char *)malloc ( 1024 * sizeof (char) );
+    size_t size = 1024;
+    while( getline(&linebuf, &size, stdin) ){
+       if ( feof(stdin) )
+           break;
+        linebuf[strlen(linebuf)-1] = '\0';
+
+       GArray * phrases = g_array_new(TRUE, TRUE, sizeof( char *));
+       segment(g_phrases, linebuf, phrases);
+       for ( size_t i = 0; i < phrases->len; ++i){
+           char * phrase = g_array_index(phrases, char *, i);
+           printf("%s\n", phrase);
+           g_free(phrase);
+       }
+       if ( gen_extra_enter )
+           printf("\n");
+       g_array_free(phrases, TRUE);
+    }
+    free(linebuf);
+}
diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am
new file mode 100644 (file)
index 0000000..adf2b5c
--- /dev/null
@@ -0,0 +1,35 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+INCLUDES                = -I$(top_srcdir)/src/include \
+                         -I$(top_srcdir)/src/storage \
+                         @GLIB2_CPPFLAGS@
+
+noinst_HEADERS          = pinyin_large_table.h \
+                         pinyin_base.h \
+                         pinyin_phrase.h \
+                         phrase_index.h \
+                         pinyin_zhuyin_map_data.h \
+                         ngram.h
+
+noinst_LTLIBRARIES       = libstorage.la
+
+libstorage_la_SOURCES    = pinyin_base.cpp \
+                         pinyin_large_table.cpp \
+                         phrase_index.cpp \
+                         ngram.cpp
+
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp
new file mode 100644 (file)
index 0000000..7fdc58f
--- /dev/null
@@ -0,0 +1,283 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "ngram.h"
+
+//One (token, frequency) record. SingleGram stores an array of these
+//directly after the leading guint32 total frequency of its chunk.
+struct SingleGramItem{
+    phrase_token_t m_token;
+    guint32 m_freq;
+};
+
+//Build an empty gram: the chunk holds only the guint32 total frequency,
+//set to zero.
+SingleGram::SingleGram(){
+    const size_t header_size = sizeof(guint32);
+    m_chunk.set_size(header_size);
+    memset(m_chunk.begin(), 0, header_size);
+}
+
+//Wrap an existing buffer of length bytes: the buffer is handed to
+//MemoryChunk::set_chunk with a NULL third argument.
+//NOTE(review): presumably the chunk then refers to the caller's memory
+//rather than copying it - confirm against MemoryChunk before keeping the
+//gram longer than the buffer lives.
+SingleGram::SingleGram(void * buffer, size_t length){
+    m_chunk.set_chunk(buffer, length, NULL);
+}
+
+//Store m_total in the guint32 at the very start of the chunk.
+//Always returns true.
+bool SingleGram::set_total_freq(guint32 m_total){
+    guint32 * total_slot = (guint32 *) m_chunk.begin();
+    *total_slot = m_total;
+    return true;
+}
+
+//Read the guint32 at the very start of the chunk into m_total.
+//Always returns true.
+bool SingleGram::get_total_freq(guint32 & m_total){
+    const guint32 * total_slot = (const guint32 *) m_chunk.begin();
+    m_total = *total_slot;
+    return true;
+}
+
+bool SingleGram::prune(){
+#if 1
+    SingleGramItem * begin = (SingleGramItem *)
+       ((const char *)(m_chunk.begin()) + sizeof(guint32));
+    SingleGramItem * end = (SingleGramItem *)m_chunk.end();
+    
+    size_t nitem = 0;
+    for ( SingleGramItem * cur = begin; cur != end; ++cur){
+       cur->m_freq--;
+       nitem++;
+       if ( cur->m_freq == 0 ){
+           size_t offset = sizeof(guint32) + (cur - begin)
+               * sizeof(SingleGramItem) ;
+           m_chunk.remove_content(offset, sizeof(SingleGramItem));
+       }
+    }
+    guint32 total_freq;
+    assert(get_total_freq(total_freq));
+    assert(set_total_freq(total_freq - nitem));
+#endif
+       return true;
+}
+
+//Ordering predicate for lower_bound: compares items by token only.
+bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){
+    return rhs.m_token > lhs.m_token;
+}
+
+bool SingleGram::search(/* in */ PhraseIndexRange * range, 
+                       /* out */ BigramPhraseArray array){
+    const SingleGramItem * begin = (const SingleGramItem *)
+       ((const char *)(m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
+    SingleGramItem compare_item;
+    compare_item.m_token = range->m_range_begin;
+    const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+    guint32 total_freq;
+    BigramPhraseItem bigram_item;
+    assert(get_total_freq(total_freq));
+    for ( ; cur_item != end; ++cur_item){
+       if ( cur_item->m_token >= range->m_range_end )
+           break;
+       bigram_item.m_token = cur_item->m_token;
+       bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
+       g_array_append_val(array, bigram_item);
+    }
+    return true;
+}
+
+bool SingleGram::get_freq(/* in */ phrase_token_t token,
+                       /* out */ guint32 & freq){
+    freq = 0;
+    const SingleGramItem * begin = (const SingleGramItem *)
+       ((const char *)(m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
+    SingleGramItem compare_item;
+    compare_item.m_token = token;
+    const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+    
+    for ( ; cur_item != end; ++cur_item){
+       if ( cur_item->m_token > token )
+           return false;
+       if ( cur_item->m_token == token ){
+           freq = cur_item -> m_freq;
+           return true;
+       }
+    }
+    return false;
+}
+
+bool SingleGram::set_freq(/* in */ phrase_token_t token,
+                             guint32 freq){
+    SingleGramItem * begin = (SingleGramItem *)
+       ((const char *)(m_chunk.begin()) + sizeof(guint32));
+    SingleGramItem * end = (SingleGramItem *)m_chunk.end();
+    SingleGramItem compare_item;
+    compare_item.m_token = token;
+    SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+    
+    SingleGramItem insert_item;
+    insert_item.m_token = token;
+    insert_item.m_freq = freq;
+    for ( ;cur_item != end; ++cur_item){
+       if ( cur_item->m_token > token ){
+           size_t offset  = sizeof(guint32) + 
+               sizeof(SingleGramItem) * (cur_item - begin);
+           m_chunk.insert_content(offset, &insert_item, 
+                                  sizeof(SingleGramItem));
+           return true;
+       }
+       if ( cur_item->m_token == token ){
+           cur_item -> m_freq = freq;
+           return true;
+       }
+    }
+    m_chunk.insert_content(m_chunk.size(), &insert_item, 
+                          sizeof(SingleGramItem));
+    return true;
+}
+
+
+bool Bigram::attach(const char * systemfile, const char * userfile){
+    reset();
+    if ( systemfile ){
+       int ret = db_create(&m_system, NULL, 0);
+       if ( ret != 0 )
+           assert(false);
+       
+       m_system->open(m_system, NULL, systemfile, NULL, 
+                      DB_HASH, DB_RDONLY, 0664);
+       if ( ret != 0)
+           return false;
+    }
+
+    if ( userfile ){
+       int ret = db_create(&m_user, NULL, 0);
+       if ( ret != 0 )
+           assert(false);
+       
+       m_user->open(m_user, NULL, userfile, NULL, DB_HASH, DB_CREATE, 0664);
+       if ( ret != 0)
+           return false;       
+    }
+    return true;
+}
+
+bool Bigram::load(phrase_token_t index, SingleGram * & system_gram, SingleGram * & user_gram){
+    DBT db_key;
+    memset(&db_key, 0, sizeof(DBT));
+    db_key.data = &index;
+    db_key.size = sizeof(phrase_token_t);
+    
+    system_gram = NULL; user_gram = NULL;
+    if ( m_system ){
+       DBT db_data;
+       memset(&db_data, 0, sizeof(DBT));
+       int ret = m_system->get(m_system, NULL, &db_key, &db_data, 0);
+       if ( ret == 0 )
+           system_gram = new SingleGram(db_data.data, db_data.size);
+    }
+    if ( m_user ){
+       DBT db_data;
+       memset(&db_data, 0, sizeof(DBT));
+       int ret = m_user->get(m_user, NULL, &db_key, &db_data, 0);
+       if ( ret == 0 )
+           user_gram = new SingleGram(db_data.data, db_data.size);
+    }
+    return true;
+}
+
+bool Bigram::store(phrase_token_t index, SingleGram * user_gram){
+    if ( !m_user )
+       return false;
+    DBT db_key;
+    memset(&db_key, 0, sizeof(DBT));
+    db_key.data = &index;
+    db_key.size = sizeof(phrase_token_t);
+    DBT db_data;
+    memset(&db_data, 0, sizeof(DBT));
+    db_data.data = user_gram->m_chunk.begin();
+    db_data.size = user_gram->m_chunk.size();
+    
+    int ret = m_user->put(m_user, NULL, &db_key, &db_data, 0);
+    return ret == 0;
+}
+
+bool Bigram::get_all_items(GArray * system, GArray * user){
+    bool retval = false;
+    g_array_set_size(system, 0);
+    g_array_set_size(user, 0);
+    if ( m_system ){
+       DBC * cursorp;
+       DBT key, data;
+       int ret;
+       /* Get a cursor */
+       m_system->cursor(m_system, NULL, &cursorp, 0); 
+       
+       /* Initialize our DBTs. */
+       memset(&key, 0, sizeof(DBT));
+       memset(&data, 0, sizeof(DBT));
+       
+       /* Iterate over the database, retrieving each record in turn. */
+       while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+           assert(key.size == sizeof(phrase_token_t));
+           phrase_token_t * token = (phrase_token_t *)key.data;
+           g_array_append_val(system, *token);
+       }
+       
+       if (ret != DB_NOTFOUND) {
+           fprintf(stderr, "system db error, exit!");
+           exit(1);
+       }
+
+       /* Cursors must be closed */
+       if (cursorp != NULL) 
+           cursorp->c_close(cursorp); 
+
+       retval = true;
+    }
+    if ( m_user ){
+       DBC * cursorp;
+       DBT key, data;
+       int ret;
+       /* Get a cursor */
+       m_user->cursor(m_user, NULL, &cursorp, 0);
+
+       /* Initialize out DBTs. */
+       memset(&key, 0, sizeof(DBT));
+       memset(&data, 0, sizeof(DBT));
+       
+       /* Iterate over the database, retrieving each record in turn. */
+       while((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+           assert(key.size == sizeof(phrase_token_t));
+           phrase_token_t * token = (phrase_token_t *) key.data;
+           g_array_append_val(user, *token);
+       }
+       
+       if (ret != DB_NOTFOUND){
+           fprintf(stderr, "user db error, exit!");
+           exit(1);
+       }
+       
+       /* Cursor must be closed */
+       if ( cursorp != NULL)
+           cursorp->c_close(cursorp);
+
+       retval = true;
+    }
+    return retval;
+}
diff --git a/src/storage/ngram.h b/src/storage/ngram.h
new file mode 100644 (file)
index 0000000..39a9ecc
--- /dev/null
@@ -0,0 +1,119 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef NGRAM_H
+#define NGRAM_H
+
+#include <db.h>
+
+namespace novel{
+
+class Bigram;
+
+/* Note:
+ * When transferring from the system ngram to the user ngram:
+ *   if the user ngram doesn't exist,
+ *     copy the total freq from the system ngram to the user ngram,
+ *     so that the total freq exists.
+ *   if the item freq doesn't exist, copy the item freq from the system
+ *     ngram to the user ngram, so that the item freq exists.
+ *     if the user ngram already exists (always true), increase the total freq,
+ *     if the item ngram already exists (always true), increase the freq.
+ */
+
+/* Frequencies of the tokens that follow one given token.
+ * All data lives in m_chunk; Bigram is a friend so that it can build a
+ * SingleGram from a database record and read the raw bytes back.
+ */
+class SingleGram{
+    friend class Bigram;
+private:
+    MemoryChunk m_chunk;
+    /* build a gram over an existing buffer of length bytes */
+    SingleGram(void * buffer, size_t length);
+public:
+    /* Null Constructor */
+    SingleGram();
+    /* search method */
+    /* the array result contains many items */
+    bool search(/* in */ PhraseIndexRange * range, 
+              /* out */ BigramPhraseArray array);
+
+    /* get_freq method
+     * reads the frequency stored for token into freq
+     */
+    bool get_freq(/* in */ phrase_token_t token,
+              /* out */ guint32 & freq); 
+    
+    /* set_freq method
+     * stores freq as the frequency of token
+     */
+    bool set_freq(/* in */ phrase_token_t token,
+                 guint32 freq);
+
+    /* set_total_freq method
+     * used in user bigram table
+     */
+    bool set_total_freq(guint32 m_total);
+    
+    /* get_total_freq method
+     * used in user bigram table
+     */
+    bool get_total_freq(guint32 & m_total);
+    
+    /* prune one method
+     * only used in training
+     */
+    bool prune();
+};
+
+class Bigram{
+private:
+    DB * m_system;
+    DB * m_user;
+public:
+    Bigram(){
+       m_system = NULL; m_user = NULL;
+    }
+
+    ~Bigram(){
+       reset();
+    }
+
+    void reset(){
+       if ( m_system ){
+           m_system->close(m_system, 0);
+           m_system = NULL;
+       }
+       if ( m_user ){
+           m_user->close(m_user, 0);
+           m_user = NULL;
+       }
+    }
+    
+    /* attach system and user bi-gram */
+    /* when with training systemdb is NULL, only user_gram */
+    bool attach(const char * systemfile, const char * userfile);
+
+    bool load(phrase_token_t index, SingleGram * & system_gram, SingleGram * & user_gram);
+    bool store(phrase_token_t index, SingleGram * user_gram);
+    /* array of phrase_token_t items, for parameter estimation. */
+    bool get_all_items(GArray * system, GArray * user);
+};
+
+};
+
+using namespace novel;
+
+
+#endif
diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp
new file mode 100644 (file)
index 0000000..7dbecb3
--- /dev/null
@@ -0,0 +1,340 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "phrase_index.h"
+
+//Store the pronunciation count in the second byte of the item header.
+//Always returns true.
+bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
+    const size_t count_offset = sizeof(guint8);
+    m_chunk.set_content(count_offset, &n_prouns, sizeof(n_prouns));
+    return true;
+}
+
+bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
+    guint8 phrase_length = get_phrase_length();
+    table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32));
+    bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey));
+    if ( !retval )
+       return retval;
+    return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32));
+}
+
+//Add one pronunciation at the end of the item: bump the stored count,
+//then append the pinyin keys followed by the frequency.
+void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
+    const size_t keys_bytes = get_phrase_length() * sizeof(PinyinKey);
+    const guint8 new_count = get_n_pronunciation() + 1;
+    set_n_pronunciation(new_count);
+
+    //each write lands at the current end of the chunk.
+    m_chunk.set_content(m_chunk.size(), pinyin, keys_bytes);
+    m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
+}
+
+//Delete the index-th pronunciation: lower the stored count, then cut the
+//entry (pinyin keys plus frequency) out of the chunk.
+//NOTE(review): index is not checked against the pronunciation count.
+void PhraseItem::remove_nth_pronunciation(size_t index){
+    const guint8 phrase_length = get_phrase_length();
+    const size_t entry_bytes = phrase_length * sizeof(PinyinKey) + sizeof(guint32);
+    const size_t entries_start = phrase_item_header + phrase_length * sizeof ( utf16_t );
+
+    set_n_pronunciation(get_n_pronunciation() - 1);
+    m_chunk.remove_content(entries_start + index * entry_bytes, entry_bytes);
+}
+
+//Copy the utf16 phrase text, which follows the item header, into phrase.
+//Returns the result of the chunk read.
+bool PhraseItem::get_phrase_string(utf16_t * phrase){
+    const size_t text_bytes = get_phrase_length() * sizeof(utf16_t);
+    return m_chunk.get_content(phrase_item_header, phrase, text_bytes);
+}
+
+//Write the phrase length into the first header byte and the utf16 text
+//right after the header. Always returns true.
+bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
+    const size_t text_bytes = phrase_length * sizeof(utf16_t);
+    m_chunk.set_content(0, &phrase_length, sizeof(guint8));
+    m_chunk.set_content(phrase_item_header, phrase, text_bytes);
+    return true;
+}
+
+void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
+                                            PinyinKey * pinyin_keys,
+                                            gint32 delta){
+    guint8 phrase_length = get_phrase_length();
+    guint8 npron = get_n_pronunciation();
+    size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
+    char * buf_begin = (char *) m_chunk.begin();
+    guint32 total_freq = 0;
+    for ( int i = 0 ; i < npron ; ++i){
+       char * pinyin_begin = buf_begin + offset +
+           i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
+       guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
+       total_freq += *freq;
+       if ( 0 == pinyin_compare_with_ambiguities(custom,
+                                                 (PinyinKey *)pinyin_begin,
+                                                 pinyin_keys,
+                                                 phrase_length)){
+           //protect against total_freq overflow.
+           if ( delta > 0 && total_freq > total_freq + delta )
+               return;
+           *freq += delta;
+           total_freq += delta;
+       }
+    }
+}
+
+
+//Returns m_total_freq, the running total that add_phrase_item,
+//remove_phrase_item and add_unigram_frequency keep up to date.
+guint32 SubPhraseIndex::get_phrase_index_total_freq(){
+    return m_total_freq;
+}
+
+bool SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
+    table_offset_t offset;
+    guint32 freq;
+    bool result = m_phrase_index.get_content
+       ((token & PHRASE_MASK) 
+        * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
+
+    if ( !result)
+       return result;
+
+    if ( 0 == offset )
+       return false;
+
+    result = m_phrase_content.get_content
+       (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
+    //protect total_freq overflow
+    if ( delta > 0 && m_total_freq > m_total_freq + delta )
+       return false;
+    freq += delta;
+    m_total_freq += delta;
+    return m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
+}
+
+bool SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
+    table_offset_t offset;
+    guint8 phrase_length;
+    guint8 n_prons;
+    
+    bool result = m_phrase_index.get_content
+       ((token & PHRASE_MASK) 
+        * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
+
+    if ( !result )
+       return result;
+
+    if ( 0 == offset )
+       return false;
+
+    result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
+    if ( !result ) 
+       return result;
+    
+    result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
+    if ( !result ) 
+       return result;
+
+    size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
+    item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
+    return true;
+}
+
+bool SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
+    table_offset_t offset = m_phrase_content.size();
+    if ( 0 == offset )
+       offset = 8;
+    m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
+    m_phrase_index.set_content((token & PHRASE_MASK) 
+                              * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
+    m_total_freq += item->get_unigram_frequency();
+    return true;
+}
+
+bool SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
+    table_offset_t offset;
+    guint8 phrase_length;
+    guint8 n_prons;
+    
+    bool result = m_phrase_index.get_content
+       ((token & PHRASE_MASK)
+        * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
+    
+    if ( !result )
+       return result;
+
+    if ( 0 == offset )
+       return false;
+
+    result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
+    if ( !result )
+       return result;
+
+    result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
+    if ( !result )
+       return result;
+    
+    size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
+    item = new PhraseItem;
+    //implictly copy data from m_chunk_content.
+    item->m_chunk.set_content(0, (char *) m_phrase_content.begin() + offset, length);
+
+    const table_offset_t zero_const = 0;
+    m_phrase_index.set_content((token & PHRASE_MASK)
+                              * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
+    m_total_freq -= item->get_unigram_frequency();
+    return true;
+}
+
+//Load chunk into the sub index numbered phrase_index, creating the sub
+//index on first use, and add its total frequency to the facade total.
+//Returns the result of SubPhraseIndex::load.
+bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
+    SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+    if ( !sub_phrases ){
+        sub_phrases = new SubPhraseIndex;
+    }else{
+        //Reloading an occupied slot: take the old contribution out first,
+        //otherwise the facade total counts this sub index twice.
+        m_total_freq -= sub_phrases->get_phrase_index_total_freq();
+    }
+
+    bool retval = sub_phrases->load(chunk, 0, chunk->size());
+    if ( !retval )
+        return retval;
+    m_total_freq += sub_phrases->get_phrase_index_total_freq();
+    return retval;
+}
+
+//Serialise the sub index numbered phrase_index into new_chunk, starting
+//at offset 0. Returns false when that sub index is not loaded, otherwise
+//the result of SubPhraseIndex::store.
+bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
+    table_offset_t end = 0;
+    SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+    if ( !sub_phrases )
+        return false;
+
+    //pass the store result on instead of always reporting success.
+    return sub_phrases->store(new_chunk, 0, end);
+}
+
+//Drop the sub index numbered phrase_index: subtract its total frequency
+//from the facade total, delete it and clear the slot.
+//Returns false when that sub index is not loaded.
+bool FacadePhraseIndex::unload(guint8 phrase_index){
+    SubPhraseIndex * & slot = m_sub_phrase_indices[phrase_index];
+    if ( NULL == slot )
+        return false;
+
+    const guint32 removed_freq = slot->get_phrase_index_total_freq();
+    m_total_freq -= removed_freq;
+
+    delete slot;
+    slot = NULL;
+    return true;
+}
+
+bool SubPhraseIndex::load(MemoryChunk * chunk, 
+                         table_offset_t offset, table_offset_t end){
+    //save the memory chunk
+    if ( m_chunk ){
+       delete m_chunk;
+       m_chunk = NULL;
+    }
+    m_chunk = chunk;
+    
+    char * buf_begin = (char *)chunk->begin();
+    chunk->get_content(offset, &m_total_freq, sizeof(guint32));
+    offset += sizeof(guint32);
+    table_offset_t index_one, index_two, index_three;
+    chunk->get_content(offset, &index_one, sizeof(table_offset_t));
+    offset += sizeof(table_offset_t);
+    chunk->get_content(offset, &index_two, sizeof(table_offset_t));
+    offset += sizeof(table_offset_t);
+    chunk->get_content(offset, &index_three, sizeof(table_offset_t));
+    offset += sizeof(table_offset_t);
+    g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
+    g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
+    g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
+    m_phrase_index.set_chunk(buf_begin + index_one, 
+                            index_two - 1 - index_one, NULL);
+    m_phrase_content.set_chunk(buf_begin + index_two, 
+                                index_three - 1 - index_two, NULL);
+    g_return_val_if_fail( index_three <= end, FALSE);
+    return true;
+}
+
+bool SubPhraseIndex::store(MemoryChunk * new_chunk, 
+                          table_offset_t offset, table_offset_t& end){
+    new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
+    table_offset_t index = offset + sizeof(guint32);
+        
+    offset = index + sizeof(table_offset_t) * 3 ;
+    new_chunk->set_content(offset, &c_separate, sizeof(char));
+    offset += sizeof(char);
+    
+    new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+    index += sizeof(table_offset_t);
+    new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
+    offset += m_phrase_index.size();
+    new_chunk->set_content(offset, &c_separate, sizeof(char));
+    offset += sizeof(char);
+
+    new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+    index += sizeof(table_offset_t);
+    
+    new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
+    offset += m_phrase_content.size();
+    new_chunk->set_content(offset, &c_separate, sizeof(char));
+    offset += sizeof(char);
+    new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+    return true;
+}
+
+bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
+    SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+    if ( !sub_phrases ){
+       sub_phrases = new SubPhraseIndex;
+    }
+
+    char pinyin[256];
+    char phrase[256];
+    phrase_token_t token;
+    size_t freq;
+    PhraseItem * item_ptr = new PhraseItem;
+    phrase_token_t cur_token = 0;
+    while ( !feof(infile)){
+        fscanf(infile, "%s", pinyin);
+        fscanf(infile, "%s", phrase);
+        fscanf(infile, "%ld", &token);
+       fscanf(infile, "%ld", &freq);
+       if ( feof(infile) )
+           break;
+
+       glong written;
+       utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL, 
+                                              &written, NULL);
+       
+       if ( 0 == cur_token ){
+           cur_token = token;
+           item_ptr->set_phrase_string(written, phrase_utf16);
+       }
+
+       if ( cur_token != token ){
+           add_phrase_item( cur_token, item_ptr);
+           delete item_ptr;
+           item_ptr = new PhraseItem;
+           cur_token = token;
+           item_ptr->set_phrase_string(written, phrase_utf16);
+       }
+
+       PinyinDefaultParser parser;
+       NullPinyinValidator validator;
+       PinyinKeyVector keys;
+       PinyinKeyPosVector poses;
+       
+       keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
+       poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
+       parser.parse(validator, keys, poses, pinyin);
+       
+       assert ( item_ptr->get_phrase_length() == keys->len );
+       item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
+
+       g_array_free(keys, TRUE);
+       g_array_free(poses, TRUE);
+       g_free(phrase_utf16);
+    }
+
+    add_phrase_item( cur_token, item_ptr);
+    delete item_ptr;
+    m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
+    return true;
+}
diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h
new file mode 100755 (executable)
index 0000000..e635453
--- /dev/null
@@ -0,0 +1,250 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef PHRASE_INDEX_H
+#define PHRASE_INDEX_H
+
+#include <stdio.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "memory_chunk.h"
+
+class PinyinLookup;
+
+namespace novel{
+
+/* Because this is not large,
+ * Store this in user home directory.
+ */
+
+/* Size in bytes of the fixed header at the start of every phrase item:
+ * phrase length (guint8), number of pronunciations (guint8) and unigram
+ * frequency (guint32), in that order (see the PhraseItem getters).
+ */
+const int phrase_item_header = sizeof(guint8) + sizeof(guint8) + sizeof(guint32);
+
+class PhraseItem{
+    friend class SubPhraseIndex;
+private:
+    MemoryChunk m_chunk;
+    bool set_n_pronunciation(guint8 n_prouns);
+public:
+    /* Null Constructor */
+    PhraseItem(){
+       m_chunk.set_size(phrase_item_header);
+       memset(m_chunk.begin(), 0, m_chunk.size());
+    }
+
+    PhraseItem(MemoryChunk chunk){
+       m_chunk = chunk;
+       assert ( m_chunk.size() >= phrase_item_header);
+    }
+
+    /* functions */
+    guint8 get_phrase_length(){
+       char * buf_begin = (char *)m_chunk.begin();
+       return (*(guint8 *)buf_begin);
+    }
+
+    guint8 get_n_pronunciation(){
+       char * buf_begin = ( char *) m_chunk.begin();
+       return (*(guint8 *)(buf_begin + sizeof(guint8)));
+    }
+
+    guint32 get_unigram_frequency(){
+       char * buf_begin = (char *)m_chunk.begin();
+       return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8)));
+    }
+
+    gfloat get_pinyin_possibility(PinyinCustomSettings & custom, 
+                                 PinyinKey * pinyin_keys){
+       guint8 phrase_length = get_phrase_length();
+       guint8 npron = get_n_pronunciation();
+       size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
+       char * buf_begin = (char *)m_chunk.begin();
+       guint32 matched = 0, total_freq =0;
+       for ( int i = 0 ; i < npron ; ++i){
+           char * pinyin_begin = buf_begin + offset + 
+               i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) );
+           guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey));
+           total_freq += *freq;
+           if ( 0 == pinyin_compare_with_ambiguities(custom, 
+                                                     (PinyinKey *)pinyin_begin,
+                                                     pinyin_keys,
+                                                     phrase_length)){
+               matched += *freq;
+           }
+       }
+       // use preprocessor to avoid zero freq, in gen_pinyin_table.
+       /*
+       if ( 0 == total_freq )
+           return 0.1;
+       */
+       gfloat retval = matched / (gfloat) total_freq;
+       /*
+       if ( 0 == retval )
+           return 0.03;
+       */
+       return retval;
+    }
+    
+    void increase_pinyin_possibility(PinyinCustomSettings & custom,
+                                    PinyinKey * pinyin_keys,
+                                    gint32 delta);
+
+    bool get_phrase_string(utf16_t * phrase);
+    bool set_phrase_string(guint8 phrase_length, utf16_t * phrase);
+    bool get_nth_pronunciation(size_t index, 
+                              /* out */ PinyinKey * pinyin, 
+                              /* out */ guint32 & freq);
+    /* Normally don't change the first pronunciation,
+     * which decides the token number.
+     */
+    void append_pronunciation(PinyinKey * pinyin, guint32 freq);
+    void remove_nth_pronunciation(size_t index);
+};
+
+/*
+ *  In Sub Phrase Index, token == (token & PHRASE_MASK).
+ */
+
+class SubPhraseIndex{
+private:
+    guint32 m_total_freq;
+    MemoryChunk m_phrase_index;
+    MemoryChunk m_phrase_content;
+    MemoryChunk * m_chunk;
+public:
+    SubPhraseIndex():m_total_freq(0){
+       m_chunk = NULL;
+    }
+
+    ~SubPhraseIndex(){
+       reset();
+    }
+
+    void reset(){
+       if ( m_chunk ){
+           delete m_chunk;
+           m_chunk = NULL;
+       }
+    }    
+    
+    bool load(MemoryChunk * chunk, 
+             table_offset_t offset, table_offset_t end);
+    bool store(MemoryChunk * new_chunk, 
+              table_offset_t offset, table_offset_t & end);
+    
+    /* Zero-gram */
+    guint32 get_phrase_index_total_freq();
+    bool add_unigram_frequency(phrase_token_t token, guint32 delta);
+    /* get_phrase_item function can't modify the phrase item, 
+     * but can increment the freq of the special pronunciation.
+     */
+    bool get_phrase_item(phrase_token_t token, PhraseItem & item);
+    bool add_phrase_item(phrase_token_t token, PhraseItem * item);
+    /* remove_phrase_item will substract item->get_unigram_frequency()
+     * from m_total_freq
+     */
+    bool remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item);
+};
+
+class FacadePhraseIndex{
+    friend class ::PinyinLookup;
+private:
+    guint32 m_total_freq;
+    SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT];
+public:
+    FacadePhraseIndex(){
+       m_total_freq = 0;
+       memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices));
+    }
+
+    ~FacadePhraseIndex(){
+       for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){
+           if ( m_sub_phrase_indices[i] ){
+               delete m_sub_phrase_indices[i];
+               m_sub_phrase_indices[i] = NULL;
+           }
+       }
+    }
+
+    /* load/store single sub phrase index, according to the config files. */
+    bool load_text(guint8 phrase_index, FILE * infile);
+    bool load(guint8 phrase_index, MemoryChunk * chunk);
+    bool store(guint8 phrase_index, MemoryChunk * new_chunk);
+    bool unload(guint8 phrase_index);
+
+    /* Zero-gram */
+    guint32 get_phrase_index_total_freq(){
+       return m_total_freq;
+    }
+
+    bool add_unigram_frequency(phrase_token_t token, guint32 delta){
+       guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
+       SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
+       if ( !sub_phrase )
+           return false;
+       m_total_freq += delta;
+       return sub_phrase->add_unigram_frequency(token, delta);
+    }
+
+    /* get_phrase_item function can't modify the phrase item */
+    bool get_phrase_item(phrase_token_t token, PhraseItem & item){
+       guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
+       SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
+       if ( !sub_phrase )
+           return false;
+       return sub_phrase->get_phrase_item(token, item);
+    }
+
+    bool add_phrase_item(phrase_token_t token, PhraseItem * item){
+       guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
+       SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
+       if ( !sub_phrase ){
+           sub_phrase = new SubPhraseIndex;
+       }   
+       m_total_freq += item->get_unigram_frequency();
+       return sub_phrase->add_phrase_item(token, item);
+    }
+
+    bool remove_phrase_item(phrase_token_t token, PhraseItem * & item){
+       guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
+       SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
+       if ( !sub_phrase ){
+           return false;
+       }
+       bool result = sub_phrase->remove_phrase_item(token, item);
+       if ( !result )
+           return result;
+       m_total_freq -= item->get_unigram_frequency();
+       return result;
+    }
+};
+};
+
+using namespace novel;
+
+
+
+
+
+#endif
diff --git a/src/storage/pinyin_base.cpp b/src/storage/pinyin_base.cpp
new file mode 100644 (file)
index 0000000..cffee3c
--- /dev/null
@@ -0,0 +1,1425 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2002,2003,2006 James Su
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "stl_lite.h"
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+
+// Internal data definition
+
+/**
+ * struct of pinyin token.
+ *
+ * This struct stores the information of one pinyin token
+ * (an initial, a final or a tone): its Latin spelling, its Zhuyin
+ * spelling and the length of each.
+ * The tables below initialise it positionally, so the member order must
+ * not change.
+ */
+struct PinyinToken
+{
+    const char *latin;      /**< Latin name of the token. */
+    const char *zhuyin;     /**< Zhuyin name in UTF-8. */
+    int   latin_len;  /**< length of Latin name. */
+    int   zhuyin_len; /**< length of Zhuyin name, in characters (not UTF-8 bytes). */
+};
+
+/**
+ * struct to index PinyinToken list.
+ *
+ * Describes a run of consecutive entries in a PinyinToken table.
+ * The index tables below use {-1, 0} for "no entry".
+ */
+struct PinyinTokenIndex
+{
+    int start;  /**< index of the first entry of the run, -1 when empty. */
+    int num;    /**< number of entries in the run. */
+};
+
+/*
+ * Table of pinyin initials.  Entry 0 is the empty initial.
+ * __pinyin_initials_index refers to entries by position, so the order is
+ * significant; note that it is not strictly alphabetical (h precedes g,
+ * m and n precede l, r precedes p).
+ * NOTE(review): the order presumably mirrors the PinyinInitial
+ * enumeration -- confirm against pinyin_base.h before reordering.
+ */
+static const PinyinToken __pinyin_initials[] =
+{
+    {"", "", 0, 0},
+    {"b", "ㄅ", 1, 1},
+    {"c", "ㄘ", 1, 1},
+    {"ch","ㄔ", 2, 1},
+    {"d", "ㄉ", 1, 1},
+    {"f", "ㄈ", 1, 1},
+    {"h", "ㄏ", 1, 1},
+    {"g", "ㄍ", 1, 1},
+    {"j", "ㄐ", 1, 1},
+    {"k", "ㄎ", 1, 1},
+    {"m", "ㄇ", 1, 1},
+    {"n", "ㄋ", 1, 1},
+    {"l", "ㄌ", 1, 1},
+    {"r", "ㄖ", 1, 1},
+    {"p", "ㄆ", 1, 1},
+    {"q", "ㄑ", 1, 1},
+    {"s", "ㄙ", 1, 1},
+    {"sh","ㄕ", 2, 1},
+    {"t", "ㄊ", 1, 1},
+    {"w", "ㄨ", 1, 1},  //Should be omitted in some case.
+    {"x", "ㄒ", 1, 1},
+    {"y", "ㄧ", 1, 1},  //Should be omitted in some case.
+    {"z", "ㄗ", 1, 1},
+    {"zh","ㄓ", 2, 1}
+};
+
+/*
+ * Table of pinyin finals, sorted by Latin spelling.  Entry 0 is the empty
+ * final.  __pinyin_finals_index refers to entries by position, so the
+ * order is significant.
+ * "ue" and "ve" share the same Zhuyin spelling, as do "ong" and "ueng".
+ * NOTE(review): the order presumably mirrors the PinyinFinal
+ * enumeration -- confirm against pinyin_base.h before reordering.
+ */
+static const PinyinToken __pinyin_finals[] =
+{
+    {"", "", 0, 0},
+    {"a",   "ㄚ",   1, 1},
+    {"ai",  "ㄞ",   2, 1},
+    {"an",  "ㄢ",   2, 1},
+    {"ang", "ㄤ",   3, 1},
+    {"ao",  "ㄠ",   2, 1},
+    {"e",   "ㄜ",   1, 1},
+    {"ea",  "ㄝ",   2, 1},
+    {"ei",  "ㄟ",   2, 1},
+    {"en",  "ㄣ",   2, 1},
+    {"eng", "ㄥ",   3, 1},
+    {"er",  "ㄦ",   2, 1},
+    {"i",   "ㄧ",   1, 1},
+    {"ia",  "ㄧㄚ", 2, 2},
+    {"ian", "ㄧㄢ", 3, 2},
+    {"iang","ㄧㄤ", 4, 2},
+    {"iao", "ㄧㄠ", 3, 2},
+    {"ie",  "ㄧㄝ", 2, 2},
+    {"in",  "ㄧㄣ", 2, 2},
+    {"ing", "ㄧㄥ", 3, 2},
+    {"iong","ㄩㄥ", 4, 2},
+    {"iu",  "ㄧㄡ", 2, 2},
+    {"ng",  "ㄣ",   2, 1},
+    {"o",   "ㄛ",   1, 1},
+    {"ong", "ㄨㄥ", 3, 2},
+    {"ou",  "ㄡ",   2, 1},
+    {"u",   "ㄨ",   1, 1},
+    {"ua",  "ㄨㄚ", 2, 2},
+    {"uai", "ㄨㄞ", 3, 2},
+    {"uan", "ㄨㄢ", 3, 2},
+    {"uang","ㄨㄤ", 4, 2},
+    {"ue",  "ㄩㄝ", 2, 2},
+    {"ueng","ㄨㄥ", 4, 2},
+    {"ui",  "ㄨㄟ", 2, 2},
+    {"un",  "ㄨㄣ", 2, 2},
+    {"uo",  "ㄨㄛ", 2, 2},
+    {"v",   "ㄩ",   1, 1},
+    {"van", "ㄩㄢ", 3, 2},
+    {"ve",  "ㄩㄝ", 2, 2},
+    {"vn",  "ㄩㄣ", 2, 2}
+};
+
+/*
+ * Table of tones: entry 0 is "no tone", entries 1..5 pair the digit used
+ * in the Latin spelling with the tone mark used in the Zhuyin spelling.
+ */
+static const PinyinToken __pinyin_tones [] =
+{
+    {"", "", 0, 0},
+    {"1", "ˉ", 1, 1},
+    {"2", "ˊ", 1, 1},
+    {"3", "ˇ", 1, 1},
+    {"4", "ˋ", 1, 1},
+    {"5", "˙", 1, 1}
+};
+
+/*
+ * One entry per letter 'a'..'z' (26 entries).  {start, num} is the
+ * position in __pinyin_initials of the first initial beginning with that
+ * letter and the number of consecutive initials that do (e.g. 'c' covers
+ * "c" and "ch").  {-1,0} marks letters that begin no initial.
+ */
+static const PinyinTokenIndex __pinyin_initials_index[] =
+{
+    //a     b      c      d     e       f      g      h      i      j      k      l      m 
+    {-1,0},{1,1}, {2,2}, {4,1}, {-1,0},{5,1}, {7,1}, {6,1}, {-1,0},{8,1}, {9,1}, {12,1},{10,1},
+    //n     o      p      q      r      s      t      u      v      w      x      y      z
+    {11,1},{-1,0},{14,1},{15,1},{13,1},{16,2},{18,1},{-1,0},{-1,0},{19,1},{20,1},{21,1},{22,2}
+};
+
+/*
+ * One entry per letter 'a'..'z' (26 entries).  {start, num} is the
+ * position in __pinyin_finals of the first final beginning with that
+ * letter and the number of consecutive finals that do (e.g. 'a' covers
+ * "a", "ai", "an", "ang", "ao").  {-1,0} marks letters that begin no
+ * final.
+ */
+static const PinyinTokenIndex __pinyin_finals_index[] =
+{
+    //a     b      c      d      e     f      g      h      i       j      k      l      m 
+    {1,5}, {-1,0},{-1,0},{-1,0},{6,6},{-1,0},{-1,0},{-1,0},{12,10},{-1,0},{-1,0},{-1,0},{-1,0},
+    //n     o      p      q      r      s      t      u      v      w      x      y      z
+    {22,1},{23,3},{-1,0},{-1,0},{-1,0},{-1,0},{-1,0},{26,10},{36,4},{-1,0},{-1,0},{-1,0},{-1,0}
+};
+
+
+
+/*
+ * Initial map of the "stone" Shuang Pin scheme: 27 entries, one per key in
+ * the order A..Z followed by ';' (see the per-line comments).  Each entry
+ * is the PinyinInitial assigned to that key; PINYIN_ZeroInitial is used
+ * where no consonant initial is assigned.
+ */
+static const PinyinInitial __shuang_pin_stone_initial_map [] =
+{
+    PINYIN_ZeroInitial,    // A
+    PINYIN_Bo,             // B
+    PINYIN_Ci,             // C
+    PINYIN_De,             // D
+    PINYIN_ZeroInitial,    // E
+    PINYIN_Fo,             // F
+    PINYIN_Ge,             // G
+    PINYIN_He,             // H
+    PINYIN_Shi,            // I
+    PINYIN_Ji,             // J
+    PINYIN_Ke,             // K
+    PINYIN_Le,             // L
+    PINYIN_Mo,             // M
+    PINYIN_Ne,             // N
+    PINYIN_ZeroInitial,    // O
+    PINYIN_Po,             // P
+    PINYIN_Qi,             // Q
+    PINYIN_Ri,             // R
+    PINYIN_Si,             // S
+    PINYIN_Te,             // T
+    PINYIN_Chi,            // U
+    PINYIN_Zhi,            // V
+    PINYIN_Wu,             // W
+    PINYIN_Xi,             // X
+    PINYIN_Yi,             // Y
+    PINYIN_Zi,             // Z
+    PINYIN_ZeroInitial,    // ;
+};
+
+/*
+ * Final map of the "stone" Shuang Pin scheme: 27 rows, one per key in the
+ * order A..Z followed by ';'.  Each row lists up to two candidate finals
+ * for the key; the second slot is PINYIN_ZeroFinal when the key has only
+ * one.
+ * NOTE(review): how the two candidates are told apart is not visible in
+ * this table -- confirm against the Shuang Pin parser.
+ */
+static const PinyinFinal __shuang_pin_stone_final_map [][2] =
+{
+    { PINYIN_A,   PINYIN_ZeroFinal },         // A
+    { PINYIN_Ia,  PINYIN_Ua        },         // B
+    { PINYIN_Uan, PINYIN_ZeroFinal },         // C
+    { PINYIN_Ao,  PINYIN_ZeroFinal },         // D
+    { PINYIN_E,   PINYIN_ZeroFinal },         // E
+    { PINYIN_An,  PINYIN_ZeroFinal },         // F
+    { PINYIN_Ang, PINYIN_ZeroFinal },         // G
+    { PINYIN_Uang,PINYIN_Iang      },         // H
+    { PINYIN_I,   PINYIN_ZeroFinal },         // I
+    { PINYIN_Ian, PINYIN_ZeroFinal },         // J
+    { PINYIN_Iao, PINYIN_ZeroFinal },         // K
+    { PINYIN_In,  PINYIN_ZeroFinal },         // L
+    { PINYIN_Ie,  PINYIN_ZeroFinal },         // M
+    { PINYIN_Iu,  PINYIN_ZeroFinal },         // N
+    { PINYIN_Uo,  PINYIN_O         },         // O
+    { PINYIN_Ou,  PINYIN_ZeroFinal },         // P
+    { PINYIN_Ing, PINYIN_Er        },         // Q
+    { PINYIN_En,  PINYIN_ZeroFinal },         // R
+    { PINYIN_Ai,  PINYIN_ZeroFinal },         // S
+    { PINYIN_Ng,  PINYIN_Eng       },         // T
+    { PINYIN_U,   PINYIN_ZeroFinal },         // U
+    { PINYIN_V,   PINYIN_Ui        },         // V
+    { PINYIN_Ei,  PINYIN_ZeroFinal },         // W
+    { PINYIN_Uai, PINYIN_Ue        },         // X
+    { PINYIN_Ong, PINYIN_Iong      },         // Y
+    { PINYIN_Un,  PINYIN_ZeroFinal },         // Z
+    { PINYIN_ZeroFinal, PINYIN_ZeroFinal },   // ;
+};
+
+
+/*
+ * Initial map of the "zrm" Shuang Pin scheme: 27 entries, one per key in
+ * the order A..Z followed by ';' (see the per-line comments).  Each entry
+ * is the PinyinInitial assigned to that key; PINYIN_ZeroInitial is used
+ * where no consonant initial is assigned.
+ */
+static const PinyinInitial __shuang_pin_zrm_initial_map [] =
+{
+    PINYIN_ZeroInitial,    // A
+    PINYIN_Bo,             // B
+    PINYIN_Ci,             // C
+    PINYIN_De,             // D
+    PINYIN_ZeroInitial,    // E
+    PINYIN_Fo,             // F
+    PINYIN_Ge,             // G
+    PINYIN_He,             // H
+    PINYIN_Chi,            // I
+    PINYIN_Ji,             // J
+    PINYIN_Ke,             // K
+    PINYIN_Le,             // L
+    PINYIN_Mo,             // M
+    PINYIN_Ne,             // N
+    PINYIN_ZeroInitial,    // O
+    PINYIN_Po,             // P
+    PINYIN_Qi,             // Q
+    PINYIN_Ri,             // R
+    PINYIN_Si,             // S
+    PINYIN_Te,             // T
+    PINYIN_Shi,            // U
+    PINYIN_Zhi,            // V
+    PINYIN_Wu,             // W
+    PINYIN_Xi,             // X
+    PINYIN_Yi,             // Y
+    PINYIN_Zi,             // Z
+    PINYIN_ZeroInitial,    // ;
+};
+
+/*
+ * Final map of the "zrm" Shuang Pin scheme: 27 rows, one per key in the
+ * order A..Z followed by ';'.  Each row lists up to two candidate finals
+ * for the key; the second slot is PINYIN_ZeroFinal when the key has only
+ * one.
+ * NOTE(review): how the two candidates are told apart is not visible in
+ * this table -- confirm against the Shuang Pin parser.
+ */
+static const PinyinFinal __shuang_pin_zrm_final_map [][2] =
+{
+    { PINYIN_A,   PINYIN_ZeroFinal },         // A
+    { PINYIN_Ou,  PINYIN_ZeroFinal },         // B
+    { PINYIN_Iao, PINYIN_ZeroFinal },         // C
+    { PINYIN_Uang,PINYIN_Iang      },         // D
+    { PINYIN_E,   PINYIN_ZeroFinal },         // E
+    { PINYIN_En,  PINYIN_ZeroFinal },         // F
+    { PINYIN_Ng,  PINYIN_Eng       },         // G
+    { PINYIN_Ang, PINYIN_ZeroFinal },         // H
+    { PINYIN_I,   PINYIN_ZeroFinal },         // I
+    { PINYIN_An,  PINYIN_ZeroFinal },         // J
+    { PINYIN_Ao,  PINYIN_ZeroFinal },         // K
+    { PINYIN_Ai,  PINYIN_ZeroFinal },         // L
+    { PINYIN_Ian, PINYIN_ZeroFinal },         // M
+    { PINYIN_In,  PINYIN_ZeroFinal },         // N
+    { PINYIN_Uo,  PINYIN_O         },         // O
+    { PINYIN_Un,  PINYIN_ZeroFinal },         // P
+    { PINYIN_Iu,  PINYIN_ZeroFinal },         // Q
+    { PINYIN_Uan, PINYIN_Er        },         // R
+    { PINYIN_Ong, PINYIN_Iong      },         // S
+    { PINYIN_Ue,  PINYIN_ZeroFinal },         // T
+    { PINYIN_U,   PINYIN_ZeroFinal },         // U
+    { PINYIN_V,   PINYIN_Ui        },         // V
+    { PINYIN_Ia,  PINYIN_Ua        },         // W
+    { PINYIN_Ie,  PINYIN_ZeroFinal },         // X
+    { PINYIN_Ing, PINYIN_Uai       },         // Y
+    { PINYIN_Ei,  PINYIN_ZeroFinal },         // Z
+    { PINYIN_ZeroFinal, PINYIN_ZeroFinal },   // ;
+};
+
+
+/*
+ * Initial map of the "ms" Shuang Pin scheme: 27 entries, one per key in
+ * the order A..Z followed by ';' (see the per-line comments).  Each entry
+ * is the PinyinInitial assigned to that key; PINYIN_ZeroInitial is used
+ * where no consonant initial is assigned.
+ */
+static const PinyinInitial __shuang_pin_ms_initial_map [] =
+{
+    PINYIN_ZeroInitial,    // A
+    PINYIN_Bo,             // B
+    PINYIN_Ci,             // C
+    PINYIN_De,             // D
+    PINYIN_ZeroInitial,    // E
+    PINYIN_Fo,             // F
+    PINYIN_Ge,             // G
+    PINYIN_He,             // H
+    PINYIN_Chi,            // I
+    PINYIN_Ji,             // J
+    PINYIN_Ke,             // K
+    PINYIN_Le,             // L
+    PINYIN_Mo,             // M
+    PINYIN_Ne,             // N
+    PINYIN_ZeroInitial,    // O
+    PINYIN_Po,             // P
+    PINYIN_Qi,             // Q
+    PINYIN_Ri,             // R
+    PINYIN_Si,             // S
+    PINYIN_Te,             // T
+    PINYIN_Shi,            // U
+    PINYIN_Zhi,            // V
+    PINYIN_Wu,             // W
+    PINYIN_Xi,             // X
+    PINYIN_Yi,             // Y
+    PINYIN_Zi,             // Z
+    PINYIN_ZeroInitial,    // ;
+};
+
+/*
+ * Final map of the "ms" Shuang Pin scheme: 27 rows, one per key in the
+ * order A..Z followed by ';'.  Each row lists up to two candidate finals
+ * for the key; the second slot is PINYIN_ZeroFinal when the key has only
+ * one.  In this scheme the ';' key carries a final (PINYIN_Ing).
+ * NOTE(review): how the two candidates are told apart is not visible in
+ * this table -- confirm against the Shuang Pin parser.
+ */
+static const PinyinFinal __shuang_pin_ms_final_map [][2] =
+{
+    { PINYIN_A,   PINYIN_ZeroFinal },         // A
+    { PINYIN_Ou,  PINYIN_ZeroFinal },         // B
+    { PINYIN_Iao, PINYIN_ZeroFinal },         // C
+    { PINYIN_Uang,PINYIN_Iang      },         // D
+    { PINYIN_E,   PINYIN_ZeroFinal },         // E
+    { PINYIN_En,  PINYIN_ZeroFinal },         // F
+    { PINYIN_Ng,  PINYIN_Eng       },         // G
+    { PINYIN_Ang, PINYIN_ZeroFinal },         // H
+    { PINYIN_I,   PINYIN_ZeroFinal },         // I
+    { PINYIN_An,  PINYIN_ZeroFinal },         // J
+    { PINYIN_Ao,  PINYIN_ZeroFinal },         // K
+    { PINYIN_Ai,  PINYIN_ZeroFinal },         // L
+    { PINYIN_Ian, PINYIN_ZeroFinal },         // M
+    { PINYIN_In,  PINYIN_ZeroFinal },         // N
+    { PINYIN_Uo,  PINYIN_O         },         // O
+    { PINYIN_Un,  PINYIN_ZeroFinal },         // P
+    { PINYIN_Iu,  PINYIN_ZeroFinal },         // Q
+    { PINYIN_Uan, PINYIN_Er        },         // R
+    { PINYIN_Ong, PINYIN_Iong      },         // S
+    { PINYIN_Ue,  PINYIN_ZeroFinal },         // T
+    { PINYIN_U,   PINYIN_ZeroFinal },         // U
+    { PINYIN_V,   PINYIN_Ui        },         // V
+    { PINYIN_Ia,  PINYIN_Ua        },         // W
+    { PINYIN_Ie,  PINYIN_ZeroFinal },         // X
+    { PINYIN_Uai, PINYIN_V         },         // Y
+    { PINYIN_Ei,  PINYIN_ZeroFinal },         // Z
+    { PINYIN_Ing, PINYIN_ZeroFinal },         // ;
+};
+
+
+/*
+ * Initial map of the "ziguang" Shuang Pin scheme: 27 entries, one per key
+ * in the order A..Z followed by ';' (see the per-line comments).  Each
+ * entry is the PinyinInitial assigned to that key; PINYIN_ZeroInitial is
+ * used where no consonant initial is assigned.
+ */
+static const PinyinInitial __shuang_pin_ziguang_initial_map [] =
+{
+    PINYIN_Chi,            // A
+    PINYIN_Bo,             // B
+    PINYIN_Ci,             // C
+    PINYIN_De,             // D
+    PINYIN_ZeroInitial,    // E
+    PINYIN_Fo,             // F
+    PINYIN_Ge,             // G
+    PINYIN_He,             // H
+    PINYIN_Shi,            // I
+    PINYIN_Ji,             // J
+    PINYIN_Ke,             // K
+    PINYIN_Le,             // L
+    PINYIN_Mo,             // M
+    PINYIN_Ne,             // N
+    PINYIN_ZeroInitial,    // O
+    PINYIN_Po,             // P
+    PINYIN_Qi,             // Q
+    PINYIN_Ri,             // R
+    PINYIN_Si,             // S
+    PINYIN_Te,             // T
+    PINYIN_Zhi,            // U
+    PINYIN_ZeroInitial,    // V
+    PINYIN_Wu,             // W
+    PINYIN_Xi,             // X
+    PINYIN_Yi,             // Y
+    PINYIN_Zi,             // Z
+    PINYIN_ZeroInitial,    // ;
+};
+
+/*
+ * Final map of the "ziguang" Shuang Pin scheme: 27 rows, one per key in
+ * the order A..Z followed by ';'.  Each row lists up to two candidate
+ * finals for the key; the second slot is PINYIN_ZeroFinal when the key has
+ * only one.
+ * NOTE(review): how the two candidates are told apart is not visible in
+ * this table -- confirm against the Shuang Pin parser.
+ */
+static const PinyinFinal __shuang_pin_ziguang_final_map [][2] =
+{
+    { PINYIN_A,   PINYIN_ZeroFinal },         // A
+    { PINYIN_Iao, PINYIN_ZeroFinal },         // B
+    { PINYIN_Ing, PINYIN_ZeroFinal },         // C
+    { PINYIN_Ie,  PINYIN_ZeroFinal },         // D
+    { PINYIN_E,   PINYIN_ZeroFinal },         // E
+    { PINYIN_Ian, PINYIN_ZeroFinal },         // F
+    { PINYIN_Uang,PINYIN_Iang      },         // G
+    { PINYIN_Ong, PINYIN_Iong      },         // H
+    { PINYIN_I,   PINYIN_ZeroFinal },         // I
+    { PINYIN_Iu,  PINYIN_Er        },         // J
+    { PINYIN_Ei,  PINYIN_ZeroFinal },         // K
+    { PINYIN_Uan, PINYIN_ZeroFinal },         // L
+    { PINYIN_Un,  PINYIN_ZeroFinal },         // M
+    { PINYIN_Ui,  PINYIN_Ue        },         // N
+    { PINYIN_Uo,  PINYIN_O         },         // O
+    { PINYIN_Ai,  PINYIN_ZeroFinal },         // P
+    { PINYIN_Ao,  PINYIN_ZeroFinal },         // Q
+    { PINYIN_An,  PINYIN_ZeroFinal },         // R
+    { PINYIN_Ang, PINYIN_ZeroFinal },         // S
+    { PINYIN_Ng,  PINYIN_Eng       },         // T
+    { PINYIN_U,   PINYIN_ZeroFinal },         // U
+    { PINYIN_V,   PINYIN_ZeroFinal },         // V
+    { PINYIN_En,  PINYIN_ZeroFinal },         // W
+    { PINYIN_Ia,  PINYIN_Ua        },         // X
+    { PINYIN_In,  PINYIN_Uai       },         // Y
+    { PINYIN_Ou,  PINYIN_ZeroFinal },         // Z
+    { PINYIN_ZeroFinal, PINYIN_ZeroFinal },   // ;
+};
+
+
+/*
+ * Initial map of the "abc" Shuang Pin scheme: 27 entries, one per key in
+ * the order A..Z followed by ';' (see the per-line comments).  Each entry
+ * is the PinyinInitial assigned to that key; PINYIN_ZeroInitial is used
+ * where no consonant initial is assigned.
+ */
+static const PinyinInitial __shuang_pin_abc_initial_map [] =
+{
+    PINYIN_Zhi,            // A
+    PINYIN_Bo,             // B
+    PINYIN_Ci,             // C
+    PINYIN_De,             // D
+    PINYIN_Chi,            // E
+    PINYIN_Fo,             // F
+    PINYIN_Ge,             // G
+    PINYIN_He,             // H
+    PINYIN_ZeroInitial,    // I
+    PINYIN_Ji,             // J
+    PINYIN_Ke,             // K
+    PINYIN_Le,             // L
+    PINYIN_Mo,             // M
+    PINYIN_Ne,             // N
+    PINYIN_ZeroInitial,    // O
+    PINYIN_Po,             // P
+    PINYIN_Qi,             // Q
+    PINYIN_Ri,             // R
+    PINYIN_Si,             // S
+    PINYIN_Te,             // T
+    PINYIN_ZeroInitial,    // U
+    PINYIN_Shi,            // V
+    PINYIN_Wu,             // W
+    PINYIN_Xi,             // X
+    PINYIN_Yi,             // Y
+    PINYIN_Zi,             // Z
+    PINYIN_ZeroInitial,    // ;
+};
+
+/*
+ * Final map of the "abc" Shuang Pin scheme: 27 rows, one per key in the
+ * order A..Z followed by ';'.  Each row lists up to two candidate finals
+ * for the key; the second slot is PINYIN_ZeroFinal when the key has only
+ * one.
+ * NOTE(review): how the two candidates are told apart is not visible in
+ * this table -- confirm against the Shuang Pin parser.
+ */
+static const PinyinFinal __shuang_pin_abc_final_map [][2] =
+{
+    { PINYIN_A,   PINYIN_ZeroFinal },         // A
+    { PINYIN_Ou,  PINYIN_ZeroFinal },         // B
+    { PINYIN_In,  PINYIN_Uai       },         // C
+    { PINYIN_Ia,  PINYIN_Ua        },         // D
+    { PINYIN_E,   PINYIN_ZeroFinal },         // E
+    { PINYIN_En,  PINYIN_ZeroFinal },         // F
+    { PINYIN_Ng,  PINYIN_Eng       },         // G
+    { PINYIN_Ang, PINYIN_ZeroFinal },         // H
+    { PINYIN_I,   PINYIN_ZeroFinal },         // I
+    { PINYIN_An,  PINYIN_ZeroFinal },         // J
+    { PINYIN_Ao,  PINYIN_ZeroFinal },         // K
+    { PINYIN_Ai,  PINYIN_ZeroFinal },         // L
+    { PINYIN_Ui,  PINYIN_Ue        },         // M
+    { PINYIN_Un,  PINYIN_ZeroFinal },         // N
+    { PINYIN_Uo,  PINYIN_O         },         // O
+    { PINYIN_Uan, PINYIN_ZeroFinal },         // P
+    { PINYIN_Ei,  PINYIN_ZeroFinal },         // Q
+    { PINYIN_Iu,  PINYIN_Er        },         // R
+    { PINYIN_Ong, PINYIN_Iong      },         // S
+    { PINYIN_Uang,PINYIN_Iang      },         // T
+    { PINYIN_U,   PINYIN_ZeroFinal },         // U
+    { PINYIN_V,   PINYIN_ZeroFinal },         // V
+    { PINYIN_Ian, PINYIN_ZeroFinal },         // W
+    { PINYIN_Ie,  PINYIN_ZeroFinal },         // X
+    { PINYIN_Ing, PINYIN_ZeroFinal },         // Y
+    { PINYIN_Iao, PINYIN_ZeroFinal },         // Z
+    { PINYIN_ZeroFinal, PINYIN_ZeroFinal },   // ;
+};
+
+
+/*
+ * Initial map of the "liushi" Shuang Pin scheme: 27 entries, one per key
+ * in the order A..Z followed by ';' (see the per-line comments).  Each
+ * entry is the PinyinInitial assigned to that key; PINYIN_ZeroInitial is
+ * used where no consonant initial is assigned.
+ */
+static const PinyinInitial __shuang_pin_liushi_initial_map [] =
+{
+    PINYIN_ZeroInitial,    // A
+    PINYIN_Bo,             // B
+    PINYIN_Ci,             // C
+    PINYIN_De,             // D
+    PINYIN_ZeroInitial,    // E
+    PINYIN_Fo,             // F
+    PINYIN_Ge,             // G
+    PINYIN_He,             // H
+    PINYIN_Chi,            // I
+    PINYIN_Ji,             // J
+    PINYIN_Ke,             // K
+    PINYIN_Le,             // L
+    PINYIN_Mo,             // M
+    PINYIN_Ne,             // N
+    PINYIN_ZeroInitial,    // O
+    PINYIN_Po,             // P
+    PINYIN_Qi,             // Q
+    PINYIN_Ri,             // R
+    PINYIN_Si,             // S
+    PINYIN_Te,             // T
+    PINYIN_Shi,            // U
+    PINYIN_Zhi,            // V
+    PINYIN_Wu,             // W
+    PINYIN_Xi,             // X
+    PINYIN_Yi,             // Y
+    PINYIN_Zi,             // Z
+    PINYIN_ZeroInitial,    // ;
+};
+
+/*
+ * Final map of the "liushi" Shuang Pin scheme: 27 rows, one per key in
+ * the order A..Z followed by ';'.  Each row lists up to two candidate
+ * finals for the key; the second slot is PINYIN_ZeroFinal when the key has
+ * only one.
+ * NOTE(review): how the two candidates are told apart is not visible in
+ * this table -- confirm against the Shuang Pin parser.
+ */
+static const PinyinFinal __shuang_pin_liushi_final_map [][2] =
+{
+    { PINYIN_A,   PINYIN_ZeroFinal },         // A
+    { PINYIN_Ao,  PINYIN_ZeroFinal },         // B
+    { PINYIN_Ang, PINYIN_ZeroFinal },         // C
+    { PINYIN_Uan, PINYIN_ZeroFinal },         // D
+    { PINYIN_E,   PINYIN_ZeroFinal },         // E
+    { PINYIN_An,  PINYIN_ZeroFinal },         // F
+    { PINYIN_Ong, PINYIN_Iong      },         // G
+    { PINYIN_Ui,  PINYIN_Ue        },         // H
+    { PINYIN_I,   PINYIN_ZeroFinal },         // I
+    { PINYIN_Ia,  PINYIN_Ua        },         // J
+    { PINYIN_Un,  PINYIN_ZeroFinal },         // K
+    { PINYIN_Iu,  PINYIN_ZeroFinal },         // L
+    { PINYIN_In,  PINYIN_ZeroFinal },         // M
+    { PINYIN_Uang,PINYIN_Iang      },         // N
+    { PINYIN_Uo,  PINYIN_O         },         // O
+    { PINYIN_Ng,  PINYIN_Eng       },         // P
+    { PINYIN_Ing, PINYIN_ZeroFinal },         // Q
+    { PINYIN_Ou,  PINYIN_Er        },         // R
+    { PINYIN_Ai,  PINYIN_ZeroFinal },         // S
+    { PINYIN_Ian, PINYIN_ZeroFinal },         // T
+    { PINYIN_U,   PINYIN_ZeroFinal },         // U
+    { PINYIN_V,   PINYIN_En        },         // V
+    { PINYIN_Ei,  PINYIN_ZeroFinal },         // W
+    { PINYIN_Ie,  PINYIN_ZeroFinal },         // X
+    { PINYIN_Uai, PINYIN_ZeroFinal },         // Y
+    { PINYIN_Iao, PINYIN_ZeroFinal },         // Z
+    { PINYIN_ZeroFinal, PINYIN_ZeroFinal },   // ;
+};
+
+// Table for direct Zhuyin (Bopomofo) symbol input.  It has 37 rows: the
+// initials Bo .. Si followed by the zero-initial finals A .. V.  Only the
+// first of the three PinyinKey slots in each row is populated here.
+// NOTE(review): rows are presumably indexed by (character - start_char), and
+// tone marks presumably begin at index 37, just past this table -- confirm
+// against the Zhuyin parser before changing either constant.
+static const size_t    __zhuyin_zhuyin_map_start_char = 0x3105;
+static const size_t    __zhuyin_zhuyin_map_tone_start_idx = 37;
+static const PinyinKey __zhuyin_zhuyin_map [][3] = 
+{
+    {PinyinKey(PINYIN_Bo),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Po),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Mo),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Fo),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_De),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Te),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Ne),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Le),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Ge),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Ke),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_He),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Ji),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Qi),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Xi),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Zhi),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Chi),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Shi),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Ri),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Zi),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Ci),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_Si),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_A),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_O),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_E),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ea),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ai),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ei),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ao),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ou),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_An),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_En),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ang),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_Eng),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_Er),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_I),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_U),PinyinKey(),PinyinKey()},
+    {PinyinKey(PINYIN_ZeroInitial,PINYIN_V),PinyinKey(),PinyinKey()},
+};
+
+// NOTE(review): presumably the first character (0x20, ASCII space) covered by
+// the keyboard-layout tables defined in pinyin_zhuyin_map_data.h, so that a
+// typed character can be turned into a row index -- confirm against the
+// Zhuyin parser.
+static const size_t __zhuyin_map_start_char = 0x20;
+#include "pinyin_zhuyin_map_data.h"
+
+// Null-terminated list of pointers to the Zhuyin mapping tables (each table
+// is an array of PinyinKey[3] rows).
+// NOTE(review): the order presumably mirrors the Zhuyin scheme enumeration so
+// a scheme value can be used as the index -- confirm before reordering.
+static const PinyinKey (*__zhuyin_maps []) [3] = {
+    __zhuyin_zhuyin_map,
+    __zhuyin_standard_map,
+    __zhuyin_hsu_map,
+    __zhuyin_ibm_map,
+    __zhuyin_gin_yieh_map,
+    __zhuyin_et_map,
+    __zhuyin_et26_map,
+    0
+};
+
+
+//////////////////////////////////////////////////////////////////////////////
+// implementation of PinyinCustomSettings
+
+// Default settings: incomplete pinyin is accepted and every ambiguity flag
+// (indices 0 .. PINYIN_AmbLast inclusive) starts switched off.
+PinyinCustomSettings::PinyinCustomSettings ()
+    : use_incomplete (true)
+{
+    size_t amb = 0;
+
+    while (amb <= PINYIN_AmbLast) {
+        use_ambiguities [amb] = false;
+        ++amb;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// implementation of PinyinKey
+
+// Inclusive bounds of the numeric key values: 0 up to
+// (number of initials * number of finals * number of tones) - 1.
+// BitmapPinyinValidator::initialize() walks this whole range.
+const guint16 PinyinKey::min_value = 0;
+const guint16 PinyinKey::max_value = PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones - 1;
+
+// Latin spelling of this key's initial, looked up in the __pinyin_initials
+// table.  Nothing is allocated here; the pointer refers to table data.
+const char*
+PinyinKey::get_initial_string () const
+{
+    const char *latin = __pinyin_initials [m_initial].latin;
+
+    return latin;
+}
+
+const char*
+PinyinKey::get_initial_zhuyin_string () const
+{
+    if ((m_initial == PINYIN_Wu && m_final == PINYIN_U) ||
+        (m_initial == PINYIN_Yi &&
+         (m_final == PINYIN_I || m_final == PINYIN_In || m_final == PINYIN_Ing || m_final == PINYIN_Ong ||
+          m_final == PINYIN_U || m_final == PINYIN_Ue || m_final == PINYIN_Uan || m_final == PINYIN_Un)))
+        return "";
+
+    return __pinyin_initials [m_initial].zhuyin;
+}
+
+// Latin spelling of this key's final, looked up in the __pinyin_finals
+// table.  Nothing is allocated here; the pointer refers to table data.
+const char*
+PinyinKey::get_final_string () const
+{
+    const char *latin = __pinyin_finals [m_final].latin;
+
+    return latin;
+}
+
+const char*
+PinyinKey::get_final_zhuyin_string () const
+{
+    if (m_initial == PINYIN_Yi && m_final == PINYIN_Ong) {
+        return __pinyin_finals [PINYIN_Iong].zhuyin;
+    } else if (m_initial == PINYIN_Yi || m_initial == PINYIN_Ji || m_initial == PINYIN_Qi || m_initial == PINYIN_Xi) {
+        switch (m_final) {
+            case PINYIN_U:
+                return __pinyin_finals [PINYIN_V].zhuyin;
+            case PINYIN_Ue:
+                return __pinyin_finals [PINYIN_Ve].zhuyin;
+            case PINYIN_Uan:
+                return __pinyin_finals [PINYIN_Van].zhuyin;
+            case PINYIN_Un:
+                return __pinyin_finals [PINYIN_Vn].zhuyin;
+        }
+        if (m_initial == PINYIN_Yi && m_final == PINYIN_E)
+            return __pinyin_finals [PINYIN_Ea].zhuyin;
+    } else if ((m_initial == PINYIN_Ne || m_initial == PINYIN_Le) && m_final == PINYIN_Ue) {
+        return __pinyin_finals [PINYIN_Ve].zhuyin;
+    } else if ((m_initial == PINYIN_Zhi || m_initial == PINYIN_Chi || m_initial == PINYIN_Shi ||
+                m_initial == PINYIN_Zi  || m_initial == PINYIN_Ci  || m_initial == PINYIN_Si  ||
+                m_initial == PINYIN_Ri) && m_final == PINYIN_I) {
+        return "";
+    }
+
+    return __pinyin_finals [m_final].zhuyin;
+}
+
+// Latin representation of this key's tone from the __pinyin_tones table.
+const char*
+PinyinKey::get_tone_string () const
+{
+    const char *latin = __pinyin_tones [m_tone].latin;
+
+    return latin;
+}
+
+// Zhuyin representation of this key's tone from the __pinyin_tones table.
+const char*
+PinyinKey::get_tone_zhuyin_string () const
+{
+    const char *zhuyin = __pinyin_tones [m_tone].zhuyin;
+
+    return zhuyin;
+}
+
+// Builds the Latin spelling "<initial><final><tone>" of this key.
+// The result is a fresh g_strdup() copy: the caller owns it and should
+// release it with g_free().
+const char *
+PinyinKey::get_key_string () const
+{
+    char spelled [16];
+
+    g_snprintf (spelled, 15, "%s%s%s",
+                get_initial_string (),
+                get_final_string (),
+                get_tone_string ());
+
+    char *copy = g_strdup (spelled);
+    return copy;
+}
+
+// Builds the Zhuyin spelling "<initial><final><tone>" of this key.
+// The result is a fresh g_strdup() copy: the caller owns it and should
+// release it with g_free().
+const char *
+PinyinKey::get_key_zhuyin_string () const
+{
+    char spelled [32];
+
+    g_snprintf (spelled, 31, "%s%s%s",
+                get_initial_zhuyin_string (),
+                get_final_zhuyin_string (),
+                get_tone_zhuyin_string ());
+
+    char *copy = g_strdup (spelled);
+    return copy;
+}
+
+// Parses one key from the start of `str` into this object using a
+// PinyinDefaultParser.  Returns what the parser returns, or 0 when `str` is
+// null or empty (this key is left untouched in that case).
+int
+PinyinKey::set (const PinyinValidator &validator, const char *str, int len)
+{
+    const bool has_text = str && *str;
+
+    if (!has_text)
+        return 0;
+
+    return PinyinDefaultParser ().parse_one_key (validator, *this, str, len);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// implementation of PinyinValidator
+// Builds the validity bitmap from `table`; see initialize().
+BitmapPinyinValidator::BitmapPinyinValidator (const PinyinLargeTable *table)
+{
+    this->initialize (table);
+}
+
+// Rebuilds the bitmap: one bit per key value, where a set bit means the key
+// is NOT present in `table`.  With a null table the bitmap stays all zero,
+// so no key value is marked as missing.
+void
+BitmapPinyinValidator::initialize (const PinyinLargeTable *table)
+{
+    memset (m_bitmap, 0, sizeof (m_bitmap));
+
+    if (!table)
+        return;
+
+    for (guint16 code = 0; code <= PinyinKey::max_value; ++code) {
+        if (table->has_key (PinyinKey (code)))
+            continue;
+
+        // Byte code/8, bit code%8.
+        m_bitmap [code / 8] |= (1 << (code & 7));
+    }
+}
+
+// A key is valid when it is non-empty and its bit in the bitmap is clear
+// (set bits mark keys that initialize() did not find in the table).
+bool
+BitmapPinyinValidator::operator () (PinyinKey key) const
+{
+    if (key.is_empty ())
+        return false;
+
+    const guint16 code = key.get_value ();
+    const int     mask = 1 << (code & 7);
+
+    return !(m_bitmap [code / 8] & mask);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// implementation of PinyinParser
+// Out-of-line destructor; the body is intentionally empty.
+PinyinParser::~PinyinParser ()
+{
+    // Nothing to do.
+}
+
+// One spelling-normalisation rule used by PinyinParser::normalize(): a key
+// whose (initial, final) equals the first pair is rewritten to the second.
+// Field order matters: the rule table uses positional aggregate initialisers.
+struct PinyinReplaceRulePair
+{
+    PinyinInitial initial;      // initial to match
+    PinyinFinal   final;        // final to match
+    PinyinInitial new_initial;  // replacement initial
+    PinyinFinal   new_final;    // replacement final
+};
+
+// Strict weak ordering on the match part of a rule: by initial first, then
+// by final.  The replacement fields take no part in the comparison.
+class PinyinReplaceRulePairLessThan
+{
+public:
+    bool operator () (const PinyinReplaceRulePair &lhs, const PinyinReplaceRulePair &rhs) const {
+        if (lhs.initial != rhs.initial)
+            return lhs.initial < rhs.initial;
+
+        return lhs.final < rhs.final;
+    }
+};
+
+void
+PinyinParser::normalize (PinyinKey &key)
+{
+    static const PinyinReplaceRulePair rules [] = 
+    {
+#if 0
+        {PINYIN_ZeroInitial, PINYIN_I,    PINYIN_Yi, PINYIN_I},
+        {PINYIN_ZeroInitial, PINYIN_Ia,   PINYIN_Yi, PINYIN_A},
+        {PINYIN_ZeroInitial, PINYIN_Ian,  PINYIN_Yi, PINYIN_An},
+        {PINYIN_ZeroInitial, PINYIN_Iang, PINYIN_Yi, PINYIN_Ang},
+        {PINYIN_ZeroInitial, PINYIN_Iao,  PINYIN_Yi, PINYIN_Ao},
+        {PINYIN_ZeroInitial, PINYIN_Ie,   PINYIN_Yi, PINYIN_E},
+        {PINYIN_ZeroInitial, PINYIN_In,   PINYIN_Yi, PINYIN_In},
+        {PINYIN_ZeroInitial, PINYIN_Ing,  PINYIN_Yi, PINYIN_Ing},
+        {PINYIN_ZeroInitial, PINYIN_Iong, PINYIN_Yi, PINYIN_Ong},
+        {PINYIN_ZeroInitial, PINYIN_Iu,   PINYIN_Yi, PINYIN_Ou},
+        {PINYIN_ZeroInitial, PINYIN_U,    PINYIN_Wu, PINYIN_U},
+        {PINYIN_ZeroInitial, PINYIN_Ua,   PINYIN_Wu, PINYIN_A},
+        {PINYIN_ZeroInitial, PINYIN_Uai,  PINYIN_Wu, PINYIN_Ai},
+        {PINYIN_ZeroInitial, PINYIN_Uan,  PINYIN_Wu, PINYIN_An},
+        {PINYIN_ZeroInitial, PINYIN_Uang, PINYIN_Wu, PINYIN_Ang},
+        {PINYIN_ZeroInitial, PINYIN_Ue,   PINYIN_Wu, PINYIN_E},
+        {PINYIN_ZeroInitial, PINYIN_Ueng, PINYIN_Wu, PINYIN_Eng},
+        {PINYIN_ZeroInitial, PINYIN_Ui,   PINYIN_Wu, PINYIN_Ei},
+        {PINYIN_ZeroInitial, PINYIN_Un,   PINYIN_Wu, PINYIN_En},
+        {PINYIN_ZeroInitial, PINYIN_Uo,   PINYIN_Wu, PINYIN_O},
+        {PINYIN_ZeroInitial, PINYIN_V,    PINYIN_Yi, PINYIN_U},
+        {PINYIN_ZeroInitial, PINYIN_Van,  PINYIN_Yi, PINYIN_Uan},
+        {PINYIN_ZeroInitial, PINYIN_Ve,   PINYIN_Yi, PINYIN_Ue},
+        {PINYIN_ZeroInitial, PINYIN_Vn,   PINYIN_Yi, PINYIN_Un},
+#endif
+        {PINYIN_Ji,          PINYIN_V,    PINYIN_Ji, PINYIN_U},
+        {PINYIN_Ji,          PINYIN_Van,  PINYIN_Ji, PINYIN_Uan},
+        {PINYIN_Ji,          PINYIN_Ve,   PINYIN_Ji, PINYIN_Ue},
+        {PINYIN_Ji,          PINYIN_Vn,   PINYIN_Ji, PINYIN_Un},
+        {PINYIN_Ne,          PINYIN_Ve,   PINYIN_Ne, PINYIN_Ue},
+        {PINYIN_Le,          PINYIN_Ve,   PINYIN_Le, PINYIN_Ue},
+        {PINYIN_Qi,          PINYIN_V,    PINYIN_Qi, PINYIN_U},
+        {PINYIN_Qi,          PINYIN_Van,  PINYIN_Qi, PINYIN_Uan},
+        {PINYIN_Qi,          PINYIN_Ve,   PINYIN_Qi, PINYIN_Ue},
+        {PINYIN_Qi,          PINYIN_Vn,   PINYIN_Qi, PINYIN_Un},
+        {PINYIN_Xi,          PINYIN_V,    PINYIN_Xi, PINYIN_U},
+        {PINYIN_Xi,          PINYIN_Van,  PINYIN_Xi, PINYIN_Uan},
+        {PINYIN_Xi,          PINYIN_Ve,   PINYIN_Xi, PINYIN_Ue},
+        {PINYIN_Xi,          PINYIN_Vn,   PINYIN_Xi, PINYIN_Un}
+    };
+    static const PinyinReplaceRulePair *rules_start = rules;
+    static const PinyinReplaceRulePair *rules_end   = rules + sizeof(rules)/sizeof(PinyinReplaceRulePair);
+
+    PinyinReplaceRulePair kp;
+
+    kp.initial = key.get_initial ();
+    kp.final = key.get_final ();
+
+    const PinyinReplaceRulePair *p = std_lite::lower_bound (rules_start, rules_end, kp, PinyinReplaceRulePairLessThan ());
+
+    if (p->initial == kp.initial && p->final == kp.final) {
+        key.set_initial (p->new_initial);
+        key.set_final (p->new_final);
+    }
+}
+
+//============== Internal functions used by PinyinDefaultParser ==============
+static int
+__default_parser_parse_initial (PinyinInitial &initial, const char *str, int len)
+{
+    int lastlen = 0;
+
+    initial = PINYIN_ZeroInitial;
+
+    if (str && *str >= 'a' && *str <= 'z') {
+        int start = __pinyin_initials_index [*str - 'a'].start;
+        int end = __pinyin_initials_index [*str - 'a'].num + start;
+
+        if (start > 0) {
+            for (int i = start; i < end; ++i) {
+                if ((len < 0 || len >= __pinyin_initials [i].latin_len) && __pinyin_initials [i].latin_len >= lastlen) {
+                    int j;
+                    for (j = 1; j < __pinyin_initials [i].latin_len; ++j) {
+                        if (str [j] != __pinyin_initials [i].latin [j])
+                            break;
+                    }
+                    if (j == __pinyin_initials [i].latin_len) {
+                        initial = static_cast<PinyinInitial>(i);
+                        lastlen = __pinyin_initials [i].latin_len;
+                    }
+                }
+            }
+        }
+    }
+
+    return lastlen;
+}
+static int
+__default_parser_parse_final (PinyinFinal &final, const char *str, int len)
+{
+    int lastlen = 0;
+
+    final = PINYIN_ZeroFinal;
+
+    if (str && *str >= 'a' && *str <= 'z') {
+        int start = __pinyin_finals_index [*str - 'a'].start;
+        int end = __pinyin_finals_index [*str - 'a'].num + start;
+
+        if (start > 0) {
+            for (int i = start; i < end; ++i) {
+                if ((len < 0 || len >= __pinyin_finals [i].latin_len) && __pinyin_finals [i].latin_len >= lastlen) {
+                    int j;
+                    for (j = 1; j < __pinyin_finals [i].latin_len; ++j) {
+                        if (str [j] != __pinyin_finals [i].latin [j])
+                            break;
+                    }
+                    if (j == __pinyin_finals [i].latin_len) {
+                        final = static_cast<PinyinFinal>(i);
+                        lastlen = __pinyin_finals [i].latin_len;
+                    }
+                }
+            }
+        }
+    }
+
+    return lastlen;
+}
+static int
+__default_parser_parse_tone (PinyinTone &tone, const char *str, int len)
+{
+    tone = PINYIN_ZeroTone;
+
+    if (str && (len >= 1 || len < 0)) {
+        int kt = (*str) - '0';
+        if (kt >= PINYIN_First && kt <= PINYIN_LastTone) {
+            tone = static_cast<PinyinTone>(kt);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Parses a single pinyin key (initial + final + optional tone digit) from the
+// start of `str`, using at most `len` characters (negative = whole string).
+// Candidates are tried longest first: whenever the validator rejects a
+// candidate, the allowed length is reduced to one less than what that
+// candidate consumed and parsing restarts.
+// On success `key` holds the normalized key and the consumed length is
+// returned; on failure `key` is cleared and 0 is returned.
+static int
+__default_parser_parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1)
+{
+    int initial_len = 0;
+    int final_len   = 0;
+    int tone_len    = 0;
+
+    const char *ptr;
+
+    PinyinInitial initial;
+    PinyinFinal   final;
+    PinyinTone    tone;
+
+    key.clear ();
+
+    if (!str || !len) return 0;
+
+    if (len < 0) len = strlen (str);
+
+    while (len > 0) {
+        // Every attempt starts again from the beginning of the input.
+        ptr = str;
+
+        initial = PINYIN_ZeroInitial;
+        final   = PINYIN_ZeroFinal;
+        tone    = PINYIN_ZeroTone;
+
+        // Try a bare final first (a syllable without an initial).
+        final_len = __default_parser_parse_final (final, ptr, len);
+        ptr += final_len;
+        len -= final_len;
+        // No bare final matched: parse an initial, then the final after it.
+        if (final == PINYIN_ZeroFinal) {
+            initial_len = __default_parser_parse_initial (initial, ptr, len);
+            ptr += initial_len;
+            len -= initial_len;
+            if (len){
+                final_len = __default_parser_parse_final (final, ptr, len);
+                ptr += final_len;
+                len -= final_len;
+            }
+        }
+
+        // An optional tone digit may follow if input remains.
+        if (len)
+            tone_len = __default_parser_parse_tone (tone, ptr, len);
+        key.set (initial, final, tone);
+        PinyinParser::normalize (key);
+
+        // A valid key was found, return.
+        if (validator (key)) break;
+
+        // The key is invalid, reduce the len and find again.
+        len = initial_len + final_len + tone_len - 1;
+
+        initial_len = final_len = tone_len = 0;
+
+        key.clear ();
+    }
+
+    // The part lengths are zero here unless the loop ended on a valid key.
+    len = initial_len + final_len + tone_len;
+
+    return len;
+}
+
+// Memo entry for one start offset of the input in
+// __default_parser_parse_recursive().
+struct DefaultParserCacheElement
+{
+    PinyinKey key;        // first key of the best parse starting here
+    PinyinKeyPos pos;     // offset and length of that key in the input
+    int num_keys;         // keys in the best parse; negative = not computed yet
+    int parsed_len;       // characters consumed by the best parse
+    int next_start;       // cache index of the element holding the next key
+};
+
+typedef  GArray* DefaultParserCache; /* Array of DefaultParserCacheElement */
+
+static int
+__default_parser_parse_recursive (const PinyinValidator &validator,
+                                  DefaultParserCache    &cache,
+                                  int                   &real_start,
+                                  int                   &num_keys,
+                                  const char            *str,
+                                  int                    len,
+                                  int                    start)
+{
+    if (*str == 0 || len == 0) return 0;
+
+    int used_len = 0;
+
+    real_start = 0;
+    num_keys = 0;
+
+    if (*str == '\'' || *str == ' ') {
+        ++used_len;
+        ++str;
+        ++start;
+        --len;
+    }
+
+    if (!isalpha (*str) || !len)
+        return 0;
+
+    real_start = start;
+
+    // The best keys start from this position have been found, just return the result.
+    DefaultParserCacheElement* element = &g_array_index
+       (cache, DefaultParserCacheElement, start);
+                                                      
+                                                      
+    if (element->num_keys >=0) {
+        num_keys = element->num_keys;
+        return element->parsed_len;
+    }
+
+    PinyinKey first_key;
+    PinyinKey best_first_key;
+    PinyinKeyPos pos; 
+
+    int first_len = 0;
+    int best_first_len = 0;
+
+    int remained_len = 0;
+    int best_remained_len = 0;
+
+    int remained_keys = 0;
+    int best_remained_keys = 0;
+
+    int remained_start = 0;
+    int best_remained_start = 0;
+
+    first_len = __default_parser_parse_one_key (validator, first_key, str, len);
+
+    if (!first_len) {
+       element = &g_array_index(cache, DefaultParserCacheElement, start);
+
+        element->key = PinyinKey ();
+        element->num_keys = 0;
+        element->parsed_len = 0;
+       element->next_start = start;
+        return 0;
+    }
+
+    best_first_key = first_key;
+    best_first_len = first_len;
+
+    if (len > first_len) {
+        char ch1 = str [first_len -1];
+        char ch2 = str [first_len];
+
+        best_remained_len = __default_parser_parse_recursive (validator,
+                                                              cache,
+                                                              best_remained_start,
+                                                              best_remained_keys,
+                                                              str + first_len,
+                                                              len - first_len,
+                                                              start + first_len);
+
+        // For those keys which the last char is 'g' or 'n' or 'r', try put the end char into the next key.
+        if (first_len > 1 &&
+            (((ch1=='g' || ch1=='n' || ch1=='r') && (ch2=='a' || ch2=='e' || ch2=='i' || ch2=='o' || ch2=='u' || ch2=='v')) ||
+             ((ch1=='a' || ch1=='e' || ch1=='o') && (ch2=='i' || ch2=='n' || ch2=='o' || ch2=='r' || ch2=='u')))) {
+
+            first_len = __default_parser_parse_one_key (validator, first_key, str, first_len - 1);
+
+            if (first_len) {
+                remained_len = __default_parser_parse_recursive (validator,
+                                                                 cache, 
+                                                                 remained_start,
+                                                                 remained_keys,
+                                                                 str + first_len,
+                                                                 len - first_len,
+                                                                 start + first_len);
+
+
+       DefaultParserCacheElement* best_remained_element = &g_array_index
+           (cache, DefaultParserCacheElement, best_remained_start);            
+
+                // A better seq was found.
+                if (remained_len != 0 && (remained_len + first_len) >= (best_remained_len + best_first_len) &&
+                    (remained_keys <= best_remained_keys || best_remained_keys == 0)) {
+#if 0
+                    if ((remained_len + first_len) > (best_remained_len + best_first_len) ||
+                        remained_keys < best_remained_keys ||
+                        best_remained_element->key.get_final () == PINYIN_ZeroFinal ||
+                        best_remained_element->key.get_initial () == PINYIN_Wu ||
+                        best_remained_element->key.get_initial () == PINYIN_Yi) {
+#endif
+                        best_first_len = first_len;
+                        best_first_key = first_key;
+                        best_remained_len = remained_len;
+                        best_remained_keys = remained_keys;
+                        best_remained_start = remained_start;
+#if 0
+                    }
+#endif
+                }
+            }
+        }
+    }
+
+    num_keys = best_remained_keys + 1;
+    
+    
+    element = &g_array_index
+       (cache, DefaultParserCacheElement, start);
+    
+    pos.set_pos(start);
+    pos.set_length(best_first_len);
+
+    element->key = best_first_key;
+    element->pos = pos;
+    element->num_keys = num_keys;
+    element->parsed_len = used_len + best_first_len + best_remained_len;
+    element->next_start = best_remained_start;
+
+    return element->parsed_len;
+}
+//============================================================================
+
+// Out-of-line destructor; the body is intentionally empty.
+PinyinDefaultParser::~PinyinDefaultParser ()
+{
+    // Nothing to do.
+}
+
+// Parses a single key from the start of `str`; thin wrapper around
+// __default_parser_parse_one_key().  Returns the number of characters used.
+int
+PinyinDefaultParser::parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const
+{
+    const int consumed = __default_parser_parse_one_key (validator, key, str, len);
+
+    return consumed;
+}
+
+int
+PinyinDefaultParser::parse (const PinyinValidator &validator, PinyinKeyVector & keys, PinyinKeyPosVector & poses, const char *str, int len) const
+{
+    g_array_set_size(keys, 0);
+    g_array_set_size(poses, 0);
+
+    if (!str || !len) return 0;
+
+    if (len < 0) len = strlen (str);
+
+    DefaultParserCacheElement elm;
+
+    elm.num_keys = -1L;
+    elm.parsed_len = 0;
+    elm.next_start = 0;
+
+    DefaultParserCache cache = g_array_new (FALSE, TRUE, sizeof (DefaultParserCacheElement));
+    g_array_set_size(cache, len);
+    for ( size_t index = 0 ; index < len ; index++){
+       DefaultParserCacheElement * element =
+           &g_array_index(cache,DefaultParserCacheElement, index);
+       *element = elm; 
+    }
+    int start = 0;
+    int num_keys = 0;
+
+    len = __default_parser_parse_recursive (validator, cache, start, num_keys, str, len, 0);
+
+    for (size_t i=0; i<(size_t)num_keys; ++i) {
+       DefaultParserCacheElement* element = &g_array_index
+           (cache, DefaultParserCacheElement, start);
+        g_array_append_val(keys, element->key);
+       g_array_append_val(poses, element->pos);
+        start = element->next_start;
+    }
+
+    return len;
+}
+
+// Creates a parser that uses one of the built-in Shuang Pin schemes.
+PinyinShuangPinParser::PinyinShuangPinParser (PinyinShuangPinScheme scheme)
+{
+    this->set_scheme (scheme);
+}
+
+// Creates a parser from caller-supplied tables: 27 entries each, for the
+// keys 'a'..'z' followed by ';'.
+PinyinShuangPinParser::PinyinShuangPinParser (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2])
+{
+    this->set_scheme (initial_map, final_map);
+}
+
+// Out-of-line destructor; the body is intentionally empty.
+PinyinShuangPinParser::~PinyinShuangPinParser ()
+{
+    // Nothing to do.
+}
+
+int
+PinyinShuangPinParser::parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const
+{
+    key.clear ();
+
+    if (!str || !len || ! (*str)) return 0;
+
+    if (len < 0) len = strlen (str);
+
+    PinyinInitial initial    = PINYIN_ZeroInitial;
+    PinyinFinal   final      = PINYIN_ZeroFinal;
+    PinyinFinal   final_cands [4] = { PINYIN_ZeroFinal, PINYIN_ZeroFinal, PINYIN_ZeroFinal, PINYIN_ZeroFinal };
+
+    PinyinTone    tone = PINYIN_ZeroTone;
+
+    int idx [2] = {-1, -1};
+    int used_len = 0;
+
+    size_t i;
+    bool matched = false;
+
+    for (i = 0; i < 2 && i < (size_t) len; ++i) {
+        if (str [i] >= 'a' && str [i] <= 'z') idx [i] = str [i] - 'a';
+        else if (str [i] == ';') idx [i] = 26;
+    }
+
+    // parse initial or final
+    if (idx [0] >= 0) {
+        initial = m_initial_map [idx[0]];
+        final_cands [0] = m_final_map [idx[0]][0];
+        final_cands [1] = m_final_map [idx[0]][1];
+    }
+
+    if (initial == PINYIN_ZeroInitial && final_cands [0] == PINYIN_ZeroFinal)
+        return 0;
+
+    // parse final, if str [0] == 'o' (idx [0] == 14) then just skip to parse final.
+    if (idx [1] >= 0 && (initial != PINYIN_ZeroInitial || idx[0] == 14)) {
+        final_cands [2] = m_final_map [idx [1]][0];
+        final_cands [3] = m_final_map [idx [1]][1];
+
+        for (i = 2; i < 4; ++i) {
+            if (final_cands [i] != PINYIN_ZeroFinal) {
+                key.set (initial, final_cands [i]);
+                PinyinParser::normalize (key);
+
+                if (validator (key)) {
+                    final = final_cands [i];
+                    matched = true;
+                    used_len = 2;
+                    str += 2;
+                    len -= 2;
+                    break;
+                }
+            }
+        }
+    }
+
+    if (!matched) {
+        initial = PINYIN_ZeroInitial;
+        for (i = 0; i < 2; ++i) {
+            key.set (initial, final_cands [i]);
+            PinyinParser::normalize (key);
+
+            if (validator (key)) {
+                final = final_cands [i];
+                matched = true;
+                used_len = 1;
+                ++str;
+                --len;
+                break;
+            }
+        }
+    }
+
+    if (!matched) return 0;
+
+    // parse tone
+    if (len) {
+        int kt = (*str) - '0';
+        if (kt >= PINYIN_First && kt <= PINYIN_LastTone) {
+            tone = static_cast<PinyinTone>(kt);
+
+            key.set (initial, final, tone);
+
+            if (validator (key)) {
+                return used_len + 1;
+            }
+        }
+    }
+
+    return used_len;
+}
+
+int
+PinyinShuangPinParser::parse (const PinyinValidator &validator, PinyinKeyVector &keys, PinyinKeyPosVector & poses, const char *str, int len) const
+{
+    g_array_set_size(keys, 0);
+    g_array_set_size(poses, 0);
+
+    if (!str || !len || ! (*str)) return 0;
+
+    if (len < 0) len = strlen (str);
+
+    int used_len = 0;
+
+    PinyinKey key;
+    PinyinKeyPos pos;
+
+    while (used_len < len) {
+        if (*str == '\'' || *str == ' ') {
+            ++str;
+            ++used_len;
+            continue;
+        }
+
+        int one_len = parse_one_key (validator, key, str, len);
+
+        if (one_len) {
+           pos.set_pos(used_len);
+           pos.set_length(one_len);
+            g_array_append_val(keys, key);
+           g_array_append_val(poses, pos);
+        } else {
+            break;
+        }
+
+        str += one_len;
+        used_len += one_len;
+    }
+
+    return used_len;
+}
+
+void
+PinyinShuangPinParser::set_scheme (PinyinShuangPinScheme scheme)
+{
+    switch (scheme) {
+        case SHUANG_PIN_STONE:
+            set_scheme (__shuang_pin_stone_initial_map, __shuang_pin_stone_final_map);
+            break;
+        case SHUANG_PIN_ZRM:
+            set_scheme (__shuang_pin_zrm_initial_map, __shuang_pin_zrm_final_map);
+            break;
+        case SHUANG_PIN_MS:
+            set_scheme (__shuang_pin_ms_initial_map, __shuang_pin_ms_final_map);
+            break;
+        case SHUANG_PIN_ZIGUANG:
+            set_scheme (__shuang_pin_ziguang_initial_map, __shuang_pin_ziguang_final_map);
+            break;
+        case SHUANG_PIN_ABC:
+            set_scheme (__shuang_pin_abc_initial_map, __shuang_pin_abc_final_map);
+            break;
+        case SHUANG_PIN_LIUSHI:
+            set_scheme (__shuang_pin_liushi_initial_map, __shuang_pin_liushi_final_map);
+            break;
+        default:
+            set_scheme (__shuang_pin_zrm_initial_map, __shuang_pin_zrm_final_map);
+            return;
+    }
+}
+
+// Copies caller-supplied tables into the parser: 27 rows (keys 'a'..'z'
+// followed by ';'), each with one initial and two candidate finals.
+void
+PinyinShuangPinParser::set_scheme (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2])
+{
+    for (size_t row = 0; row < 27; ++row) {
+        m_initial_map [row] = initial_map [row];
+
+        for (size_t slot = 0; slot < 2; ++slot)
+            m_final_map [row][slot] = final_map [row][slot];
+    }
+}
+
+// Copies the parser's current tables out to the caller: 27 rows (keys
+// 'a'..'z' followed by ';'), each with one initial and two candidate finals.
+void
+PinyinShuangPinParser::get_scheme (PinyinInitial initial_map[27], PinyinFinal final_map[27][2])
+{
+    for (size_t row = 0; row < 27; ++row) {
+        initial_map [row] = m_initial_map [row];
+
+        for (size_t slot = 0; slot < 2; ++slot)
+            final_map [row][slot] = m_final_map [row][slot];
+    }
+}
+
+namespace novel{
+
+//////////////////////////////////////////////////////////////////////////////
+// implementation of the PinyinKey comparison functions
+int pinyin_compare_initial (const PinyinCustomSettings &custom,
+                           PinyinInitial lhs,
+                           PinyinInitial rhs)
+{
+       if ((lhs == rhs) ||
+               (custom.use_ambiguities [PINYIN_AmbZhiZi] &&
+                ((lhs == PINYIN_Zhi && rhs == PINYIN_Zi) ||
+                 (lhs == PINYIN_Zi && rhs == PINYIN_Zhi))) ||
+                         
+               (custom.use_ambiguities [PINYIN_AmbChiCi] &&
+                ((lhs == PINYIN_Chi && rhs == PINYIN_Ci) ||
+                 (lhs == PINYIN_Ci && rhs == PINYIN_Chi))) ||
+                         
+               (custom.use_ambiguities [PINYIN_AmbShiSi] &&
+                ((lhs == PINYIN_Shi && rhs == PINYIN_Si) ||
+                 (lhs == PINYIN_Si && rhs == PINYIN_Shi))) ||
+
+               (custom.use_ambiguities [PINYIN_AmbLeRi] && 
+                ((lhs == PINYIN_Le && rhs == PINYIN_Ri) ||
+                 (lhs == PINYIN_Ri && rhs == PINYIN_Le))) ||
+
+               (custom.use_ambiguities [PINYIN_AmbNeLe] && 
+                ((lhs == PINYIN_Ne && rhs == PINYIN_Le) ||
+                 (lhs == PINYIN_Le && rhs == PINYIN_Ne))) ||
+
+               (custom.use_ambiguities [PINYIN_AmbFoHe] && 
+                ((lhs == PINYIN_Fo && rhs == PINYIN_He) ||
+                 (lhs == PINYIN_He && rhs == PINYIN_Fo)))
+           )
+         return 0;
+       else if (lhs < rhs) return -1;
+       return 1;
+}
+
+int pinyin_compare_final (const PinyinCustomSettings &custom,
+                         PinyinFinal lhs,
+                         PinyinFinal rhs)
+{
+    if(((lhs == rhs) ||
+        (custom.use_ambiguities [PINYIN_AmbAnAng] &&
+         ((lhs == PINYIN_An && rhs == PINYIN_Ang) ||
+          (lhs == PINYIN_Ang && rhs == PINYIN_An))) ||
+              
+        (custom.use_ambiguities [PINYIN_AmbEnEng] &&
+         ((lhs == PINYIN_En && rhs == PINYIN_Eng) ||
+          (lhs == PINYIN_Eng && rhs == PINYIN_En))) ||
+              
+         (custom.use_ambiguities [PINYIN_AmbInIng] &&
+         ((lhs == PINYIN_In && rhs == PINYIN_Ing) ||
+          (lhs == PINYIN_Ing && rhs == PINYIN_In)))))
+        return 0;
+    else if (custom.use_incomplete && (lhs == PINYIN_ZeroFinal || rhs == PINYIN_ZeroFinal))
+        return 0;
+    else if (lhs < rhs) return -1;
+    return 1;
+}
+
+// Three-way comparison of two tones.  A zero (unspecified) tone on either
+// side matches anything, so the result is 0 then and when both are equal;
+// otherwise -1 if lhs < rhs, else 1.  `custom` is not consulted.
+int pinyin_compare_tone (const PinyinCustomSettings &custom,
+                        PinyinTone lhs,
+                        PinyinTone rhs)
+{
+    if (!lhs || !rhs)
+        return 0;
+
+    if (lhs == rhs)
+        return 0;
+
+    return (lhs < rhs) ? -1 : 1;
+}
+
+};
diff --git a/src/storage/pinyin_base.h b/src/storage/pinyin_base.h
new file mode 100644 (file)
index 0000000..374cc53
--- /dev/null
@@ -0,0 +1,728 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2002,2003,2006 James Su
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/** @file pinyin_base.h
+ *  @brief the definitions of pinyin related classes and structs.
+ */
+
+#ifndef PINYIN_BASE_H
+#define PINYIN_BASE_H
+
+#include <glib.h>
+
+namespace novel{
+
+// Predefinition of some classes and structs
+struct PinyinKey;
+
+class PinyinValidator;
+class PinyinParser;
+
+struct PinyinKeyPos{
+    int    m_pos;
+    size_t m_len;
+    PinyinKeyPos(){
+       m_pos = 0;
+       m_len = 0;
+    }
+    void set_pos(int pos){
+       m_pos = pos;
+    }
+    void set_length(size_t len){
+       m_len = len;
+    }
+    int get_pos(){
+       return m_pos;
+    }
+    int get_end_pos(){
+       return m_pos + m_len;
+    }
+    size_t get_length(){
+       return m_len;
+    }
+};
+
+typedef GArray* PinyinKeyVector; /* Array of PinyinKey */
+typedef GArray* PinyinKeyPosVector; /* Array of PinyinKeyPos */
+
+
+struct PinyinCustomSettings;
+
+/**
+ * @brief enums of pinyin initial element.
+ *
+ * A pinyin key can be divided into three tokens:
+ * Initial -- such as B P M F D T N L  etc.
+ * Final   -- such as A O E I U V etc.
+ * Tone    -- can be 1, 2, 3, 4 and 5.
+ *
+ * The numeric values are used by PinyinKey::get_value() and by the
+ * PinyinKey ordering operators, and PinyinKey stores an initial in a
+ * 5-bit field, so every value must stay below 32.
+ */
+enum PinyinInitial
+{
+    PINYIN_ZeroInitial = 0,    /**< zero initial. indicates invalid initial */
+    PINYIN_Bo  = 1,
+    PINYIN_Ci  = 2,
+    PINYIN_Chi = 3,
+    PINYIN_De  = 4,
+    PINYIN_Fo  = 5,
+    PINYIN_He  = 6,
+    PINYIN_Ge  = 7,
+    PINYIN_Ji  = 8,
+    PINYIN_Ke  = 9,
+    PINYIN_Mo  =10,
+    PINYIN_Ne  =11,
+    PINYIN_Le  =12,
+    PINYIN_Ri  =13,
+    PINYIN_Po  =14,
+    PINYIN_Qi  =15,
+    PINYIN_Si  =16,
+    PINYIN_Shi =17,
+    PINYIN_Te  =18,
+    PINYIN_Wu  =19,
+    PINYIN_Xi  =20,
+    PINYIN_Yi  =21,
+    PINYIN_Zi  =22,
+    PINYIN_Zhi =23,
+    PINYIN_LastInitial = PINYIN_Zhi,    /**< the last initial */
+    PINYIN_Number_Of_Initials = PINYIN_LastInitial + 1    /**< count of initials, including the zero initial */
+};
+
+/**
+ * @brief enums of pinyin final element.
+ *
+ * The numeric values are used by PinyinKey::get_value() and by the
+ * PinyinKey ordering operators, and PinyinKey stores a final in a
+ * 6-bit field, so every value must stay below 64.
+ */
+enum PinyinFinal
+{
+    PINYIN_ZeroFinal = 0,    /**< zero final. indicates invalid final */
+    PINYIN_A    = 1,
+    PINYIN_Ai   = 2,
+    PINYIN_An   = 3,
+    PINYIN_Ang  = 4,
+    PINYIN_Ao   = 5,
+    PINYIN_E    = 6,
+    PINYIN_Ea   = 7,
+    PINYIN_Ei   = 8,
+    PINYIN_En   = 9,
+    PINYIN_Eng  =10,
+    PINYIN_Er   =11,
+    PINYIN_I    =12,
+    PINYIN_Ia   =13,
+    PINYIN_Ian  =14,
+    PINYIN_Iang =15,
+    PINYIN_Iao  =16,
+    PINYIN_Ie   =17,
+    PINYIN_In   =18,
+    PINYIN_Ing  =19,
+    PINYIN_Iong =20,
+    PINYIN_Iu   =21,
+    PINYIN_Ng   =22,
+    PINYIN_O    =23,
+    PINYIN_Ong  =24,
+    PINYIN_Ou   =25,
+    PINYIN_U    =26,
+    PINYIN_Ua   =27,
+    PINYIN_Uai  =28,
+    PINYIN_Uan  =29,
+    PINYIN_Uang =30,
+    PINYIN_Ue   =31,
+    PINYIN_Ueng =32,
+    PINYIN_Ui   =33,
+    PINYIN_Un   =34,
+    PINYIN_Uo   =35,
+    PINYIN_V    =36,
+    PINYIN_Van  =37,
+    PINYIN_Ve   =38,
+    PINYIN_Vn   =39,
+    PINYIN_LastFinal = PINYIN_Vn,    /**< the last final */
+    PINYIN_Number_Of_Finals = PINYIN_LastFinal + 1    /**< count of finals, including the zero final */
+};
+
+/**
+ * @brief enums of pinyin tone element.
+ *
+ * PinyinKey stores a tone in a 3-bit field, so every value must stay
+ * below 8.
+ */
+enum PinyinTone
+{
+    PINYIN_ZeroTone = 0,    /**< zero tone. this will be matched with all other tones. */
+    PINYIN_First  = 1,
+    PINYIN_Second = 2,
+    PINYIN_Third  = 3,
+    PINYIN_Fourth = 4,
+    PINYIN_Fifth  = 5,
+    PINYIN_LastTone = PINYIN_Fifth, /**< the last tone */
+    PINYIN_Number_Of_Tones = PINYIN_LastTone + 1 /**< count of tones, including the zero tone */
+};
+
+/**
+ * @brief enums of Shuang Pin Schemes.
+ *
+ * Passed to PinyinShuangPinParser to select a predefined key mapping.
+ */
+enum PinyinShuangPinScheme
+{
+    SHUANG_PIN_STONE      = 0,
+    SHUANG_PIN_ZRM        = 1,
+    SHUANG_PIN_MS         = 2,
+    SHUANG_PIN_ZIGUANG    = 3,
+    SHUANG_PIN_ABC        = 4,
+    SHUANG_PIN_LIUSHI     = 5,
+    SHUANG_PIN_CUSTOMIZED = 6,
+    SHUANG_PIN_DEFAULT    = SHUANG_PIN_ZRM    /**< alias of the scheme used by default */
+};
+
+/**
+ * @brief enums of ZhuYin Schemes.
+ */
+enum PinyinZhuYinScheme
+{
+    ZHUYIN_ZHUYIN   = 0,
+    ZHUYIN_STANDARD = 1,
+    ZHUYIN_HSU      = 2,
+    ZHUYIN_IBM      = 3,
+    ZHUYIN_GIN_YIEH = 4,
+    ZHUYIN_ET       = 5,
+    ZHUYIN_ET26     = 6,
+    ZHUYIN_DEFAULT  = ZHUYIN_STANDARD    /**< alias of the scheme used by default */
+};
+
+/**
+ * @brief enums of pinyin ambiguities.
+ *
+ * Some pinyin elements may be confused with each other by some users;
+ * these ambiguities can be allowed individually.
+ *
+ * The values index PinyinCustomSettings::use_ambiguities.
+ */
+enum PinyinAmbiguity
+{
+    PINYIN_AmbAny= 0,    /**< in set_use_ambiguities(): apply to all entries at once */
+    PINYIN_AmbZhiZi,
+    PINYIN_AmbChiCi,
+    PINYIN_AmbShiSi,
+    PINYIN_AmbNeLe,
+    PINYIN_AmbLeRi,
+    PINYIN_AmbFoHe,
+    PINYIN_AmbAnAng,
+    PINYIN_AmbEnEng,
+    PINYIN_AmbInIng,
+    PINYIN_AmbLast = PINYIN_AmbInIng    /**< the last ambiguity */
+};
+
+/**
+ * @brief Structure to hold pinyin custom settings.
+ *
+ * user can custom the behavor of libpinyin by these settings.
+ */
+struct PinyinCustomSettings
+{
+    bool use_incomplete;
+        /**< allow incomplete pinyin key which only has inital. */
+
+    bool use_ambiguities [PINYIN_AmbLast + 1];
+        /**< allow ambiguous pinyin elements or not. */
+
+    PinyinCustomSettings ();
+
+    void set_use_incomplete (bool use) { use_incomplete = use; }
+    void set_use_ambiguities (PinyinAmbiguity amb, bool use)
+    {
+        if (amb == PINYIN_AmbAny)
+            for (size_t i=0; i<=PINYIN_AmbLast; ++i) use_ambiguities [i] = use;
+        else {
+            use_ambiguities [0] = false;
+                   use_ambiguities [static_cast<size_t>(amb)] = use;
+            for (size_t i=1; i<=PINYIN_AmbLast; ++i)
+                if (use_ambiguities [i]) {
+                    use_ambiguities [0] = true;
+                    break;
+                }
+        }
+    }
+
+    bool operator == (const PinyinCustomSettings &rhs) const
+    {
+        if (use_incomplete != rhs.use_incomplete)
+            return false;
+
+        for (size_t i=0; i <= PINYIN_AmbLast; ++i)
+            if (use_ambiguities [i] != rhs.use_ambiguities [i])
+                return false;
+
+        return true;
+    }
+
+    bool operator != (const PinyinCustomSettings &rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+    guint32 to_value () const
+    {
+        guint32 val = 0;
+
+        if (use_incomplete) val |= 1;
+
+        for (size_t i=0; i <= PINYIN_AmbLast; ++i)
+            if (use_ambiguities [i])
+                val |= (1 << (i+1));
+
+        return val;
+    }
+
+    void from_value (guint32 val)
+    {
+        use_incomplete = ((val & 1) != 0);
+
+        for (size_t i=0; i <= PINYIN_AmbLast; ++i)
+            use_ambiguities [i] = ((val & (1 << (i+1))) != 0);
+    }
+};
+
+/**
+ * @brief Pinyin key class.
+ * 
+ * A pinyin key is a composed element of an initial, a final and a tone,
+ * which represents one or several Chinese ideographs
+ *
+ * The position and length information for the portion of string, from which
+ * the PinyinKey is parsed, are also stored in this structure.
+ *
+ * NOTE(review): no position or length member is visible in this struct;
+ * PinyinKeyPos appears to carry that information instead -- confirm.
+ */
+struct PinyinKey
+{
+    /* These friends read the private bit-fields directly. */
+    friend class PinyinBitmapIndexLevel;
+    friend inline int pinyin_exact_compare(const PinyinKey key_lhs[], 
+                                          const PinyinKey key_rhs[],
+                                          int word_length);
+    friend inline int pinyin_compare_with_ambiguities
+    (const PinyinCustomSettings &custom,
+     const PinyinKey* key_lhs,
+     const PinyinKey* key_rhs,
+     int word_length);
+    friend inline void compute_lower_value(const PinyinCustomSettings &custom,
+                                          PinyinKey in_keys[], 
+                                          PinyinKey out_keys[], 
+                                          int word_length);
+    friend inline void compute_upper_value(const PinyinCustomSettings &custom,
+                                          PinyinKey in_keys[], 
+                                          PinyinKey out_keys[], 
+                                          int word_length);
+    
+private:
+    /* 5 + 6 + 3 = 14 bits of guint16 bit-fields. */
+    guint16 m_initial : 5;   /**< pinyin initial */
+    guint16 m_final   : 6;   /**< pinyin final */
+    guint16 m_tone    : 3;   /**< pinyin tone */
+public:
+    /**
+     * @brief Minimal numerical value of a PinyinKey
+     * @sa get_value();
+     */
+    static const guint16 min_value;
+
+    /**
+     * @brief Maximal numerical value of a PinyinKey
+     * @sa get_value();
+     */
+    static const guint16 max_value;
+
+public:
+    /**
+     * Constructor.
+     *
+     * The default constructor of class PinyinKey.
+     * Every omitted component defaults to its zero value.
+     */
+    PinyinKey (PinyinInitial initial = PINYIN_ZeroInitial,
+               PinyinFinal   final   = PINYIN_ZeroFinal,
+               PinyinTone    tone    = PINYIN_ZeroTone)
+        : m_initial (initial), m_final (final), m_tone (tone)
+    {
+    }
+
+    /**
+     * Constructor.
+     *
+     * Construct a PinyinKey object from a key string, with
+     * specified validator.
+     *
+     * @sa PinyinValidator
+     */
+    PinyinKey (const PinyinValidator &validator, const char *str, int len = -1)
+    {
+        set (validator, str, len);
+    }
+
+    /**
+     * Constructor.
+     *
+     * Construct a PinyinKey object from its numerical value.
+     * The constructor is not explicit, so a guint16 converts implicitly.
+     *
+     * @sa get_value();
+     */
+    PinyinKey (guint16 value)
+    {
+        set (value);
+    }
+    /**
+     * Clear the PinyinKey object.
+     */
+
+    void clear ()
+    {
+        m_initial = PINYIN_ZeroInitial;
+        m_final   = PINYIN_ZeroFinal;
+        m_tone    = PINYIN_ZeroTone;
+    }
+
+    /**
+     * Read PinyinKey value from a key string.
+     * 
+     * @param validator a PinyinValidator object to validate the key.
+     * @param str a Latin string including one or more pinyin keys.
+     * @param len length of str; defaults to -1 (presumably "use the whole
+     *            string" -- confirm against the definition).
+     * @return the number of characters used by this pinyin key.
+     */ 
+    int set (const PinyinValidator &validator, const char *str, int len = -1);
+
+    /**
+     * Set PinyinKey's value to initial, final and tone.
+     */
+    void set (PinyinInitial initial = PINYIN_ZeroInitial,
+              PinyinFinal final     = PINYIN_ZeroFinal,
+              PinyinTone tone       = PINYIN_ZeroTone)
+    {
+        m_initial = initial;
+        m_final   = final;
+        m_tone    = tone;
+    }
+
+    /**
+     * @brief Set this PinyinKey from its numerical value.
+     *
+     * Inverse of get_value(): the tone is the lowest mixed-radix digit,
+     * then the final, then the initial.
+     */
+    void set (guint16 value)
+    {
+        m_tone = value % PINYIN_Number_Of_Tones;
+        value /= PINYIN_Number_Of_Tones;
+        m_final = value % PINYIN_Number_Of_Finals;
+        m_initial = value / PINYIN_Number_Of_Finals;
+    }
+
+    /**
+     * @brief Get numerical value of this PinyinKey
+     *
+     * Computed as (initial * number of finals + final) * number of tones
+     * + tone.
+     */
+    guint16 get_value () const
+    {
+        return (m_initial * PINYIN_Number_Of_Finals + m_final) * PINYIN_Number_Of_Tones + m_tone;
+    }
+
+    /**
+     * Set PinyinKey's initial value to initial.
+     */
+    void set_initial (PinyinInitial initial = PINYIN_ZeroInitial)
+    {
+        m_initial = initial;
+    }
+
+    /**
+     * Set PinyinKey's final value to final.
+     */
+    void set_final (PinyinFinal final = PINYIN_ZeroFinal)
+    {
+        m_final = final;
+    }
+
+    /**
+     * Set PinyinKey's tone value to tone.
+     */
+    void set_tone (PinyinTone tone = PINYIN_ZeroTone)
+    {
+        m_tone = tone;
+    }
+
+    /**
+     * Get initial value of this key.
+     */
+    PinyinInitial get_initial () const
+    {
+        return static_cast<PinyinInitial>(m_initial);
+    }
+
+    /**
+     * Get final value of this key.
+     */
+    PinyinFinal get_final () const
+    {
+        return static_cast<PinyinFinal>(m_final);
+    }
+
+    /**
+     * Get tone value of this key.
+     */
+    PinyinTone get_tone () const
+    {
+        return static_cast<PinyinTone>(m_tone);
+    }
+
+    /**
+     * Get Latin name of this key's initial.
+     */
+    const char* get_initial_string () const;
+
+    /**
+     * Get Chinese ZhuYin name of this key's initial, in UTF-8 encoding.
+     */
+    const char* get_initial_zhuyin_string () const;
+
+    /**
+     * Get Latin name of this key's final.
+     */
+    const char* get_final_string () const;
+
+    /**
+     * Get Chinese ZhuYin name of this key's final, in UTF-8 encoding.
+     */
+    const char* get_final_zhuyin_string () const;
+
+    /**
+     * Get Latin name of this key's tone.
+     */
+    const char* get_tone_string () const;
+
+    /**
+     * Get Chinese ZhuYin name of this key's tone, in UTF-8 encoding.
+     */
+    const char* get_tone_zhuyin_string () const;
+
+    /**
+     * Get Latin name of this key.
+     */
+    const char * get_key_string () const;
+
+    /**
+     * Get Chinese ZhuYin name of this key, in UTF-8 encoding.
+     */
+    const char * get_key_zhuyin_string () const;
+
+    /**
+     * Check if this key is empty, i.e. initial, final and tone are all zero.
+     */
+    bool is_empty () const
+    {
+        return  m_initial == PINYIN_ZeroInitial && m_final == PINYIN_ZeroFinal && m_tone == PINYIN_ZeroTone;
+    }
+
+    /**
+     * Check if this key has all of initial, final and tone.
+     */
+    bool is_complete () const
+    {
+        return m_initial != PINYIN_ZeroInitial && m_final != PINYIN_ZeroFinal && m_tone != PINYIN_ZeroTone;
+    }
+
+    /**
+     * Exact equality: initial, final and tone must all match.
+     * No ambiguity or zero-tone matching is applied here.
+     */
+    bool operator == (PinyinKey rhs) const
+    {
+        return m_initial == rhs.m_initial && m_final == rhs.m_final && m_tone == rhs.m_tone;
+    }
+
+    /**
+     * Exact inequality: true when any of the three components differs.
+     */
+    bool operator != (PinyinKey rhs) const
+    {
+        return m_initial != rhs.m_initial || m_final != rhs.m_final || m_tone != rhs.m_tone;
+    }
+
+    /**
+     * Lexicographic ordering by initial, then final, then tone.
+     */
+    bool operator < (PinyinKey rhs) const
+    {
+        if (m_initial < rhs.m_initial) return true;
+        if (m_initial > rhs.m_initial) return false;
+        if (m_final < rhs.m_final) return true;
+        if (m_final > rhs.m_final) return false;
+        return m_tone < rhs.m_tone;
+    }
+
+    /**
+     * Lexicographic ordering by initial, then final, then tone.
+     */
+    bool operator > (PinyinKey rhs) const
+    {
+        if (m_initial > rhs.m_initial) return true;
+        if (m_initial < rhs.m_initial) return false;
+        if (m_final > rhs.m_final) return true;
+        if (m_final < rhs.m_final) return false;
+        return m_tone > rhs.m_tone;
+    }
+};
+
+/**
+ * Abstract validator of PinyinKey object.
+ *
+ * This class is the interface for validating a PinyinKey object;
+ * concrete validators override operator ().
+ */
+class PinyinValidator
+{
+public:
+    /**
+     * Virtual destructor, so that a validator can be safely destroyed
+     * through a pointer to this base class.
+     */
+    virtual ~PinyinValidator () {}
+
+    /**
+     * Overloaded operator () function to validate a pinyin key.
+     *
+     * @param key The key to be validated.
+     * @return true if the key is valid.
+     */
+    virtual bool operator () (PinyinKey key) const = 0;
+};
+
+class PinyinLargeTable;
+/**
+ * Validator of PinyinKey object.
+ *
+ * This class is for validating a PinyinKey object.
+ * It keeps a bitmap and is set up from a PinyinLargeTable.
+ */
+class BitmapPinyinValidator:public PinyinValidator
+{
+    /* Sized to hold one bit per initial/final/tone combination, rounded up to whole bytes. */
+    char m_bitmap [(PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 7) / 8];
+
+public:
+    /**
+     * Constructor.
+     *
+     * @param table the PinyinLargeTable to build the validator from;
+     *              defaults to 0 (null).
+     */
+    BitmapPinyinValidator (const PinyinLargeTable *table = 0);
+
+    /**
+     * initialize the validator with the specified PinyinLargeTable.
+     *
+     * NOTE(review): the behavior for a null table is defined in the
+     * implementation file -- confirm there.
+     */
+    void initialize (const PinyinLargeTable *table = 0);
+
+    /**
+     * Overloaded operator () function to validate a pinyin key.
+     *
+     * @param key The key to be validated.
+     * @return true if the key is valid.
+     */
+    virtual bool operator () (PinyinKey key) const;
+};
+
+/**
+ * NULL Validator of PinyinKey object.
+ *
+ * A validator that performs no checking: every key is reported valid.
+ */
+class NullPinyinValidator:public PinyinValidator
+{
+public:
+    /**
+     * Overloaded operator () function to validate a pinyin key.
+     *
+     * The key itself is ignored.
+     *
+     * @return always true.
+     */
+    virtual bool operator () (PinyinKey) const { return true; }
+};
+
+/**
+ * @brief Class to translate string into PinyinKey.
+ *
+ * Abstract base class: parse_one_key() and parse() are pure virtual.
+ */
+class PinyinParser
+{
+public: 
+    virtual ~PinyinParser ();
+
+    /**
+     * @brief Translate only one PinyinKey from a string.
+     *
+     * @param validator PinyinValidator object to validate the result.
+     * @param key Stores result PinyinKey.
+     * @param str Input string in UTF-8 encoding, in most case this string is just a plain ASCII string,
+     *            but for ZhuYin Parser works in ZHUYIN_ZHUYIN scheme,
+     *            it's an UTF-8 string which contains ZhuYin chars.
+     * @param len The length of str, in number of chars rather than bytes.
+     *
+     * @return the number of chars that were actually used.
+     */
+    virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const = 0;
+
+    /**
+     * @brief Handy wrapper function of parse_one_key(), which takes the string by reference
+     *        and computes its length in UTF-8 characters itself.
+     */
+    int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char * &str) const
+    {
+        return parse_one_key (validator, key, str, g_utf8_strlen (str, -1));
+    }
+
+    /**
+     * @brief Translate the source string into a set of PinyinKeys.
+     *
+     * @param validator PinyinValidator object to validate the result.
+     * @param keys Stores result PinyinKeys.
+     * @param poses Array of PinyinKeyPos (presumably filled with one position per parsed key -- confirm in the implementations).
+     * @param str Input string in UTF-8 encoding, in most case this string is just a plain ASCII string,
+     *            but for ZhuYin Parser works in ZHUYIN_ZHUYIN scheme,
+     *            it's an UTF-8 string which contains ZhuYin chars.
+     * @param len The length of str, in number of chars rather than bytes.
+     *
+     * @return the number of chars that were actually used.
+     */
+    virtual int parse (const PinyinValidator &validator, PinyinKeyVector & keys,PinyinKeyPosVector & poses, const char *str, int len = -1) const = 0;
+
+public:
+    /* Static helper operating on a key in place; defined in the implementation file. */
+    static void normalize (PinyinKey &key);
+};
+
+/**
+ * The default Pinyin Parser which parses full pinyin string into PinyinKeys.
+ */
+class PinyinDefaultParser : public PinyinParser
+{
+public: 
+    virtual ~PinyinDefaultParser ();
+
+    /* Overrides of the pure virtual PinyinParser interface. */
+    virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const;
+    virtual int parse (const PinyinValidator &validator, PinyinKeyVector & keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const;
+
+public:
+    /* Keep the base-class overloads visible; the overrides above would otherwise hide them. */
+    using PinyinParser::parse_one_key;
+    using PinyinParser::parse;
+};
+
+/* Parser for ShuangPin input.
+ *
+ * The valid input chars of ShuangPin are a-z and ';'
+ */
+class PinyinShuangPinParser : public PinyinParser
+{
+    /* 27 slots each -- presumably one per valid input char (a-z and ';'); confirm in the implementation. */
+    PinyinInitial m_initial_map [27];
+    PinyinFinal   m_final_map [27][2];
+
+public:
+    /**
+     * Constructor 
+     *
+     * @param scheme the predefined ShuangPin scheme to be used.
+     */
+    PinyinShuangPinParser (PinyinShuangPinScheme scheme = SHUANG_PIN_DEFAULT);
+
+    /**
+     * Constructor taking explicit initial and final maps instead of a
+     * predefined scheme.
+     */
+    PinyinShuangPinParser (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]);
+
+    virtual ~PinyinShuangPinParser ();
+
+    /* Overrides of the pure virtual PinyinParser interface. */
+    virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const;
+    virtual int parse (const PinyinValidator &validator, PinyinKeyVector &keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const;
+
+public:
+    /* Select a predefined scheme, or supply explicit maps. */
+    void set_scheme (PinyinShuangPinScheme scheme);
+    void set_scheme (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]);
+
+    /* Output parameters: the caller supplies arrays of the same dimensions. */
+    void get_scheme (PinyinInitial initial_map[27], PinyinFinal final_map[27][2]);
+
+public:
+    /* Keep the base-class overloads visible; the overrides above would otherwise hide them. */
+    using PinyinParser::parse_one_key;
+    using PinyinParser::parse;
+};
+
+/**
+ * Compare two pinyin initials under the given custom settings.
+ *
+ * @return 0 when the initials are treated as matching (this includes
+ *         ambiguous pairs whose ambiguity is enabled in custom),
+ *         -1 when lhs is numerically smaller than rhs, 1 otherwise.
+ */
+int pinyin_compare_initial (const PinyinCustomSettings &custom,
+                           PinyinInitial lhs,
+                           PinyinInitial rhs);
+
+/**
+ * Compare two pinyin finals under the given custom settings.
+ *
+ * @return 0 when the finals are treated as matching (equal, an enabled
+ *         ambiguous pair, or a zero final while incomplete keys are
+ *         allowed), -1 when lhs is numerically smaller than rhs,
+ *         1 otherwise.
+ */
+int pinyin_compare_final (const PinyinCustomSettings &custom,
+                         PinyinFinal lhs,
+                         PinyinFinal rhs);
+
+/**
+ * Compare two pinyin tones.
+ *
+ * @return 0 when the tones are equal or either one is the zero tone,
+ *         -1 when lhs is numerically smaller than rhs, 1 otherwise.
+ */
+int pinyin_compare_tone (const PinyinCustomSettings &custom,
+                        PinyinTone lhs,
+                        PinyinTone rhs);
+};
+
+using namespace novel;
+
+#endif
diff --git a/src/storage/pinyin_large_table.cpp b/src/storage/pinyin_large_table.cpp
new file mode 100644 (file)
index 0000000..794cca5
--- /dev/null
@@ -0,0 +1,690 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <assert.h>
+#include <string.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+
+
+/* Store the custom settings pointer and zero every slot of the
+ * length-index table. */
+PinyinBitmapIndexLevel::PinyinBitmapIndexLevel(PinyinCustomSettings * custom)
+    : m_custom(custom)
+{
+    memset(m_pinyin_length_indexes, 0, sizeof m_pinyin_length_indexes);
+}
+
+void PinyinBitmapIndexLevel::reset(){
+    for ( int k = PINYIN_ZeroInitial; k < PINYIN_Number_Of_Initials; k++)
+       for ( int m = PINYIN_ZeroFinal; m < PINYIN_Number_Of_Finals; m++)
+           for ( int n = PINYIN_ZeroTone; n < PINYIN_Number_Of_Tones; n++){
+               PinyinLengthIndexLevel * length_array = 
+                   m_pinyin_length_indexes[k][m][n];
+               if ( length_array )
+                   delete length_array;
+           }
+}
+
+/* Entry point of the lookup: the search starts at the initial level
+ * and its result is returned unchanged. */
+int PinyinBitmapIndexLevel::search( int phrase_length, /* in */ PinyinKey keys[],
+           /* out */ PhraseIndexRanges ranges) const{
+    const int status = initial_level_search(phrase_length, keys, ranges);
+    return status;
+}
+
+/* First level of the lookup: dispatch on the initial of keys[0].
+ *
+ * The initial itself is always searched.  When the matching ambiguity
+ * is enabled in the custom settings, the confusable initial is searched
+ * as well and the results are OR-ed together.
+ *
+ * Returns the OR of the results of the final_level_search() calls made.
+ */
+int PinyinBitmapIndexLevel::initial_level_search(int phrase_length, 
+                                                /* in */PinyinKey keys[],
+                                                /* out */ PhraseIndexRanges ranges) const{
+
+/* MATCH expands to one `case ORIGIN:` that searches ORIGIN and, if
+ * AMBIGUITY is enabled, ANOTHER too, then returns. */
+#define MATCH(AMBIGUITY, ORIGIN, ANOTHER)  case ORIGIN:                        \
+    {                                                                   \
+       result |= final_level_search((PinyinInitial)first_key.m_initial,\
+                                   phrase_length, keys, ranges);               \
+       if ( custom.use_ambiguities [AMBIGUITY] ){                      \
+           result |= final_level_search(ANOTHER,                       \
+                                        phrase_length, keys, ranges);  \
+       }                                                               \
+       return result;                                                  \
+    }
+    
+    //deal with the ambiguities
+
+    int result = 0;
+    PinyinKey& first_key = keys[0];
+    PinyinCustomSettings &  custom= *m_custom;
+    
+    switch(first_key.m_initial){
+       
+       MATCH(PINYIN_AmbZhiZi, PINYIN_Zi, PINYIN_Zhi);
+       MATCH(PINYIN_AmbZhiZi, PINYIN_Zhi, PINYIN_Zi);
+       MATCH(PINYIN_AmbChiCi, PINYIN_Ci, PINYIN_Chi);
+       MATCH(PINYIN_AmbChiCi, PINYIN_Chi, PINYIN_Ci);
+       MATCH(PINYIN_AmbShiSi, PINYIN_Si, PINYIN_Shi);
+       MATCH(PINYIN_AmbShiSi, PINYIN_Shi, PINYIN_Si);
+       MATCH(PINYIN_AmbLeRi, PINYIN_Ri, PINYIN_Le);
+       MATCH(PINYIN_AmbNeLe, PINYIN_Ne, PINYIN_Le);
+       MATCH(PINYIN_AmbFoHe, PINYIN_Fo, PINYIN_He);
+       MATCH(PINYIN_AmbFoHe, PINYIN_He, PINYIN_Fo);
+
+    /* Le takes part in two ambiguities (Le/Ri and Ne/Le), so it cannot
+     * use the single-alternative MATCH macro. */
+    case PINYIN_Le:
+       {
+           result |= final_level_search((PinyinInitial)first_key.m_initial, 
+                                       phrase_length, keys, ranges);  
+           if ( custom.use_ambiguities [PINYIN_AmbLeRi] )              
+               result |= final_level_search(PINYIN_Ri, phrase_length,
+                                            keys, ranges);     
+           if ( custom.use_ambiguities [PINYIN_AmbNeLe] )              
+               result |= final_level_search(PINYIN_Ne, phrase_length, 
+                                            keys, ranges);
+           return result;
+       }
+    /* Every other initial has no ambiguity: search it alone. */
+    default:
+       {
+           return final_level_search((PinyinInitial)first_key.m_initial,
+                                     phrase_length, 
+                                     keys, ranges);
+       }
+  }
+#undef MATCH 
+}
+
+/* Second level of the lookup: dispatch on the final of keys[0] for the
+ * given initial.
+ *
+ * A zero final matches nothing unless incomplete keys are allowed, in
+ * which case every final from PINYIN_A upwards is searched.  For the
+ * finals that take part in an ambiguity, the confusable final is also
+ * searched when that ambiguity is enabled.
+ *
+ * Returns the OR of the results of the tone_level_search() calls made,
+ * or 0 when none was made.
+ */
+int PinyinBitmapIndexLevel::final_level_search(PinyinInitial initial,
+                                              int phrase_length, 
+                                              /* in */PinyinKey keys[],
+                                              /* out */ PhraseIndexRanges ranges) const{
+/* MATCH expands to one `case ORIGIN:` that searches ORIGIN and, if
+ * AMBIGUITY is enabled, ANOTHER too, then returns. */
+#define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN:                         \
+    {                                                                  \
+       result = tone_level_search(initial,(PinyinFinal) first_key.m_final,\
+                                  phrase_length, keys, ranges);                \
+       if ( custom.use_ambiguities [AMBIGUITY] ){                      \
+           result |= tone_level_search(initial, ANOTHER,               \
+                                       phrase_length, keys, ranges);   \
+       }                                                               \
+       return result;                                                  \
+    }
+    
+    int result = 0;
+    PinyinKey& first_key = keys[0];
+    PinyinCustomSettings &  custom= *m_custom;
+
+    switch(first_key.m_final){
+    case PINYIN_ZeroFinal:
+       {
+           /* incomplete key: only searchable when the setting allows it */
+           if (!custom.use_incomplete )
+               return result;
+           /* try every real final; the zero final slot itself is skipped */
+           for ( int i  = PINYIN_A; i < PINYIN_Number_Of_Finals; ++i){
+               result |= tone_level_search(initial,(PinyinFinal)i , 
+                                           phrase_length, keys, ranges);
+           }
+           return result;
+       }
+       
+       MATCH(PINYIN_AmbAnAng, PINYIN_An, PINYIN_Ang);
+       MATCH(PINYIN_AmbAnAng, PINYIN_Ang, PINYIN_An);
+       MATCH(PINYIN_AmbEnEng, PINYIN_En, PINYIN_Eng);
+       MATCH(PINYIN_AmbEnEng, PINYIN_Eng, PINYIN_En);
+       MATCH(PINYIN_AmbInIng, PINYIN_In, PINYIN_Ing);
+       MATCH(PINYIN_AmbInIng, PINYIN_Ing, PINYIN_In);
+       
+    /* Every other final has no ambiguity: search it alone. */
+    default:
+       {
+           return tone_level_search(initial,(PinyinFinal)first_key.m_final, 
+                                    phrase_length, keys, ranges);
+       }
+    }
+#undef MATCH
+}
+
+int PinyinBitmapIndexLevel::tone_level_search(PinyinInitial initial, 
+                                             PinyinFinal final,
+                                             int phrase_length, 
+                                             /* in */PinyinKey keys[],
+                                             /* out */ PhraseIndexRanges ranges) const{
+    int result = 0;
+    PinyinKey& first_key = keys[0];
+    PinyinCustomSettings &  custom= *m_custom;
+
+    switch ( first_key.m_tone ){
+    case PINYIN_ZeroTone:
+       {
+               //deal with ZeroTone in pinyin table files.
+           for ( int i = PINYIN_ZeroTone; i < PINYIN_Number_Of_Tones; ++i){
+               PinyinLengthIndexLevel * phrases = 
+                   m_pinyin_length_indexes[initial][final][(PinyinTone)i];
+               if ( phrases )
+                   result |= phrases->search(phrase_length - 1, &custom,
+                                             keys + 1, ranges);
+           }
+           return result;
+       }
+    default:
+       {
+           PinyinLengthIndexLevel * phrases = 
+               m_pinyin_length_indexes[initial][final]
+               [PINYIN_ZeroTone];
+           if ( phrases )
+               result = phrases->search(phrase_length - 1, &custom,
+                                        keys + 1, ranges);
+           phrases = m_pinyin_length_indexes[initial][final]
+               [(PinyinTone) first_key.m_tone];
+           if ( phrases )
+               result |= phrases->search(phrase_length - 1, &custom, 
+                                         keys + 1, ranges);
+           return result;
+       }
+    }
+       return result;
+}
+
+/* Create the empty array of per-length index pointers. */
+PinyinLengthIndexLevel::PinyinLengthIndexLevel(){
+    const gboolean zero_terminated = FALSE;
+    const gboolean clear_new_elements = TRUE;   /* new slots start out zeroed */
+
+    m_pinyin_array_indexes = g_array_new(zero_terminated, clear_new_elements,
+                                         sizeof(void *));
+}
+
+/* Destroy every array index level stored in m_pinyin_array_indexes,
+ * then free the GArray itself.
+ *
+ * The element at position i is a PinyinArrayIndexLevel<i> *; the switch
+ * is needed because the length is a template argument and so must be a
+ * compile-time constant.  Positions above 15 are not supported and trip
+ * the assertion.
+ */
+PinyinLengthIndexLevel::~PinyinLengthIndexLevel(){
+/* CASE(x) deletes the element at index x with its exact template type. */
+#define CASE(x) case x:                                                        \
+    {                                                                  \
+       PinyinArrayIndexLevel<x> * array = g_array_index                \
+           (m_pinyin_array_indexes, PinyinArrayIndexLevel<x> *, x);    \
+       if (array)                                                      \
+           delete array;                                               \
+       break;                                                          \
+    }
+    for ( int i = 0 ; i < m_pinyin_array_indexes->len; ++i){
+       switch (i){
+           CASE(0);
+           CASE(1);
+           CASE(2);
+           CASE(3);
+           CASE(4);
+           CASE(5);
+           CASE(6);
+           CASE(7);
+           CASE(8);
+           CASE(9);
+           CASE(10);
+           CASE(11);
+           CASE(12);
+           CASE(13);
+           CASE(14);
+           CASE(15);
+       default:
+           assert(false);
+       }
+    }
+    /* TRUE: also release the element storage of the GArray */
+    g_array_free(m_pinyin_array_indexes, TRUE);
+#undef CASE
+}
+
+int PinyinLengthIndexLevel::search( int phrase_length,
+                                   /* in */ PinyinCustomSettings * custom,
+                                   /* in */ PinyinKey keys[],
+                                   /* out */ PhraseIndexRanges ranges){
+    int result = SEARCH_NONE;
+    if(m_pinyin_array_indexes->len < phrase_length + 1)
+       return result;
+    if (m_pinyin_array_indexes->len > phrase_length + 1)
+       result |= SEARCH_CONTINUED;
+    
+#define CASE(len) case len:                                            \
+    {                                                                   \
+       PinyinArrayIndexLevel<len> * array = g_array_index              \
+           (m_pinyin_array_indexes, PinyinArrayIndexLevel<len> *, len); \
+       if ( !array )                                                   \
+           return result;                                              \
+       result |= array->search(custom, keys, ranges);                  \
+       return result;                                                  \
+    }
+
+    switch ( phrase_length ){
+       CASE(0);
+       CASE(1);
+       CASE(2);
+       CASE(3);
+       CASE(4);
+       CASE(5);
+       CASE(6);
+       CASE(7);
+       CASE(8);
+       CASE(9);
+       CASE(10);
+       CASE(11);
+       CASE(12);
+       CASE(13);
+       CASE(14);
+       CASE(15);
+    default:
+       assert(false);
+    }
+#undef CASE
+}
+
+template<size_t phrase_length>
+int PinyinArrayIndexLevel<phrase_length>::search(/* in */ PinyinCustomSettings * custom, /* in */ PinyinKey keys[], /* out */ PhraseIndexRanges ranges){
+  PhraseExactLessThan<phrase_length> m_lessthan;
+  PinyinIndexItem<phrase_length> * chunk_begin, * chunk_end;
+  chunk_begin = (PinyinIndexItem<phrase_length> *)m_chunk.begin();
+  chunk_end = (PinyinIndexItem<phrase_length> *)m_chunk.end();
+  //do the search
+  PinyinKey left_keys[phrase_length], right_keys[phrase_length];
+  compute_lower_value(*custom, keys, left_keys, phrase_length);
+  compute_upper_value(*custom, keys, right_keys, phrase_length);
+  PinyinIndexItem<phrase_length> left(left_keys, -1), right(right_keys, -1);
+
+  PinyinIndexItem<phrase_length> * begin = std_lite::lower_bound(chunk_begin, chunk_end, left, m_lessthan);
+  PinyinIndexItem<phrase_length> * end = std_lite::upper_bound(chunk_begin, chunk_end, right, m_lessthan);
+
+  return convert(custom, keys, begin, end, ranges);
+}
+
+template<size_t phrase_length>
+int PinyinArrayIndexLevel<phrase_length>::convert(PinyinCustomSettings * custom, PinyinKey keys[], PinyinIndexItem<phrase_length> * begin, PinyinIndexItem<phrase_length> * end, PhraseIndexRanges ranges){
+    PinyinIndexItem<phrase_length> * iter;
+    PhraseIndexRange cursor;
+    GArray * head, *cursor_head = NULL;
+    int result = SEARCH_NONE;
+    cursor.m_range_begin = -1; cursor.m_range_end = -1;
+    for ( iter = begin; iter != end; ++iter){
+       if ( ! 0 == 
+            pinyin_compare_with_ambiguities
+            (*custom, keys, iter->m_keys, phrase_length))
+           continue;
+       phrase_token_t token = iter->m_token;
+       head = ranges[PHRASE_INDEX_LIBRARY_INDEX(token)];
+       if ( NULL == head )
+           continue;
+
+        result |= SEARCH_OK;
+
+       if ( cursor.m_range_begin == -1 ){
+           cursor.m_range_begin = token;
+           cursor.m_range_end = token + 1;
+           cursor_head = head;
+       }else if (cursor.m_range_end == token && 
+                 PHRASE_INDEX_LIBRARY_INDEX(cursor.m_range_end) == 
+                 PHRASE_INDEX_LIBRARY_INDEX(token) ){
+           cursor.m_range_end++;
+       }else {
+           g_array_append_val(cursor_head, cursor);
+           cursor.m_range_begin = token; cursor.m_range_end = token + 1;
+           cursor_head = head;
+       }
+    }
+    if ( cursor.m_range_begin == -1 )
+       return result;
+
+    g_array_append_val(cursor_head, cursor);
+    return result;
+}
+
+/* Add token under the slot selected by the first key.  The length level
+ * for that slot is created on first use; the remaining keys and
+ * phrase_length - 1 are handed on to it.
+ * Returns the result of PinyinLengthIndexLevel::add_index().
+ */
+int PinyinBitmapIndexLevel::add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+    const PinyinKey & first = keys[0];
+    PinyinLengthIndexLevel * & slot =
+       m_pinyin_length_indexes[first.m_initial][first.m_final][first.m_tone];
+    if ( NULL == slot )
+       slot = new PinyinLengthIndexLevel();
+    return slot->add_index(phrase_length - 1, keys + 1, token);
+}
+
+/* Remove token from the slot selected by the first key.
+ * Returns REMOVE_ITEM_DONOT_EXISTS when that slot has no length level,
+ * otherwise the result of PinyinLengthIndexLevel::remove_index() for the
+ * remaining keys.
+ */
+int PinyinBitmapIndexLevel::remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+    PinyinKey firstkey = keys[0];
+    PinyinLengthIndexLevel * &length_array = 
+       m_pinyin_length_indexes[firstkey.m_initial][firstkey.m_final][firstkey.m_tone];
+    if ( length_array )
+       /* this used to call add_index(), inserting the token it was asked
+        * to remove */
+       return length_array->remove_index(phrase_length - 1, keys + 1, token);
+    return REMOVE_ITEM_DONOT_EXISTS;
+}
+
+/* Add token to the array level stored in slot phrase_length, growing
+ * m_pinyin_array_indexes and creating that level when needed.
+ * Returns the result of PinyinArrayIndexLevel::add_index().
+ * NOTE(review): when assert() is compiled out, an unsupported
+ * phrase_length falls off the end of the function without a return value.
+ */
+int PinyinLengthIndexLevel::add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+    assert(phrase_length + 1 < MAX_PHRASE_LENGTH);
+    /* make sure slot phrase_length exists */
+    if ( m_pinyin_array_indexes -> len <= phrase_length )
+       g_array_set_size(m_pinyin_array_indexes, phrase_length + 1);
+/* CASE(x): dispatch the run-time length x to PinyinArrayIndexLevel<x>. */
+#define CASE(x)        case x:                                              \
+    {                                                                \
+       PinyinArrayIndexLevel<x> * &array = g_array_index            \
+           (m_pinyin_array_indexes, PinyinArrayIndexLevel<x> *, x); \
+       if ( !array )                                                \
+           array = new PinyinArrayIndexLevel<x>;                    \
+       return array->add_index(keys, token);                                \
+    }
+    switch(phrase_length){
+       CASE(0);
+       CASE(1);
+       CASE(2);
+       CASE(3);
+       CASE(4);
+       CASE(5);
+       CASE(6);
+       CASE(7);
+       CASE(8);
+       CASE(9);
+       CASE(10);
+       CASE(11);
+       CASE(12);
+       CASE(13);
+       CASE(14);
+       CASE(15);
+    default:
+       assert(false);
+    }
+#undef CASE
+}
+
+int PinyinLengthIndexLevel::remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+    assert(phrase_length + 1 < MAX_PHRASE_LENGTH);
+    if ( m_pinyin_array_indexes -> len <= phrase_length )
+       return false;
+#define CASE(x)        case x:                                                 \
+    {                                                                  \
+       PinyinArrayIndexLevel<x> * &array = g_array_index               \
+           (m_pinyin_array_indexes, PinyinArrayIndexLevel<x> *, x);    \
+       if ( !array )                                                   \
+           return false;                                               \
+       return array->remove_index(keys, token);                        \
+    }
+    switch(phrase_length){
+       CASE(0);
+       CASE(1);
+       CASE(2);
+       CASE(3);
+       CASE(4);
+       CASE(5);
+       CASE(6);
+       CASE(7);
+       CASE(8);
+       CASE(9);
+       CASE(10);
+       CASE(11);
+       CASE(12);
+       CASE(13);
+       CASE(14);
+       CASE(15);
+    default:
+       assert(false);
+    }
+#undef CASE
+}
+
+template<size_t phrase_length>
+int PinyinArrayIndexLevel<phrase_length>::add_index(/* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+    PhraseExactLessThan<phrase_length> m_lessthan;
+    PinyinIndexItem<phrase_length> * buf_begin, * buf_end;
+
+    PinyinIndexItem<phrase_length> new_elem(keys, token);
+    buf_begin = (PinyinIndexItem<phrase_length> *) m_chunk.begin();
+    buf_end = (PinyinIndexItem<phrase_length> *) m_chunk.end();
+
+    std_lite::pair<PinyinIndexItem<phrase_length> *, PinyinIndexItem<phrase_length> *> range;
+    range = std_lite::equal_range(buf_begin, buf_end, new_elem, m_lessthan);
+
+    PinyinIndexItem<phrase_length> * cur_elem;
+    for ( cur_elem = range.first; 
+         cur_elem != range.second; ++cur_elem){
+       if ( cur_elem->m_token == token )
+           return INSERT_ITEM_EXISTS;
+       if ( cur_elem->m_token > token )
+           break;
+    }
+
+    int offset = (cur_elem - buf_begin) *
+       sizeof(PinyinIndexItem<phrase_length>);
+    m_chunk.insert_content(offset, &new_elem, 
+                          sizeof ( PinyinIndexItem<phrase_length> ));
+    return INSERT_OK;
+}
+
+template<size_t phrase_length>
+int PinyinArrayIndexLevel<phrase_length>::remove_index(/* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+    PhraseExactLessThan<phrase_length> m_lessthan;
+    PinyinIndexItem<phrase_length> * buf_begin, * buf_end;
+
+    PinyinIndexItem<phrase_length> new_elem(keys, token);
+    buf_begin = (PinyinIndexItem<phrase_length> *) m_chunk.begin();
+    buf_end = (PinyinIndexItem<phrase_length> *) m_chunk.end();
+
+    std_lite::pair<PinyinIndexItem<phrase_length> *, PinyinIndexItem<phrase_length> *> range;
+    range = std_lite::equal_range(buf_begin, buf_end, new_elem, m_lessthan);
+
+    PinyinIndexItem<phrase_length> * cur_elem;
+    for ( cur_elem = range.first; 
+         cur_elem != range.second; ++cur_elem){
+       if ( cur_elem->m_token == token )
+           break;
+    }
+    if (cur_elem->m_token != token )
+       return REMOVE_ITEM_DONOT_EXISTS;
+
+    int offset = (cur_elem - buf_begin) *
+       sizeof(PinyinIndexItem<phrase_length>);
+    m_chunk.remove_content(offset, sizeof (PinyinIndexItem<phrase_length>));
+    return REMOVE_OK;
+}
+
+bool PinyinLargeTable::load_text(FILE * infile){
+    char pinyin[256];
+    char phrase[256];
+    phrase_token_t token;
+    size_t freq;    
+    while ( !feof(infile)){
+        fscanf(infile, "%s", pinyin);
+        fscanf(infile, "%s", phrase);
+        fscanf(infile, "%ld", &token);
+       fscanf(infile, "%ld", &freq);   
+       
+       PinyinDefaultParser parser;
+       NullPinyinValidator validator;
+       PinyinKeyVector keys;
+       PinyinKeyPosVector poses;
+       
+       keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
+       poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
+       parser.parse(validator, keys, poses, pinyin);
+       
+       add_index( keys->len, (PinyinKey *)keys->data, token);
+
+       g_array_free(keys, true);
+       g_array_free(poses, true);
+    }
+       return true;
+}
+
+/* Rebuild the bitmap level from the bytes of chunk starting at offset.
+ *
+ * The data begins with a table of table_offset_t values: one start
+ * offset, then one end offset per (initial, final, tone) slot, visited in
+ * the same nested loop order as below.  A slot whose begin and end
+ * offsets are equal is empty; any other slot gets a new
+ * PinyinLengthIndexLevel loaded from [phrase_begin, phrase_end - 1).
+ *
+ * end is only used in an assertion as the upper limit for slot offsets.
+ * Always returns true; layout violations are caught by assert() only.
+ */
+bool PinyinBitmapIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
+                                 table_offset_t end){
+    reset();
+    char * buf_begin = (char *) chunk->begin();
+    table_offset_t phrase_begin, phrase_end;
+    table_offset_t * index = (table_offset_t *) (buf_begin + offset);
+    phrase_end = *index;
+    for ( int m = 0; m < PINYIN_Number_Of_Initials; ++m )
+       for ( int n = 0; n < PINYIN_Number_Of_Finals; ++n)
+           for ( int k = 0; k < PINYIN_Number_Of_Tones; ++k){
+               /* the end of the previous slot is the start of this one */
+               phrase_begin = phrase_end;
+               index++;
+               phrase_end = *index;
+               if ( phrase_begin == phrase_end ) //null pointer
+                   continue;
+               PinyinLengthIndexLevel * phrases = new PinyinLengthIndexLevel;
+               m_pinyin_length_indexes[m][n][k] = phrases;
+               /* the byte at phrase_end - 1 is the separator, not data */
+               phrases->load(chunk, phrase_begin, phrase_end - 1);
+               assert( phrase_end <= end );
+               assert( *(buf_begin + phrase_end - 1) == c_separate);
+           }
+    /* the offset table itself must be followed by a separator */
+    offset += (PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 1) * sizeof ( table_offset_t);
+    assert( c_separate == *(buf_begin + offset));
+    return true;
+}
+
+bool PinyinBitmapIndexLevel::store(MemoryChunk * new_chunk, 
+                                  table_offset_t offset,
+                                  table_offset_t & end){
+    table_offset_t phrase_end;
+    table_offset_t index = offset;
+    offset += (PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 1) * sizeof ( table_offset_t);
+    //add '#'
+    new_chunk->set_content(offset, &c_separate, sizeof(char));
+    offset += sizeof(char);
+    new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+    index += sizeof(table_offset_t);
+    for ( int m = 0; m < PINYIN_Number_Of_Initials; ++m )
+       for ( int n = 0; n < PINYIN_Number_Of_Finals; ++n)
+           for ( int k = 0; k < PINYIN_Number_Of_Tones; ++k){
+               PinyinLengthIndexLevel * phrases = m_pinyin_length_indexes[m][n][k];
+               if ( !phrases ){ //null pointer
+                   new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+                   index += sizeof(table_offset_t);
+                   continue;
+               }
+               phrases->store(new_chunk, offset, phrase_end); //has a end '#'
+               offset = phrase_end;
+               //add '#'
+               new_chunk->set_content(offset, &c_separate, sizeof(char));
+               offset += sizeof(char);
+               new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+               index += sizeof(table_offset_t);
+           }
+    end = offset;
+    return true;
+}
+
+bool PinyinLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
+    char * buf_begin = (char *) chunk->begin();
+    guint32 nindex = *((guint32 *)(buf_begin + offset));
+    table_offset_t * index = (table_offset_t *)
+       (buf_begin + offset + sizeof(guint32));
+
+    table_offset_t phrase_begin, phrase_end = *index;
+    m_pinyin_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
+    for ( size_t i = 1; i <= nindex; i++){
+       phrase_begin = phrase_end;
+       index++;
+       phrase_end = *index;
+       if ( phrase_begin == phrase_end ){
+           void * null = NULL;
+           g_array_append_val(m_pinyin_array_indexes , null);
+           continue;
+       }
+
+#define CASE(x) case x - 1:                                            \
+       {                                                               \
+           PinyinArrayIndexLevel<x> * phrase = new PinyinArrayIndexLevel<x>; \
+           phrase->load(chunk, phrase_begin, phrase_end - 1);          \
+           assert( *(buf_begin + phrase_end - 1) == c_separate);       \
+           assert( phrase_end <= end );                                \
+           g_array_append_val(m_pinyin_array_indexes, phrase);         \
+           break;                                                      \
+       }
+       switch ( i ){
+           CASE(0);
+           CASE(1);
+           CASE(2);
+           CASE(3);
+           CASE(4);
+           CASE(5);
+           CASE(6);
+           CASE(7);
+           CASE(8);
+           CASE(9);
+           CASE(10);
+           CASE(11);
+           CASE(12);
+           CASE(13);
+           CASE(14);
+           CASE(15);
+       default:
+           assert(false);
+       }
+#undef CASE
+    }
+    offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
+    assert ( c_separate == * (buf_begin + offset) );
+    return true;
+}
+
+/* Serialise the length level into new_chunk, starting at offset.
+ *
+ * Layout written: a guint32 slot count, a table of table_offset_t values
+ * (one start offset followed by one end offset per slot), a separator
+ * byte, then the data of every non-NULL slot, each followed by a
+ * separator byte.  A NULL slot repeats the current offset, so its begin
+ * and end are equal.
+ *
+ * end receives the offset just past the last byte written.
+ * Always returns true.
+ */
+bool PinyinLengthIndexLevel::store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end){
+    guint32 nindex = m_pinyin_array_indexes->len;
+    new_chunk->set_content(offset, &nindex, sizeof(guint32));
+    /* position of the next entry to fill in the offset table */
+    table_offset_t index = offset + sizeof(guint32);
+
+    /* skip count and offset table, close them with '#', record data start */
+    offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
+    new_chunk->set_content(offset, &c_separate, sizeof(char));
+    offset += sizeof(char);
+    new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+    index += sizeof(table_offset_t);
+    table_offset_t phrase_end;
+    for ( size_t i = 0 ; i < m_pinyin_array_indexes->len; ++i){
+/* CASE(x): store slot x as a PinyinArrayIndexLevel<x>.  The "continue"
+ * inside the macro applies to the enclosing for loop, the "break" to the
+ * switch. */
+#define CASE(x) case x:        {                                               \
+           PinyinArrayIndexLevel<x> * phrase = g_array_index           \
+               (m_pinyin_array_indexes, PinyinArrayIndexLevel<x> * , i); \
+           if ( !phrase ){                                             \
+               new_chunk->set_content                                  \
+                   (index, &offset, sizeof(table_offset_t));           \
+               index += sizeof(table_offset_t);                        \
+               continue;                                               \
+           }                                                           \
+           phrase->store(new_chunk, offset, phrase_end);               \
+           offset = phrase_end;                                        \
+           /*add '#'*/                                                 \
+           new_chunk->set_content(offset, &c_separate, sizeof(char));  \
+           offset += sizeof(char);                                     \
+           new_chunk->set_content(index, &offset, sizeof(table_offset_t));\
+           index += sizeof(table_offset_t);                            \
+           break;                                                      \
+       }
+       switch ( i ){
+           CASE(0);
+           CASE(1);
+           CASE(2);
+           CASE(3);
+           CASE(4);
+           CASE(5);
+           CASE(6);
+           CASE(7);
+           CASE(8);
+           CASE(9);
+           CASE(10);
+           CASE(11);
+           CASE(12);
+           CASE(13);
+           CASE(14);
+           CASE(15);
+       default:
+           assert(false);
+       }
+#undef CASE                                                    
+    }
+    end = offset;
+    return true;
+}
+
+/* Point m_chunk at the bytes [offset, end) of chunk.
+ * NOTE(review): the NULL third argument of set_chunk() presumably means
+ * m_chunk does not free these bytes - confirm in memory_chunk.h.
+ * Always returns true.
+ */
+template<size_t phrase_length>
+bool PinyinArrayIndexLevel<phrase_length>::
+load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
+    char * const base = (char *) chunk->begin();
+    char * const items = base + offset;
+    m_chunk.set_chunk(items, end - offset, NULL);
+    return true;
+}
+
+/* Copy the whole content of m_chunk into new_chunk at offset.
+ * end receives the offset just past the copied bytes.
+ * Always returns true.
+ */
+template<size_t phrase_length>
+bool PinyinArrayIndexLevel<phrase_length>::
+store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end){
+    new_chunk->set_content(offset,
+                          m_chunk.begin(),
+                          m_chunk.size());
+
+    end = offset + m_chunk.size();
+
+    return true;
+}
diff --git a/src/storage/pinyin_large_table.h b/src/storage/pinyin_large_table.h
new file mode 100755 (executable)
index 0000000..71b3640
--- /dev/null
@@ -0,0 +1,178 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef PINYIN_LARGE_TABLE_H
+#define PINYIN_LARGE_TABLE_H
+
+#include <stdio.h>
+#include "novel_types.h"
+#include "memory_chunk.h"
+
+namespace novel{
+
+/* Because this is not large,
+ * Store this in user home directory.
+ */
+
+class PinyinLengthIndexLevel;
+
+/* First level of the pinyin index: one PinyinLengthIndexLevel pointer for
+ * every (initial, final, tone) combination of a phrase's first key.
+ */
+class PinyinBitmapIndexLevel{
+    friend class PinyinLargeTable;
+    /* NOTE(review): presumably the settings passed to the constructor;
+     * the constructor body is not part of this header. */
+    PinyinCustomSettings * m_custom;
+protected:
+    /* indexed by m_initial, m_final and m_tone of the first key */
+    PinyinLengthIndexLevel * m_pinyin_length_indexes[PINYIN_Number_Of_Initials]
+                                                    [PINYIN_Number_Of_Finals]
+                                                    [PINYIN_Number_Of_Tones];
+    //search function
+    int initial_level_search(int word_length, /* in */PinyinKey keys[],
+                            /* out */ PhraseIndexRanges ranges) const;
+    int final_level_search(PinyinInitial initial, int word_length, /* in */PinyinKey keys[], /* out */ PhraseIndexRanges ranges) const;
+    int tone_level_search(PinyinInitial initial, PinyinFinal final, int word_length, /* in */PinyinKey keys[], /* out */ PhraseIndexRanges ranges) const;
+    void reset();
+public:
+    PinyinBitmapIndexLevel(PinyinCustomSettings * custom);
+    ~PinyinBitmapIndexLevel(){
+       reset();
+    }
+
+    /* binary load/store; store() reports the offset after the last byte
+     * written through its end parameter */
+    bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+    bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t &end);
+
+    /*bool load_text(FILE * file);*/
+    /*bool save_text(FILE * file);*/
+    
+    /*search/add_index method */
+    int search( int phrase_length, /* in */ PinyinKey keys[],
+               /* out */ PhraseIndexRanges ranges) const;
+    int add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token); 
+    int remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+};
+
+/* Second level of the pinyin index: an array of PinyinArrayIndexLevel
+ * pointers, one slot per phrase_length value.
+ */
+class PinyinLengthIndexLevel{
+protected:
+    /* slot n holds a PinyinArrayIndexLevel<n> * (or NULL) */
+    GArray* m_pinyin_array_indexes;
+public:
+    PinyinLengthIndexLevel();
+    ~PinyinLengthIndexLevel();
+    /* binary load/store; store() reports the offset after the last byte
+     * written through its end parameter */
+    bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+    bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end);
+    
+    /*search/add_index method */
+    int search( int phrase_length, /* in */ PinyinCustomSettings * custom,
+               /* in */ PinyinKey keys[],
+               /* out */ PhraseIndexRanges ranges);
+    int add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token); 
+    int remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+};
+
+/* Third level of the pinyin index: a MemoryChunk that is accessed as an
+ * array of PinyinIndexItem<phrase_length>.
+ */
+template<size_t phrase_length>
+class PinyinArrayIndexLevel{
+protected:
+    /* storage for the PinyinIndexItem<phrase_length> items */
+    MemoryChunk m_chunk;
+    /* turns the candidate items [begin, end) into token ranges */
+    int convert(PinyinCustomSettings * custom,
+               PinyinKey keys[],
+               PinyinIndexItem<phrase_length> * begin,
+               PinyinIndexItem<phrase_length> * end,
+               PhraseIndexRanges ranges);
+public:
+    /* binary load/store; store() reports the offset after the last byte
+     * written through its end parameter */
+    bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+    bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end);
+    
+    /*search/add_index method */
+    int search(/* in */ PinyinCustomSettings * custom,
+              /* in */ PinyinKey keys[],
+              /* out */ PhraseIndexRanges ranges);
+    int add_index(/* in */ PinyinKey keys[] ,/* in */ phrase_token_t token); 
+    int remove_index(/* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+};
+
+
+/* TODO: add file version check */
+/* Public entry point of the pinyin index.  Every operation is forwarded
+ * to the PinyinBitmapIndexLevel member; the class additionally keeps the
+ * MemoryChunk the table was loaded from.
+ */
+class PinyinLargeTable{
+protected:
+    PinyinBitmapIndexLevel m_bitmap_table;
+    /* chunk passed to load(); deleted by reset() */
+    MemoryChunk * m_chunk;
+
+    /* deletes the chunk kept by a previous load(), if any */
+    void reset(){
+       if ( m_chunk ){
+           delete m_chunk;
+           m_chunk = NULL;
+       }
+    }
+
+public:
+    PinyinLargeTable(PinyinCustomSettings * custom):
+       m_bitmap_table(custom){
+       m_chunk = NULL;
+    }
+    
+    ~PinyinLargeTable(){
+       reset();
+    }
+
+    /*load/save method*/
+    /* The table keeps chunk and deletes it on the next load() or on
+     * destruction, so the caller must not delete it. */
+    bool load(MemoryChunk * chunk){
+       reset();
+       m_chunk = chunk;
+       return m_bitmap_table.load(chunk, 0 , chunk->size());
+    }
+
+    /* Writes the table into new_chunk from offset 0; the end offset
+     * reported by the bitmap level is discarded. */
+    bool store(MemoryChunk * new_chunk){
+       table_offset_t end;
+       return m_bitmap_table.store(new_chunk, 0, end);
+    }
+    
+    bool load_text(FILE * file);
+/*
+    bool save_text(FILE * file){
+       return m_bitmap_table.save_text(file);
+    }
+*/
+    
+    /*search/add_index method */
+    int search( int phrase_length, /* in */ PinyinKey keys[],
+               /* out */ PhraseIndexRanges ranges){
+       return m_bitmap_table.search(phrase_length, keys, ranges);
+    }
+
+    int add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+       return m_bitmap_table.add_index(phrase_length, keys, token);
+    }
+    int remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+       return m_bitmap_table.remove_index(phrase_length, keys, token);
+    }
+
+    /* True when a search for the single key sets SEARCH_OK.  Only
+     * ranges[1] is given an array, so tokens whose library index is not 1
+     * are skipped by the search. */
+    bool has_key(PinyinKey key) const {
+       PhraseIndexRanges ranges;
+       memset(ranges, 0, sizeof(ranges));
+       ranges[1] = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange));
+       int result = m_bitmap_table.search(1, &key, ranges);
+       g_array_free(ranges[1], TRUE);
+       ranges[1] = NULL;
+       return result & SEARCH_OK;
+    }
+};
+
+};
+
+using namespace novel;
+#endif
diff --git a/src/storage/pinyin_phrase.h b/src/storage/pinyin_phrase.h
new file mode 100644 (file)
index 0000000..07ee0de
--- /dev/null
@@ -0,0 +1,298 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef PINYIN_PHRASE_H
+#define PINYIN_PHRASE_H
+
+#include <string.h>
+#include "stl_lite.h"
+
+namespace novel{
+
+/* Reduce an integer to its sign: 1 for positive, -1 for negative and 0
+ * for zero. */
+static inline int pinyin_utility_sign(int value){
+  return (value > 0) - (value < 0);
+}
+
+inline int pinyin_exact_compare(const PinyinKey key_lhs[], 
+                               const PinyinKey key_rhs[],
+                               int phrase_length){
+  int i;
+  int result;
+  for ( i = 0 ; i < phrase_length ; i++){
+    result = key_lhs[i].m_initial - key_rhs[i].m_initial;
+    if ( result != 0 )
+      return pinyin_utility_sign(result);
+  }
+  for( i = 0 ; i < phrase_length ; i++){
+    result = key_lhs[i].m_final - key_rhs[i].m_final;
+    if ( result != 0 )
+      return pinyin_utility_sign(result);
+  }
+  for( i = 0 ; i < phrase_length ; i++){
+    result = key_lhs[i].m_tone - key_rhs[i].m_tone;
+    if ( result != 0 )
+      return pinyin_utility_sign(result);
+  }
+  return 0;
+}
+
+
+inline int pinyin_compare_with_ambiguities(const PinyinCustomSettings &custom,
+                                          const PinyinKey* key_lhs,
+                                          const PinyinKey* key_rhs,
+                                          int phrase_length){
+    int i;
+    int result;
+    for ( i = 0 ; i < phrase_length ; i++){
+       result = pinyin_compare_initial
+           (custom, 
+            (PinyinInitial)key_lhs[i].m_initial, 
+            (PinyinInitial)key_rhs[i].m_initial);
+       if ( result != 0 )
+           return result;
+    }
+    for( i = 0 ; i < phrase_length ; i++){
+       result = pinyin_compare_final
+           (custom, 
+            (PinyinFinal)key_lhs[i].m_final, 
+            (PinyinFinal)key_rhs[i].m_final);
+       if ( result != 0 )
+           return result;
+    }
+    for( i = 0 ; i < phrase_length ; i++){
+       result = pinyin_compare_tone
+           (custom,
+            (PinyinTone)key_lhs[i].m_tone,
+            (PinyinTone)key_rhs[i].m_tone);
+       if ( result != 0 )
+           return result;
+    }
+    return 0;
+}
+
+//compute pinyin lower bound
+//maybe replace by table lookup
+inline void compute_lower_value(const PinyinCustomSettings &custom,
+                               PinyinKey in_keys[], 
+                               PinyinKey out_keys[], 
+                               int phrase_length){
+    PinyinKey aKey = in_keys[0];
+    
+    for ( int i = 0; i < phrase_length; i++){
+       int k; int sel;
+       aKey = in_keys[i];
+       //deal with initial
+       sel = aKey.m_initial;
+       for( k = aKey.m_initial - 1; k >= PINYIN_ZeroInitial; k--){
+           if ( 0 != pinyin_compare_initial(custom, 
+                                            (PinyinInitial)k, 
+                                            (PinyinInitial)aKey.m_initial) )
+               break;
+           else
+               sel = k;
+       }
+       aKey.m_initial = (PinyinInitial)sel;
+       //deal with final
+       sel = aKey.m_final;
+       for( k = aKey.m_final - 1; k >= PINYIN_ZeroFinal; k--){
+           if ( 0 != pinyin_compare_final(custom,
+                                          (PinyinFinal)k,
+                                          (PinyinFinal)aKey.m_final) )
+               break;
+           else
+               sel = k;
+       }
+       aKey.m_final = (PinyinFinal)sel;
+       //deal with tone
+       sel = aKey.m_tone;
+       for( k = aKey.m_tone - 1; k >= PINYIN_ZeroTone; k--){
+           if ( 0 != pinyin_compare_tone(custom,
+                                         (PinyinTone)k, 
+                                         (PinyinTone)aKey.m_tone) )
+               break;
+           else
+           sel = k;
+       }
+       aKey.m_tone = (PinyinTone)sel;
+       //save the result
+       out_keys[i] = aKey;
+    }
+}
+
+//compute pinyin upper bound
+//maybe replace by table lookup
+inline void compute_upper_value(const PinyinCustomSettings &custom,
+                               PinyinKey in_keys[], 
+                               PinyinKey out_keys[],
+                               int phrase_length){
+    PinyinKey aKey = in_keys[0];
+    
+    for ( int i = 0; i < phrase_length; i++){
+       int k; int sel;
+       aKey = in_keys[i];
+       //deal with initial
+       sel = aKey.m_initial;
+       for( k = aKey.m_initial + 1; k <= PINYIN_LastInitial; k++){
+           if ( 0 != pinyin_compare_initial(custom, (PinyinInitial)k, (PinyinInitial)aKey.m_initial) )
+               break;
+           else
+               sel = k;
+       }
+       aKey.m_initial = (PinyinInitial)sel;
+       //deal with final
+       sel = aKey.m_final;
+       for( k = aKey.m_final + 1; k <= PINYIN_LastFinal; k++){
+           if ( 0 != pinyin_compare_final(custom, (PinyinFinal)k, (PinyinFinal)aKey.m_final) )
+               break;
+           else
+               sel = k;
+       }
+       aKey.m_final = (PinyinFinal)sel;
+       //deal with tone
+       sel = aKey.m_tone;
+       for( k = aKey.m_tone + 1; k <= PINYIN_LastTone; k++){
+           if ( 0 != pinyin_compare_tone(custom, (PinyinTone)k, (PinyinTone)aKey.m_tone) )
+               break;
+           else
+               sel = k;
+       }
+       aKey.m_tone = (PinyinTone)sel;
+       //save the result
+       out_keys[i] = aKey;
+    }
+}
+
+/* One entry of the pinyin index: a phrase token together with the
+ * phrase_length keys it is indexed under.
+ */
+template<int phrase_length>
+struct PinyinIndexItem{
+    phrase_token_t m_token;
+    PinyinKey m_keys[phrase_length];
+public:
+    /* Copies phrase_length keys from keys and stores token.
+     * The constructor is named by the injected class name: declaring it
+     * as "PinyinIndexItem<phrase_length>(...)" is no longer valid from
+     * C++20 on. */
+    PinyinIndexItem(PinyinKey * keys, phrase_token_t token){
+       memmove(m_keys, keys, sizeof(PinyinKey) * phrase_length);
+       m_token = token;
+    }
+};
+
+/*
+//just need less than  mode
+//this method mainly used in pinyin lookup
+template<int phrase_length>
+class PhraseCompareWithAmbiguities
+    : public std_lite::binary_function <const PinyinIndexItem <phrase_length>,
+                                  const PinyinIndexItem <phrase_length>, int>
+{
+    const PinyinCustomSettings & m_custom;
+public:
+    PhraseCompareWithAmbiguities<phrase_length>
+       (const PinyinCustomSettings & custom):m_custom(custom){}
+
+    int operator () (const PinyinIndexItem<phrase_length> &lhs,
+                      const PinyinIndexItem<phrase_length> &rhs) const{
+      PinyinKey * key_lhs = (PinyinKey *) lhs.m_keys;
+      PinyinKey * key_rhs = (PinyinKey *) rhs.m_keys;
+      return pinyin_compare_with_ambiguities(m_custom, 
+                                            key_lhs, key_rhs, phrase_length);
+    }
+};
+*/
+
+//for find the element in the phrase array
+template<int phrase_length>
+class PhraseExactCompare
+  : public std_lite::binary_function <const PinyinIndexItem<phrase_length>
+                                ,const PinyinIndexItem<phrase_length>, int>
+{
+public:
+  int operator () (const PinyinIndexItem<phrase_length> &lhs,
+                  const PinyinIndexItem<phrase_length> &rhs) const{
+    PinyinKey * key_lhs = (PinyinKey *) lhs.m_keys;
+    PinyinKey * key_rhs = (PinyinKey *) rhs.m_keys;
+    
+    return pinyin_exact_compare(key_lhs, key_rhs, phrase_length);
+  }
+};
+
+/*
+//for find the element in the phrase array
+template<int phrase_length>
+class PhraseExactCompareWithToken
+  : public std_lite::binary_function <const PinyinIndexItem<phrase_length>
+                                ,const PinyinIndexItem<phrase_length>, int>
+{
+public:
+  int operator () (const PinyinIndexItem<phrase_length> &lhs,
+                  const PinyinIndexItem<phrase_length> &rhs) const{
+    PinyinKey * key_lhs = (PinyinKey *) lhs.m_keys;
+    PinyinKey * key_rhs = (PinyinKey *) rhs.m_keys;
+    
+    phrase_token_t token_lhs = lhs.m_token;
+    phrase_token_t token_rhs = rhs.m_token;
+    
+    int result = pinyin_exact_compare(key_lhs, key_rhs, phrase_length);
+    if ( !result )
+       return result;
+    return pinyin_utility_sign(token_lhs - token_rhs);
+  }
+};
+*/
+
+template<int phrase_length>
+class PhraseExactLessThan
+    : public std_lite::binary_function <const PinyinIndexItem<phrase_length>
+                                  ,const PinyinIndexItem<phrase_length>,
+    bool>
+{
+ private:
+  PhraseExactCompare<phrase_length> m_compare;
+ public:
+  bool operator () (const PinyinIndexItem<phrase_length> &lhs,
+                  const PinyinIndexItem<phrase_length> &rhs) const{
+    return -1 == m_compare(lhs, rhs);
+  }
+};
+
+/*
+template<int phrase_length>
+class PhraseExactLessThanWithToken
+    : public std_lite::binary_function <const PinyinIndexItem<phrase_length>
+                                  ,const PinyinIndexItem<phrase_length>,
+    bool>
+{
+ private:
+  PhraseExactCompareWithToken<phrase_length> m_compare;
+ public:
+  bool operator () (const PinyinIndexItem<phrase_length> &lhs,
+                  const PinyinIndexItem<phrase_length> &rhs) const{
+    return -1 == m_compare(lhs, rhs);
+  }
+};
+*/
+
+};
+
+using namespace novel;
+
+#endif
diff --git a/src/storage/pinyin_zhuyin_map_data.h b/src/storage/pinyin_zhuyin_map_data.h
new file mode 100644 (file)
index 0000000..7557c5e
--- /dev/null
@@ -0,0 +1,582 @@
+static const PinyinKey __zhuyin_standard_map [][3] = /* Key table for the zhuyin "standard" scheme: one row per ASCII character from ' ' to '|', in order; three PinyinKey slots per row. */
+{ /* The comment before each row names its character; the comment after each slot labels the value (empty label for PinyinKey(0)). Presumably indexed by (ch - ' ') -- TODO confirm in the lookup code. */
+/*   */{PinyinKey(1)     /* 1 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ! */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* " */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* # */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* $ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* % */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* & */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ' */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ( */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ) */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* * */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* + */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* , */{PinyinKey(42)   /* eh */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* - */{PinyinKey(66)   /* er */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* . */{PinyinKey(150)  /* ou */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* / */{PinyinKey(60)  /* eng */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 0 */{PinyinKey(18)   /* an */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 1 */{PinyinKey(240)   /* b */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 2 */{PinyinKey(960)   /* d */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 3 */{PinyinKey(3)     /* 3 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 4 */{PinyinKey(4)     /* 4 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 5 */{PinyinKey(5520) /* zh */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 6 */{PinyinKey(2)     /* 2 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 7 */{PinyinKey(5)     /* 5 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 8 */{PinyinKey(6)     /* a */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 9 */{PinyinKey(12)   /* ai */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* : */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ; */{PinyinKey(24)  /* ang */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* < */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* = */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* > */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ? */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* @ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* A */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* B */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* C */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* D */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* E */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* F */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* G */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* H */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* I */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* J */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* K */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* L */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* M */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* N */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* O */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* P */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Q */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* R */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* S */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* T */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* U */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* V */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* W */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* X */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Y */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Z */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* [ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* \ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ] */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ^ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* _ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ` */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* a */{PinyinKey(2640)  /* m */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* b */{PinyinKey(3600)  /* r */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* c */{PinyinKey(1680)  /* h */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* d */{PinyinKey(2160)  /* k */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* e */{PinyinKey(1440)  /* g */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* f */{PinyinKey(3360)  /* q */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* g */{PinyinKey(4080) /* sh */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* h */{PinyinKey(480)   /* c */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* i */{PinyinKey(138)   /* o */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* j */{PinyinKey(156)   /* u */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* k */{PinyinKey(36)    /* e */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* l */{PinyinKey(30)   /* ao */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* m */{PinyinKey(216)   /* v */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* n */{PinyinKey(3840)  /* s */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* o */{PinyinKey(48)   /* ei */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* p */{PinyinKey(54)   /* en */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* q */{PinyinKey(3120)  /* p */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* r */{PinyinKey(1920)  /* j */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* s */{PinyinKey(2880)  /* n */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* t */{PinyinKey(720)  /* ch */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* u */{PinyinKey(72)    /* i */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* v */{PinyinKey(4800)  /* x */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* w */{PinyinKey(4320)  /* t */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* x */{PinyinKey(2400)  /* l */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* y */{PinyinKey(5280)  /* z */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* z */{PinyinKey(1200)  /* f */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* { */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* | */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+};
+
+static const PinyinKey __zhuyin_hsu_map [][3] = /* Key table for the zhuyin "hsu" scheme: one row per ASCII character from ' ' to '|', in order; three PinyinKey slots per row. */
+{ /* Only ' ' and the lowercase letters carry non-zero keys here, and some letters fill two or three slots (e.g. 'j', 'l'). Presumably indexed by (ch - ' ') -- TODO confirm in the lookup code. */
+/*   */{PinyinKey(1)     /* 1 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ! */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* " */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* # */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* $ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* % */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* & */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ' */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ( */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ) */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* * */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* + */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* , */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* - */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* . */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* / */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 0 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 1 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 2 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 3 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 4 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 5 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 6 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 7 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 8 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 9 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* : */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ; */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* < */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* = */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* > */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ? */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* @ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* A */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* B */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* C */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* D */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* E */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* F */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* G */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* H */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* I */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* J */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* K */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* L */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* M */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* N */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* O */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* P */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Q */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* R */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* S */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* T */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* U */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* V */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* W */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* X */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Y */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Z */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* [ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* \ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ] */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ^ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* _ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ` */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* a */{PinyinKey(480)   /* c */, PinyinKey(48)   /* ei */, PinyinKey(0)      /*  */},
+/* b */{PinyinKey(240)   /* b */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* c */{PinyinKey(4800)  /* x */, PinyinKey(4080) /* sh */, PinyinKey(0)      /*  */},
+/* d */{PinyinKey(960)   /* d */, PinyinKey(2)     /* 2 */, PinyinKey(0)      /*  */},
+/* e */{PinyinKey(72)    /* i */, PinyinKey(42)   /* eh */, PinyinKey(0)      /*  */},
+/* f */{PinyinKey(1200)  /* f */, PinyinKey(3)     /* 3 */, PinyinKey(0)      /*  */},
+/* g */{PinyinKey(1440)  /* g */, PinyinKey(36)    /* e */, PinyinKey(0)      /*  */},
+/* h */{PinyinKey(1680)  /* h */, PinyinKey(138)   /* o */, PinyinKey(0)      /*  */},
+/* i */{PinyinKey(12)   /* ai */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* j */{PinyinKey(1920)  /* j */, PinyinKey(5520) /* zh */, PinyinKey(4)     /* 4 */},
+/* k */{PinyinKey(2160)  /* k */, PinyinKey(24)  /* ang */, PinyinKey(0)      /*  */},
+/* l */{PinyinKey(2400)  /* l */, PinyinKey(60)  /* eng */, PinyinKey(66)   /* er */},
+/* m */{PinyinKey(2640)  /* m */, PinyinKey(18)   /* an */, PinyinKey(0)      /*  */},
+/* n */{PinyinKey(2880)  /* n */, PinyinKey(54)   /* en */, PinyinKey(0)      /*  */},
+/* o */{PinyinKey(150)  /* ou */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* p */{PinyinKey(3120)  /* p */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* q */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* r */{PinyinKey(3600)  /* r */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* s */{PinyinKey(3840)  /* s */, PinyinKey(5)     /* 5 */, PinyinKey(0)      /*  */},
+/* t */{PinyinKey(4320)  /* t */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* u */{PinyinKey(216)   /* v */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* v */{PinyinKey(3360)  /* q */, PinyinKey(720)  /* ch */, PinyinKey(0)      /*  */},
+/* w */{PinyinKey(30)   /* ao */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* x */{PinyinKey(156)   /* u */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* y */{PinyinKey(6)     /* a */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* z */{PinyinKey(5280)  /* z */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* { */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* | */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+};
+
+static const PinyinKey __zhuyin_ibm_map [][3] = /* Key table for the zhuyin "ibm" scheme: one row per ASCII character from ' ' to '|', in order; three PinyinKey slots per row. */
+{ /* The comment before each row names its character; the comment after each slot labels the value (empty label for PinyinKey(0)). Only the first slot is non-zero in this table. Presumably indexed by (ch - ' ') -- TODO confirm in the lookup code. */
+/*   */{PinyinKey(1)     /* 1 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ! */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* " */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* # */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* $ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* % */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* & */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ' */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ( */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ) */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* * */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* + */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* , */{PinyinKey(3)     /* 3 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* - */{PinyinKey(1680)  /* h */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* . */{PinyinKey(4)     /* 4 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* / */{PinyinKey(5)     /* 5 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 0 */{PinyinKey(2160)  /* k */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 1 */{PinyinKey(240)   /* b */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 2 */{PinyinKey(3120)  /* p */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 3 */{PinyinKey(2640)  /* m */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 4 */{PinyinKey(1200)  /* f */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 5 */{PinyinKey(960)   /* d */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 6 */{PinyinKey(4320)  /* t */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 7 */{PinyinKey(2880)  /* n */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 8 */{PinyinKey(2400)  /* l */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 9 */{PinyinKey(1440)  /* g */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* : */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ; */{PinyinKey(30)   /* ao */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* < */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* = */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* > */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ? */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* @ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* A */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* B */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* C */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* D */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* E */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* F */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* G */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* H */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* I */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* J */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* K */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* L */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* M */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* N */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* O */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* P */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Q */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* R */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* S */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* T */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* U */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* V */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* W */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* X */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Y */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Z */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* [ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* \ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ] */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ^ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* _ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ` */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* a */{PinyinKey(72)    /* i */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* b */{PinyinKey(60)  /* eng */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* c */{PinyinKey(54)   /* en */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* d */{PinyinKey(216)   /* v */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* e */{PinyinKey(4800)  /* x */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* f */{PinyinKey(6)     /* a */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* g */{PinyinKey(138)   /* o */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* h */{PinyinKey(36)    /* e */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* i */{PinyinKey(5280)  /* z */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* j */{PinyinKey(42)   /* eh */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* k */{PinyinKey(12)   /* ai */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* l */{PinyinKey(48)   /* ei */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* m */{PinyinKey(2)     /* 2 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* n */{PinyinKey(66)   /* er */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* o */{PinyinKey(480)   /* c */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* p */{PinyinKey(3840)  /* s */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* q */{PinyinKey(1920)  /* j */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* r */{PinyinKey(5520) /* zh */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* s */{PinyinKey(156)   /* u */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* t */{PinyinKey(720)  /* ch */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* u */{PinyinKey(3600)  /* r */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* v */{PinyinKey(24)  /* ang */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* w */{PinyinKey(3360)  /* q */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* x */{PinyinKey(18)   /* an */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* y */{PinyinKey(4080) /* sh */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* z */{PinyinKey(150)  /* ou */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* { */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* | */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+};
+
+static const PinyinKey __zhuyin_gin_yieh_map [][3] = /* Key table for the zhuyin "gin_yieh" scheme: one row per ASCII character from ' ' to '|', in order; three PinyinKey slots per row. */
+{ /* The comment before each row names its character; the comment after each slot labels the value (empty label for PinyinKey(0)). Only the first slot is non-zero in this table. Presumably indexed by (ch - ' ') -- TODO confirm in the lookup code. */
+/*   */{PinyinKey(1)     /* 1 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ! */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* " */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* # */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* $ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* % */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* & */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ' */{PinyinKey(60)  /* eng */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ( */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ) */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* * */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* + */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* , */{PinyinKey(6)     /* a */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* - */{PinyinKey(54)   /* en */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* . */{PinyinKey(12)   /* ai */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* / */{PinyinKey(18)   /* an */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 0 */{PinyinKey(48)   /* ei */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 1 */{PinyinKey(5)     /* 5 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 2 */{PinyinKey(240)   /* b */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 3 */{PinyinKey(960)   /* d */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 4 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 5 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 6 */{PinyinKey(5520) /* zh */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 7 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 8 */{PinyinKey(72)    /* i */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 9 */{PinyinKey(138)   /* o */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* : */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ; */{PinyinKey(150)  /* ou */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* < */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* = */{PinyinKey(66)   /* er */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* > */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ? */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* @ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* A */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* B */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* C */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* D */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* E */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* F */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* G */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* H */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* I */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* J */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* K */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* L */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* M */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* N */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* O */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* P */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Q */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* R */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* S */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* T */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* U */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* V */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* W */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* X */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Y */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Z */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* [ */{PinyinKey(24)  /* ang */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* \ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ] */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ^ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* _ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ` */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* a */{PinyinKey(3)     /* 3 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* b */{PinyinKey(4800)  /* x */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* c */{PinyinKey(2400)  /* l */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* d */{PinyinKey(2880)  /* n */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* e */{PinyinKey(4320)  /* t */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* f */{PinyinKey(2160)  /* k */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* g */{PinyinKey(3360)  /* q */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* h */{PinyinKey(4080) /* sh */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* i */{PinyinKey(156)   /* u */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* j */{PinyinKey(480)   /* c */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* k */{PinyinKey(216)   /* v */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* l */{PinyinKey(42)   /* eh */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* m */{PinyinKey(3840)  /* s */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* n */{PinyinKey(3600)  /* r */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* o */{PinyinKey(36)    /* e */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* p */{PinyinKey(30)   /* ao */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* q */{PinyinKey(2)     /* 2 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* r */{PinyinKey(1440)  /* g */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* s */{PinyinKey(2640)  /* m */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* t */{PinyinKey(1920)  /* j */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* u */{PinyinKey(5280)  /* z */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* v */{PinyinKey(1680)  /* h */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* w */{PinyinKey(3120)  /* p */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* x */{PinyinKey(1200)  /* f */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* y */{PinyinKey(720)  /* ch */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* z */{PinyinKey(4)     /* 4 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* { */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* | */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+};
+
+/*
+ * Key map for the "et" zhuyin keyboard scheme (presumably the ETen layout --
+ * confirm against the code that selects this table).
+ *
+ * One row per printable ASCII character from ' ' through '|', in ASCII order
+ * (see the comment in front of each row); presumably indexed by (key - ' ').
+ * Every row has three PinyinKey slots; the comment after each slot is a short
+ * label for the value (apparently an initial, a final or a tone digit), and
+ * PinyinKey(0) with an empty label marks an unused slot.  In this table only
+ * the first slot of a row is ever populated.
+ */
+static const PinyinKey __zhuyin_et_map [][3] = 
+{
+/*   */{PinyinKey(1)     /* 1 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ! */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* " */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* # */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* $ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* % */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* & */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ' */{PinyinKey(480)   /* c */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ( */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ) */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* * */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* + */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* , */{PinyinKey(5520) /* zh */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* - */{PinyinKey(60)  /* eng */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* . */{PinyinKey(720)  /* ch */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* / */{PinyinKey(4080) /* sh */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 0 */{PinyinKey(24)  /* ang */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 1 */{PinyinKey(5)     /* 5 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 2 */{PinyinKey(2)     /* 2 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 3 */{PinyinKey(3)     /* 3 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 4 */{PinyinKey(4)     /* 4 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 5 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 6 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 7 */{PinyinKey(3360)  /* q */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 8 */{PinyinKey(18)   /* an */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 9 */{PinyinKey(54)   /* en */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* : */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ; */{PinyinKey(5280)  /* z */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* < */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* = */{PinyinKey(66)   /* er */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* > */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ? */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* @ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* A */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* B */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* C */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* D */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* E */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* F */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* G */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* H */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* I */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* J */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* K */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* L */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* M */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* N */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* O */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* P */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Q */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* R */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* S */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* T */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* U */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* V */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* W */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* X */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Y */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Z */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* [ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* \ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ] */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ^ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* _ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ` */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* a */{PinyinKey(6)     /* a */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* b */{PinyinKey(240)   /* b */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* c */{PinyinKey(4800)  /* x */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* d */{PinyinKey(960)   /* d */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* e */{PinyinKey(72)    /* i */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* f */{PinyinKey(1200)  /* f */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* g */{PinyinKey(1920)  /* j */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* h */{PinyinKey(1680)  /* h */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* i */{PinyinKey(12)   /* ai */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* j */{PinyinKey(3600)  /* r */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* k */{PinyinKey(2160)  /* k */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* l */{PinyinKey(2400)  /* l */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* m */{PinyinKey(2640)  /* m */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* n */{PinyinKey(2880)  /* n */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* o */{PinyinKey(138)   /* o */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* p */{PinyinKey(3120)  /* p */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* q */{PinyinKey(48)   /* ei */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* r */{PinyinKey(36)    /* e */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* s */{PinyinKey(3840)  /* s */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* t */{PinyinKey(4320)  /* t */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* u */{PinyinKey(216)   /* v */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* v */{PinyinKey(1440)  /* g */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* w */{PinyinKey(42)   /* eh */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* x */{PinyinKey(156)   /* u */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* y */{PinyinKey(150)  /* ou */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* z */{PinyinKey(30)   /* ao */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* { */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* | */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+};
+
+/*
+ * Key map for the "et26" zhuyin keyboard scheme (presumably the 26-key ETen
+ * layout -- confirm against the code that selects this table).
+ *
+ * Same shape as the other zhuyin maps in this file: one row per printable
+ * ASCII character from ' ' through '|' in ASCII order (see the row comments),
+ * three PinyinKey slots per row, PinyinKey(0) marking an unused slot.
+ * Apart from the space row, only the letter rows 'a'..'z' are populated, and
+ * several letters carry two candidates (for example 'c' is labelled both
+ * "x" and "sh"); the third slot is never used here.
+ */
+static const PinyinKey __zhuyin_et26_map [][3] = 
+{
+/*   */{PinyinKey(1)     /* 1 */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ! */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* " */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* # */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* $ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* % */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* & */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ' */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ( */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ) */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* * */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* + */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* , */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* - */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* . */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* / */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 0 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 1 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 2 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 3 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 4 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 5 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 6 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 7 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 8 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* 9 */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* : */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ; */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* < */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* = */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* > */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ? */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* @ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* A */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* B */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* C */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* D */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* E */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* F */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* G */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* H */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* I */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* J */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* K */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* L */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* M */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* N */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* O */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* P */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Q */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* R */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* S */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* T */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* U */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* V */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* W */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* X */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Y */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* Z */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* [ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* \ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ] */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ^ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* _ */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* ` */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* a */{PinyinKey(6)     /* a */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* b */{PinyinKey(240)   /* b */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* c */{PinyinKey(4800)  /* x */, PinyinKey(4080) /* sh */, PinyinKey(0)      /*  */},
+/* d */{PinyinKey(960)   /* d */, PinyinKey(5)     /* 5 */, PinyinKey(0)      /*  */},
+/* e */{PinyinKey(72)    /* i */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* f */{PinyinKey(1200)  /* f */, PinyinKey(2)     /* 2 */, PinyinKey(0)      /*  */},
+/* g */{PinyinKey(1920)  /* j */, PinyinKey(5520) /* zh */, PinyinKey(0)      /*  */},
+/* h */{PinyinKey(1680)  /* h */, PinyinKey(66)   /* er */, PinyinKey(0)      /*  */},
+/* i */{PinyinKey(12)   /* ai */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* j */{PinyinKey(3600)  /* r */, PinyinKey(3)     /* 3 */, PinyinKey(0)      /*  */},
+/* k */{PinyinKey(2160)  /* k */, PinyinKey(4)     /* 4 */, PinyinKey(0)      /*  */},
+/* l */{PinyinKey(2400)  /* l */, PinyinKey(60)  /* eng */, PinyinKey(0)      /*  */},
+/* m */{PinyinKey(2640)  /* m */, PinyinKey(18)   /* an */, PinyinKey(0)      /*  */},
+/* n */{PinyinKey(2880)  /* n */, PinyinKey(54)   /* en */, PinyinKey(0)      /*  */},
+/* o */{PinyinKey(138)   /* o */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* p */{PinyinKey(3120)  /* p */, PinyinKey(150)  /* ou */, PinyinKey(0)      /*  */},
+/* q */{PinyinKey(5280)  /* z */, PinyinKey(48)   /* ei */, PinyinKey(0)      /*  */},
+/* r */{PinyinKey(36)    /* e */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* s */{PinyinKey(3840)  /* s */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* t */{PinyinKey(4320)  /* t */, PinyinKey(24)  /* ang */, PinyinKey(0)      /*  */},
+/* u */{PinyinKey(216)   /* v */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* v */{PinyinKey(1440)  /* g */, PinyinKey(3360)  /* q */, PinyinKey(0)      /*  */},
+/* w */{PinyinKey(480)   /* c */, PinyinKey(42)   /* eh */, PinyinKey(0)      /*  */},
+/* x */{PinyinKey(156)   /* u */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* y */{PinyinKey(720)  /* ch */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* z */{PinyinKey(30)   /* ao */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* { */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+/* | */{PinyinKey(0)      /*  */, PinyinKey(0)      /*  */, PinyinKey(0)      /*  */},
+};
+
diff --git a/src/training/Makefile.am b/src/training/Makefile.am
new file mode 100644 (file)
index 0000000..520e4e1
--- /dev/null
@@ -0,0 +1,36 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+## Makefile.in is generated from this file; it is removed by "make maintainer-clean".
+MAINTAINERCLEANFILES    = Makefile.in
+
+## Header search path: the shared headers, the storage sources, and GLib
+## (GLIB2_CPPFLAGS is substituted at configure time).
+INCLUDES               = -I$(top_srcdir)/src/include \
+                         -I$(top_srcdir)/src/storage \
+                         @GLIB2_CPPFLAGS@
+
+## Training tools: built in-tree but never installed (noinst_ prefix).
+noinst_PROGRAMS                = gen_ngram gen_unigram estimate_interpolation
+
+## Each tool is a single source file linked against the storage library and GLib.
+gen_ngram_SOURCES      = gen_ngram.cpp
+
+gen_ngram_LDADD                = ../storage/libstorage.la @GLIB2_LDFLAGS@
+
+gen_unigram_SOURCES     = gen_unigram.cpp
+
+gen_unigram_LDADD       = ../storage/libstorage.la @GLIB2_LDFLAGS@
+
+estimate_interpolation_SOURCES = estimate_interpolation.cpp
+
+estimate_interpolation_LDADD = ../storage/libstorage.la @GLIB2_LDFLAGS@
diff --git a/src/training/estimate_interpolation.cpp b/src/training/estimate_interpolation.cpp
new file mode 100644 (file)
index 0000000..1a547bc
--- /dev/null
@@ -0,0 +1,151 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2008 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <math.h>
+#include <glib.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "phrase_index.h"
+#include "ngram.h"
+
+/**
+ * compute_interpolation:
+ * @deleted_bigram: successor counts for one history token taken from the
+ *                  "deleted" (held-out) bigram database; must not be NULL.
+ * @unigram: phrase index supplying unigram frequencies.
+ * @bigram: successor counts for the same history token from the training
+ *          bigram database, or NULL when the history has no entry there.
+ *
+ * Iteratively re-estimates the weight lambda that mixes the bigram and
+ * unigram estimates, P = lambda * P_bigram + (1 - lambda) * P_unigram.
+ * Each round computes, over every successor token found in @deleted_bigram,
+ *
+ *   next = sum( deleted_freq * lambda * P_bigram
+ *               / (lambda * P_bigram + (1 - lambda) * P_unigram) )
+ *          / total_deleted_freq
+ *
+ * starting from 0.6 and stopping once two successive values differ by at
+ * most 0.001.
+ *
+ * Returns: the last computed lambda.
+ */
+parameter_t compute_interpolation(SingleGram * deleted_bigram,
+                                 FacadePhraseIndex * unigram,
+                                 SingleGram * bigram){
+    /* Results of required calls are stored first and asserted afterwards, so
+     * the calls themselves survive a build with NDEBUG defined. */
+    bool success;
+    parameter_t lambda = 0, next_lambda = 0.6;
+    const parameter_t epsilon = 0.001;
+
+    PhraseIndexRange range;
+    range.m_range_begin = token_min;
+    range.m_range_end = token_max;
+
+    /* The successor list and the total count of the held-out table do not
+     * depend on lambda, so fetch them once instead of once per iteration. */
+    BigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItem));
+    deleted_bigram->search(&range, array);
+
+    guint32 table_num = 0;
+    success = deleted_bigram->get_total_freq(table_num);
+    assert(success);
+    /* table_num is the divisor of every iteration below. */
+    assert(0 != table_num);
+
+    while ( fabs(lambda - next_lambda) > epsilon){
+        lambda = next_lambda;
+        next_lambda = 0;
+
+        for ( guint i = 0; i < array->len; ++i){
+            BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+            //get the phrase token
+            phrase_token_t token = item->m_token;
+
+            guint32 deleted_freq = 0;
+            success = deleted_bigram->get_freq(token, deleted_freq);
+            assert(success);
+
+            /* lambda * P_bigram(token); zero when the training table has
+             * no count for this successor. */
+            parameter_t numerator = 0;
+            {
+                guint32 freq = 0;
+                parameter_t elem_poss = 0;
+                if ( bigram && bigram->get_freq(token, freq)){
+                    guint32 total_freq = 0;
+                    success = bigram->get_total_freq(total_freq);
+                    assert(success);
+                    assert(0 != total_freq);
+                    elem_poss = freq / (parameter_t) total_freq;
+                }
+                numerator = lambda * elem_poss;
+            }
+
+            /* (1 - lambda) * P_unigram(token). */
+            parameter_t part_of_denominator = 0;
+            {
+                parameter_t elem_poss = 0;
+                PhraseItem phrase_item;
+                if (unigram->get_phrase_item(token, phrase_item)){
+                    guint32 freq = phrase_item.get_unigram_frequency();
+                    guint32 total_freq = unigram->get_phrase_index_total_freq();
+                    elem_poss = freq / (parameter_t)total_freq;
+                }
+                part_of_denominator = ( 1 - lambda) * elem_poss;
+            }
+
+            /* Neither model gives the token any mass: it contributes nothing. */
+            if ( 0 == (numerator + part_of_denominator))
+                continue;
+
+            next_lambda += deleted_freq * (numerator / (numerator + part_of_denominator));
+        }
+        next_lambda /= table_num;
+    }
+
+    g_array_free(array, TRUE);
+
+    lambda = next_lambda;
+    return lambda;
+}
+    
+/*
+ * estimate_interpolation: for every history token stored in the deleted
+ * bigram database, estimate an interpolation weight with
+ * compute_interpolation(), print it, and finally print the average of all
+ * weights.  All input paths are fixed, relative to the working directory.
+ *
+ * Exits with 0 on success and 1 when there was nothing to average.
+ */
+int main(int argc, char * argv[]){
+    FacadePhraseIndex phrase_index;
+
+    //gb_char binary file
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, chunk);
+
+    //gbk_char binary file
+    chunk = new MemoryChunk;
+    chunk->load("../../data/gbk_char.bin");
+    phrase_index.load(2, chunk);
+
+    Bigram bigram;
+    bigram.attach("../../data/bigram.db", NULL);
+
+    Bigram deleted_bigram;
+    deleted_bigram.attach("../../data/deleted_bigram.db", NULL);
+
+    GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+    deleted_bigram.get_all_items(system_items, user_items);
+    assert(0 == user_items->len);
+    g_array_free(user_items, TRUE);
+
+    parameter_t lambda_sum = 0;
+    int lambda_count = 0;
+
+    for ( guint i = 0; i < system_items->len; ++i ){
+        phrase_token_t * token = &g_array_index(system_items, phrase_token_t, i);
+        SingleGram * system = NULL, * user = NULL;
+        bigram.load(*token, system, user);
+        assert(NULL == user);
+        SingleGram * deleted_system = NULL, * deleted_user = NULL;
+        deleted_bigram.load(*token, deleted_system, deleted_user);
+        assert(NULL == deleted_user);
+
+        /* compute_interpolation() dereferences the held-out table, so a
+         * token whose table could not be loaded is skipped. */
+        if ( NULL == deleted_system ){
+            delete system;
+            continue;
+        }
+
+        parameter_t lambda = compute_interpolation(deleted_system, &phrase_index, system);
+
+        printf("lambda:%f\n", lambda);
+
+        lambda_sum += lambda;
+        lambda_count ++;
+
+        /* system stays NULL when the training database has no entry. */
+        delete system;
+        delete deleted_system;
+    }
+
+    g_array_free(system_items, TRUE);
+
+    /* Without any estimate the average would be 0/0. */
+    if ( 0 == lambda_count ){
+        fprintf(stderr, "no bigram entries found, can't estimate lambda!\n");
+        return 1;
+    }
+
+    printf("average lambda:%f\n", (lambda_sum/lambda_count));
+    return 0;
+}
+
diff --git a/src/training/gen_ngram.cpp b/src/training/gen_ngram.cpp
new file mode 100644 (file)
index 0000000..4dfea78
--- /dev/null
@@ -0,0 +1,179 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "phrase_index.h"
+#include "ngram.h"
+
+
+/* Lookup table from phrase string to its phrase token.  Created in main()
+ * and filled by init_phrases(); keys are g_strdup()'ed copies released with
+ * g_free, values are tokens packed with GUINT_TO_POINTER. */
+static GHashTable * g_phrases;
+
+//read gb_char.table and gbk_char.table
+/*
+ * Loads one phrase table into g_phrases.
+ *
+ * Each record consists of four whitespace-separated fields, in this order:
+ * pinyin, phrase, token, frequency.  Only the phrase and its token are kept;
+ * the other two fields are parsed and discarded.  The pinyin and phrase
+ * fields are read with a width limit of 255 bytes to fit their buffers.
+ *
+ * Reading stops at end of file or at the first record that does not parse.
+ *
+ * Returns true when the whole stream was consumed cleanly, false when a
+ * malformed or truncated record or a read error ended the scan early.
+ */
+bool init_phrases(FILE * infile){
+    char pinyin[256];
+    char phrase[256];
+    /* Locals whose types match the conversion specifiers exactly. */
+    guint token = 0;
+    long freq = 0;
+    int fields;
+
+    /* Checking the conversion count (instead of feof) keeps the last record
+     * even when the file does not end with a newline. */
+    while ( (fields = fscanf(infile, "%255s %255s %u %ld",
+                             pinyin, phrase, &token, &freq)) == 4 ){
+        g_hash_table_insert(g_phrases, g_strdup(phrase),
+                            GUINT_TO_POINTER(token));
+    }
+
+    /* EOF before any field of the next record means a clean end of input. */
+    return EOF == fields && !ferror(infile);
+}
+
+/* Write the command line synopsis to stdout and terminate with status 1. */
+void print_help(){
+    fputs("gen_ngram [--skip-pi-gram-training] [--skip-unigram-training]\n", stdout);
+    fputs("          [--bigram-file <FILENAME>]\n", stdout);
+    exit(1);
+}
+
+/*
+ * gen_ngram: trains unigram and bigram frequencies from segmented text.
+ *
+ * Standard input is read one line at a time; each line is looked up as a
+ * phrase in the tables loaded from gb_char.table and gbk_char.table.  A line
+ * that is not a known phrase yields token 0, which also makes the following
+ * phrase count as the start of a sentence.
+ *
+ * For every known phrase the unigram frequency is increased by one (unless
+ * --skip-unigram-training is given) and the bigram (previous token, token)
+ * is increased by one in the user part of the bigram database.  A phrase
+ * without a predecessor is counted against sentence_start, unless
+ * --skip-pi-gram-training is given.
+ *
+ * On exit both phrase index files are rewritten with the new frequencies.
+ */
+int main(int argc, char * argv[]){
+    int i = 1;
+    bool train_pi_gram = true;
+    bool train_unigram = true;
+    const char * bigram_filename = "../../data/bigram.db";
+
+    setlocale(LC_ALL,"");
+    while ( i < argc ){
+        if ( strcmp("--help", argv[i] ) == 0){
+            print_help();
+        }else if ( strcmp("--skip-pi-gram-training", argv[i] ) == 0) {
+            train_pi_gram = false;
+        }else if ( strcmp("--skip-unigram-training", argv[i] ) == 0) {
+            train_unigram = false;
+        }else if ( strcmp("--bigram-file", argv[i] ) == 0){
+            if ( ++i >= argc )
+                print_help();
+            bigram_filename = argv[i];
+        }
+        ++i;
+    }
+
+    g_phrases = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
+    //init phrase lookup
+    FILE * gb_file = fopen("../../data/gb_char.table", "r");
+    if ( gb_file == NULL ){
+        fprintf(stderr, "can't open gb_char.table!\n");
+        exit(1);
+    }
+    init_phrases(gb_file);
+    fclose(gb_file);
+    FILE * gbk_file = fopen("../../data/gbk_char.table", "r");
+    if ( gbk_file == NULL ){
+        fprintf(stderr, "can't open gbk_char.table!\n");
+        exit(1);
+    }
+    init_phrases(gbk_file);
+    fclose(gbk_file);
+
+    FacadePhraseIndex phrase_index;
+
+    //gb_char binary file
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, chunk);
+
+    //gbk_char binary file
+    chunk = new MemoryChunk;
+    chunk->load("../../data/gbk_char.bin");
+    phrase_index.load(2, chunk);
+
+    Bigram bigram;
+    bigram.attach(NULL, bigram_filename);
+
+    /* getline() allocates and grows the buffer itself. */
+    char * linebuf = NULL;
+    size_t size = 0;
+    ssize_t length;
+    phrase_token_t last_token, cur_token = last_token = 0;
+    /* getline() returns -1 on end of file and on error, so compare with -1
+     * explicitly: a bare truth test would never terminate on a read error. */
+    while ( (length = getline(&linebuf, &size, stdin)) != -1 ){
+        /* Strip the line terminator only when there is one; the last line
+         * of the input may come without it and must still be processed. */
+        if ( length > 0 && '\n' == linebuf[length - 1] )
+            linebuf[length - 1] = '\0';
+
+        phrase_token_t token = 0;
+        gpointer orig_key = NULL, value = NULL;
+        if ( g_hash_table_lookup_extended(g_phrases, linebuf, &orig_key, &value) )
+            token = GPOINTER_TO_UINT(value);
+
+        last_token = cur_token;
+        cur_token = token;
+        if ( 0 == cur_token )
+            continue;
+
+        //training uni-gram
+        if ( train_unigram )
+            phrase_index.add_unigram_frequency(cur_token, 1);
+
+        /* A phrase without a predecessor is only trained as a
+         * sentence-start bigram, and only when that is enabled. */
+        if ( 0 == last_token && !train_pi_gram )
+            continue;
+        phrase_token_t history = last_token;
+        if ( 0 == history )
+            history = sentence_start;
+
+        SingleGram * system = NULL, * user = NULL;
+        bigram.load(history, system, user);
+        assert(NULL == system);
+        if ( NULL == user ){
+            user = new SingleGram;
+        }
+
+        /* Start from zero so a failed lookup counts as "not seen yet". */
+        guint32 freq = 0, total_freq = 0;
+        //increase freq
+        user->get_freq(cur_token, freq);
+        user->set_freq(cur_token, freq + 1);
+        //increase total freq
+        user->get_total_freq(total_freq);
+        user->set_total_freq(total_freq + 1);
+
+        bigram.store(history, user);
+        delete user;
+    }
+    free(linebuf);
+
+    /* Write both sub phrase indices back, then hand each freshly written
+     * chunk back to the index. */
+    MemoryChunk * new_chunk = new MemoryChunk;
+    phrase_index.store(1, new_chunk);
+    new_chunk->save("../../data/gb_char.bin");
+    phrase_index.load(1, new_chunk);
+
+    new_chunk = new MemoryChunk;
+    phrase_index.store(2, new_chunk);
+    new_chunk->save("../../data/gbk_char.bin");
+    phrase_index.load(2, new_chunk);
+
+    return 0;
+}
diff --git a/src/training/gen_unigram.cpp b/src/training/gen_unigram.cpp
new file mode 100644 (file)
index 0000000..7e76693
--- /dev/null
@@ -0,0 +1,65 @@
+/* 
+ *  novel-pinyin,
+ *  A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ *  Based On Markov Model.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "phrase_index.h"
+
+//increase all unigram frequency by one.
+
+/*
+ * gen_unigram: adds one to the unigram frequency of every token in a fixed
+ * range, then rewrites both phrase index files in place.
+ *
+ * The range starts at 16777217 (0x01000001) and the disabled second range at
+ * 33554433 (0x02000001); presumably the top byte of a token selects the sub
+ * phrase index (1 = gb_char, 2 = gbk_char) -- confirm in phrase_index.h.
+ */
+int main(int argc, char * argv[]){
+
+    FacadePhraseIndex phrase_index;
+
+    //sub phrase index 1: gb_char binary file
+    MemoryChunk * gb_chunk = new MemoryChunk;
+    gb_chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, gb_chunk);
+
+    //sub phrase index 2: gbk_char binary file
+    MemoryChunk * gbk_chunk = new MemoryChunk;
+    gbk_chunk->load("../../data/gbk_char.bin");
+    phrase_index.load(2, gbk_chunk);
+
+    //bump every token of the first range by one
+    size_t token = 16777217;
+    while ( token <= 16870566 ){
+        phrase_index.add_unigram_frequency(token, 1);
+        ++token;
+    }
+
+#if 0
+    for ( size_t i = 33554433; i <= 33570193 ; ++i){
+        phrase_index.add_unigram_frequency(i, 1);
+    }
+#endif
+
+    //write sub phrase index 1 back and re-attach the written chunk
+    MemoryChunk * gb_out = new MemoryChunk;
+    phrase_index.store(1, gb_out);
+    gb_out->save("../../data/gb_char.bin");
+    phrase_index.load(1, gb_out);
+
+    //same for sub phrase index 2
+    MemoryChunk * gbk_out = new MemoryChunk;
+    phrase_index.store(2, gbk_out);
+    gbk_out->save("../../data/gbk_char.bin");
+    phrase_index.load(2, gbk_out);
+
+    return 0;
+}
diff --git a/tests/Makefile.am b/tests/Makefile.am
new file mode 100644 (file)
index 0000000..f36e5f9
--- /dev/null
@@ -0,0 +1,25 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+## Apply automake's "gnu" strictness level to this directory.
+AUTOMAKE_OPTIONS       = gnu
+## Sub-directories that make recurses into, in the listed order.
+SUBDIRS                = include storage lookup
+
+## Removed by "make maintainer-clean".
+MAINTAINERCLEANFILES   = Makefile.in 
+
+## Removed by "make clean".
+CLEANFILES             = *.bak 
+
+## aclocal invocation used when regenerating; adds $(ac_aux_dir) to the
+## macro search path.
+ACLOCAL                        = aclocal -I $(ac_aux_dir)
diff --git a/tests/include/Makefile.am b/tests/include/Makefile.am
new file mode 100644 (file)
index 0000000..53bc089
--- /dev/null
@@ -0,0 +1,25 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+## Header search path for the test sources in this directory.
+INCLUDES                = -I$(top_srcdir)/src/include
+
+## Built by "make" but never installed.
+noinst_PROGRAMS          = test_memory_chunk
+
+test_memory_chunk_SOURCES    = test_memory_chunk.cpp
+
+## NOTE(review): the other test Makefiles in this commit link with
+## @GLIB2_LDFLAGS@ rather than @GLIB2_LIBS@ -- confirm which substitution
+## configure.ac actually defines.
+test_memory_chunk_LDADD      = ../../src/storage/libstorage.la @GLIB2_LIBS@ 
+
diff --git a/tests/include/test_memory_chunk.cpp b/tests/include/test_memory_chunk.cpp
new file mode 100755 (executable)
index 0000000..6282d93
--- /dev/null
@@ -0,0 +1,90 @@
+#include <stdio.h>
+#include <iostream>
+#include "memory_chunk.h"
+// Test Memory Chunk Functionality
+
+int main(int argc, char * argv[]){
+  MemoryChunk* chunk;
+  chunk = new MemoryChunk();
+  int i = 12;
+  chunk->set_content(0, &i, sizeof(int));
+
+  int * p = (int *)chunk->begin();
+  assert(chunk->size() == sizeof(int));
+  std::cout<<*p<<std::endl;
+  std::cout<<chunk->capacity()<<std::endl;
+  p = & i;
+  chunk->set_chunk(p, sizeof(int), NULL);
+  short t = 5;
+  chunk->set_content(sizeof(int), &t, sizeof(short));
+  assert( sizeof(int) + sizeof(short) == chunk->size());
+  std::cout<<chunk->capacity()<<std::endl;
+
+  p = (int *)chunk->begin();
+  short * p2 =(short *)(((char *) (chunk->begin())) + sizeof(int));
+  std::cout<<*p<<'\t'<<*p2<<std::endl;
+
+  chunk->set_content(sizeof(int) + sizeof(short), &t, sizeof(short));
+  
+  assert( sizeof(int) + (sizeof(short) << 1) == chunk->size());
+  std::cout<<chunk->capacity()<<std::endl;
+  p = (int *)chunk->begin();
+  p2 =(short *)(((char *) (chunk->begin())) + sizeof(int));
+  std::cout<<*p<<'\t'<<*p2<<'\t'<<*(p2 + 1)<<std::endl;
+
+  chunk->set_size(sizeof(int) + sizeof(short) *3);
+  p = (int *)chunk->begin();
+  p2 =(short *)(((char *) (chunk->begin())) + sizeof(int));
+
+  chunk->set_content(0, &i, sizeof(int));
+
+  *(p2+2) = 3;
+  std::cout<<*p<<'\t'<<*p2<<'\t'<<*(p2 + 1)<<'\t'<<*(p2+2)<<std::endl;
+
+  int m = 10;
+  chunk->set_chunk(&m, sizeof(int), NULL);
+  int n = 12;
+  chunk->insert_content(sizeof(int), &n, sizeof(int));
+  n = 11;
+  chunk->insert_content(sizeof(int), &n, sizeof(int));
+
+  int * p3 = (int *)chunk->begin();
+  std::cout<<*p3<<'\t'<<*(p3+1)<<'\t'<<*(p3+2)<<std::endl;
+       
+  chunk->remove_content(sizeof(int), sizeof(int));
+  std::cout<<*p3<<'\t'<<*(p3+1)<<std::endl;
+
+  int tmp;
+  assert(chunk->get_content(sizeof(int), &tmp, sizeof(int)));
+  std::cout<<tmp<<std::endl;
+  
+  
+  delete chunk;
+
+  const char * filename =  "/tmp/version";
+  const char * version = "0.2.0";
+
+  chunk =  new MemoryChunk;
+  bool retval = chunk->load(filename);
+  if ( !retval ){
+      std::cerr<<"can't find chunk"<<std::endl;
+  }else{
+      if ( memcmp(version, chunk->begin(), strlen(version) + 1) == 0){
+         std::cout<<"match"<<std::endl;
+      }
+
+  }
+
+  chunk->set_content(0, version, strlen(version) + 1);
+  chunk->save(filename);
+
+  retval = chunk->load(filename);
+  if ( !retval ){
+      std::cerr<<"can't find chunk"<<std::endl;
+  }
+  if ( memcmp(version, chunk->begin(), strlen(version) + 1) == 0){
+      std::cout<<"match"<<std::endl;
+  }
+
+  return 0;
+}
diff --git a/tests/lookup/Makefile.am b/tests/lookup/Makefile.am
new file mode 100644 (file)
index 0000000..ca863ce
--- /dev/null
@@ -0,0 +1,27 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+## Header search paths (project include, storage and lookup directories)
+## followed by the GLib preprocessor flags substituted by configure.
+INCLUDES                = -I$(top_srcdir)/src/include \
+                         -I$(top_srcdir)/src/storage \
+                         -I$(top_srcdir)/src/lookup \
+                         @GLIB2_CPPFLAGS@
+
+## Built by "make" but never installed.
+noinst_PROGRAMS          = test_simple_lookup 
+
+test_simple_lookup_SOURCES = test_simple_lookup.cpp
+
+## Links against both the storage and the lookup libtool libraries.
+test_simple_lookup_LDADD   = ../../src/storage/libstorage.la ../../src/lookup/liblookup.la @GLIB2_LDFLAGS@
diff --git a/tests/lookup/test_simple_lookup.cpp b/tests/lookup/test_simple_lookup.cpp
new file mode 100644 (file)
index 0000000..04f4dce
--- /dev/null
@@ -0,0 +1,108 @@
+#include <string.h>
+#include <stdio.h>
+#include <sys/time.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+#include "phrase_index.h"
+#include "ngram.h"
+#include "lookup.h"
+
+// Benchmark iteration count. Nothing in this file reads it: main() times
+// its lookups with a local count of its own.
+size_t bench_times = 1000;
+
+/**
+ * Returns the current wall-clock time in microseconds, reduced to 32 bits.
+ * The value wraps, so it is only meaningful as the start of a short
+ * interval handed to print_time().
+ */
+guint32 record_time ()
+{
+    timeval now;
+    gettimeofday (&now, NULL);
+
+    // Seconds scaled to microseconds in unsigned 32-bit arithmetic.
+    guint32 seconds_in_us = (guint32) now.tv_sec * 1000000;
+    return seconds_in_us + now.tv_usec;
+}
+
+/**
+ * Prints the time elapsed since old_time (a value from record_time())
+ * together with the per-operation cost and rate for `times` operations.
+ */
+void print_time (guint32 old_time, guint32 times)
+{
+    timeval tv;
+    gettimeofday (&tv, NULL);
+
+    guint32 wasted = (guint32) tv.tv_sec * 1000000 + tv.tv_usec - old_time;
+
+    // Both counters are unsigned 32-bit values, hence %u rather than %d.
+    printf("Spent %u us for %u operations, %f us/op, %f times/s.\n\n" , wasted , times , ((double) wasted)/times , times * 1000000.0/wasted );
+}
+
+
+int main( int argc, char * argv[]){
+
+    PinyinCustomSettings custom;
+    PinyinLargeTable largetable(&custom);
+
+    BitmapPinyinValidator validator;
+    validator.initialize(&largetable); 
+
+    MemoryChunk * new_chunk = new MemoryChunk;
+    new_chunk->load("../../data/pinyin_index.bin");
+    largetable.load(new_chunk);
+    
+    FacadePhraseIndex phrase_index;
+    new_chunk = new MemoryChunk;
+    new_chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, new_chunk);
+    new_chunk = new MemoryChunk;
+    new_chunk->load("../../data/gbk_char.bin");
+    phrase_index.load(2, new_chunk);
+
+    Bigram bigram;
+    bigram.attach("../../data/bigram.db", "/tmp/bigram.db");
+    
+    PinyinLookup pinyin_lookup(&custom, &largetable, &phrase_index, &bigram);
+    
+    char* linebuf = (char *)malloc ( 1024 * sizeof (char) );
+    size_t size = 1024;
+    while( getline(&linebuf, &size, stdin) ){
+        linebuf[strlen(linebuf)-1] = '\0';
+       if ( strcmp ( linebuf, "quit" ) == 0)
+           break;
+       
+       PinyinDefaultParser parser;
+       PinyinKeyVector keys;
+       PinyinKeyPosVector poses;
+
+       validator.initialize(&largetable);
+       
+       keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
+       poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
+       parser.parse(validator, keys, poses,linebuf);
+
+       if ( 0 == keys->len )
+           continue;
+       CandidateConstraints constraints = g_array_new(FALSE, FALSE, sizeof(lookup_constraint_t));
+
+       g_array_set_size(constraints, keys->len);
+       for ( size_t i = 0; i < constraints->len; ++i){
+           lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+           constraint->m_type = NO_CONSTRAINT;
+       }
+
+       MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+       
+       guint32 start_time = record_time();
+       size_t times = 100;
+       for ( size_t i = 0; i < times; ++i)
+           pinyin_lookup.get_best_match(keys, constraints, results);
+       print_time(start_time, times);
+       for ( size_t i = 0; i < results->len; ++i){
+           phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+           if ( NULL == *token)
+               continue;
+           printf("pos:%d,token:%d\t", i, *token);
+       }
+       printf("\n");
+       char * sentence = NULL;
+       pinyin_lookup.convert_to_utf8(results, sentence);
+       printf("%s\n", sentence);
+
+       g_array_free(keys, true);
+       g_array_free(poses, true);
+       g_free(sentence);
+    }
+    free(linebuf);
+}
diff --git a/tests/storage/Makefile.am b/tests/storage/Makefile.am
new file mode 100644 (file)
index 0000000..e38c690
--- /dev/null
@@ -0,0 +1,41 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+## Header search paths (project include and storage directories) followed
+## by the GLib preprocessor flags substituted by configure.
+INCLUDES                = -I$(top_srcdir)/src/include \
+                         -I$(top_srcdir)/src/storage \
+                         @GLIB2_CPPFLAGS@
+
+## Test programs built by "make" but never installed.
+noinst_PROGRAMS          = test_parser \
+                          test_pinyin_index \
+                          test_phrase_index \
+                          test_ngram
+
+## Each program below has a single source file and links against the
+## storage libtool library.
+test_parser_SOURCES    = test_parser.cpp
+
+test_parser_LDADD      = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@
+
+test_pinyin_index_SOURCES    = test_pinyin_index.cpp
+
+test_pinyin_index_LDADD      = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@
+
+test_phrase_index_SOURCES = test_phrase_index.cpp
+
+test_phrase_index_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@
+
+test_ngram_SOURCES     = test_ngram.cpp
+
+test_ngram_LDADD       = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@
diff --git a/tests/storage/test_ngram.cpp b/tests/storage/test_ngram.cpp
new file mode 100644 (file)
index 0000000..7bdb141
--- /dev/null
@@ -0,0 +1,126 @@
+#include <stdio.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "ngram.h"
+
+
+int main(int argc, char * argv[]){
+    SingleGram single_gram;
+    
+    const guint32 total_freq = 16;
+    assert(single_gram.set_total_freq(total_freq));
+    
+
+    phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3};
+    guint32 freqs[6] = { 1, 2, 4, 8, 16, 32};
+
+    for(int i = 0; i < 6 ;++i){
+       single_gram.set_freq(tokens[i], freqs[i]);
+    }
+
+    guint32 freq;
+    single_gram.get_freq(3, freq);
+    assert(freq == 32);
+
+    printf("--------------------------------------------------------\n");
+    PhraseIndexRange range;
+    BigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItem));
+    range.m_range_begin = 0; range.m_range_end = 8;
+    single_gram.search(&range,array);
+    for ( int i = 0; i < array->len; ++i){
+       BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+       printf("item:%d:%f\n", item->m_token, item->m_freq);
+    } 
+
+
+    assert(single_gram.get_total_freq(freq));
+    assert(freq == total_freq);
+
+
+    Bigram bigram;
+    assert(bigram.attach(NULL, "/tmp/system.db"));
+    bigram.store(1, &single_gram);
+    single_gram.set_freq(5, 8);
+    single_gram.set_total_freq(32);
+    
+    bigram.store(2, &single_gram);
+
+    printf("--------------------------------------------------------\n");
+    SingleGram * system, * user;
+    bigram.load(1, system, user);
+    assert(NULL == system);
+    g_array_set_size(array, 0);
+    range.m_range_begin = 0; range.m_range_end = 8;
+    user->search(&range,array);
+    for ( int i = 0; i < array->len; ++i){
+       BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+       printf("item:%d:%f\n", item->m_token, item->m_freq);
+    } 
+    delete user;
+
+    printf("--------------------------------------------------------\n");
+    bigram.load(2, system, user);
+    assert(NULL == system);
+    g_array_set_size(array, 0);
+    range.m_range_begin = 0; range.m_range_end = 8;
+    user->search(&range,array);
+    for ( int i = 0; i < array->len; ++i){
+       BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+       printf("item:%d:%f\n", item->m_token, item->m_freq);
+    } 
+    delete user;
+    
+    bigram.attach("/tmp/system.db", NULL);
+    printf("--------------------------------------------------------\n");
+    bigram.load(1, system, user);
+    assert(NULL == user);
+    g_array_set_size(array, 0);
+    range.m_range_begin = 0; range.m_range_end = 8;
+    system->search(&range,array);
+    for ( int i = 0; i < array->len; ++i){
+       BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+       printf("item:%d:%f\n", item->m_token, item->m_freq);
+    } 
+    delete system;
+    
+    printf("--------------------------------------------------------\n");
+    bigram.load(2, system, user);
+    assert(NULL == user);
+    g_array_set_size(array, 0);
+    range.m_range_begin = 0; range.m_range_end = 8;
+    system->search(&range,array);
+    for ( int i = 0; i < array->len; ++i){
+       BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+       printf("item:%d:%f\n", item->m_token, item->m_freq);
+    }
+    delete system;
+
+    printf("--------------------------------------------------------\n");
+    single_gram.prune();
+    g_array_set_size(array, 0);
+    range.m_range_begin = 0; range.m_range_end = 8;
+    single_gram.search(&range,array);
+    for ( int i = 0; i < array->len; ++i){
+        BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+        printf("item:%d:%f\n", item->m_token, item->m_freq);
+    }
+    assert(single_gram.get_total_freq(freq));
+    printf("total_freq:%d\n", freq);
+
+    g_array_free(array, TRUE);
+
+    GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram.get_all_items(system_items, user_items);
+
+    printf("----------------------system----------------------------\n");
+    for ( int i = 0; i < system_items->len; ++i){
+       phrase_token_t * token = &g_array_index(system_items, phrase_token_t, i);
+       printf("item:%d\n", *token);
+    }
+    printf("-----------------------user-----------------------------\n");
+    for ( int i = 0; i < user_items->len; ++i){
+       phrase_token_t * token = &g_array_index(user_items, phrase_token_t, i);
+       printf("item:%d\n", *token);
+    }
+}
diff --git a/tests/storage/test_parser.cpp b/tests/storage/test_parser.cpp
new file mode 100644 (file)
index 0000000..ba5bfb8
--- /dev/null
@@ -0,0 +1,165 @@
+/*
+ * libpinyin
+ * 
+ * Copyright (c) 2006 James Su <suzhe@tsinghua.org.cn>
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA  02111-1307  USA
+ *
+ * $Id$
+ *
+ */
+
+#include <string.h>
+#include <iostream>
+#include "pinyin_base.h"
+
+// Shorthand used by main() when comparing command-line options.
+typedef std::string String;
+
+
+// Usage text printed by main() for -h / --help.
+// NOTE(review): the -p handling in main() matches only "sp", "sp-default"
+// and the sp-* names; none of the zy-* names listed here are matched
+// there -- confirm whether they are meant to be supported.
+static const char *help_msg =
+    "Too few argument!\n"
+    "Usage:\n"
+    "  test-parser [options]\n\n"
+    "  -i            Use incomplete pinyin.\n"
+    "  -f table      Use specified pinyin table file.\n"
+    "  -p parser     Use specified parser instead of Default.\n"
+    "                parser could be:\n"
+    "                sp-stone\n"
+    "                sp-zrm\n"
+    "                sp-ms\n"
+    "                sp-ziguang\n"
+    "                sp-abc\n"
+    "                sp-liushi\n"
+    "                zy-zhuyin\n"
+    "                zy-standard\n"
+    "                zy-hsu\n"
+    "                zy-ibm\n"
+    "                zy-gin-yieh\n"
+    "                zy-et\n"
+    "                zy-et26\n";
+
+int main (int argc, char * argv [])
+{
+    NullPinyinValidator validator;
+    PinyinKeyVector keys;
+    PinyinKeyPosVector poses;
+    PinyinCustomSettings custom;
+    PinyinParser *parser = 0;
+    //PinyinTable table;
+    const char *tablefile = "../data/pinyin-table.txt";
+
+    keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
+    poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
+
+    int i = 0;
+    while (i<argc) {
+        if (++i >= argc) break;
+
+        if (String ("-h") == argv [i] || String ("--help") == argv [i]) {
+            std::cout << help_msg;
+            return 0;
+        }
+
+        if (String ("-i") == argv [i]) {
+            custom.set_use_incomplete (true);
+            continue;
+        }
+
+        if (String ("-p") == argv [i]) {
+            if (++i >= argc) {
+                std::cerr << "No argument for option " << argv [i-1] << "\n";
+                return -1;
+            }
+            if (!strcmp (argv[i], "sp") || !strcmp (argv[i], "sp-default"))
+                parser = new PinyinShuangPinParser ();
+            else if (!strcmp (argv[i], "sp-stone"))
+                parser = new PinyinShuangPinParser (SHUANG_PIN_STONE);
+            else if (!strcmp (argv[i], "sp-zrm"))
+                parser = new PinyinShuangPinParser (SHUANG_PIN_ZRM);
+            else if (!strcmp (argv[i], "sp-ms"))
+                parser = new PinyinShuangPinParser (SHUANG_PIN_MS);
+            else if (!strcmp (argv[i], "sp-ziguang"))
+                parser = new PinyinShuangPinParser (SHUANG_PIN_ZIGUANG);
+            else if (!strcmp (argv[i], "sp-abc"))
+                parser = new PinyinShuangPinParser (SHUANG_PIN_ABC);
+            else if (!strcmp (argv[i], "sp-liushi"))
+                parser = new PinyinShuangPinParser (SHUANG_PIN_LIUSHI);
+            continue;
+        }
+
+        if (String ("-f") == argv [i]) {
+            if (++i >= argc) {
+                std::cerr << "No argument for option " << argv [i-1] << "\n";
+                return -1;
+            }
+            tablefile = argv [i];
+            continue;
+        }
+
+        std::cerr << "Invalid option: " << argv [i] << "\n";
+        return -1;
+    };
+
+    if (!parser) parser = new PinyinDefaultParser ();
+
+/*
+    if (!table.load (tablefile)) {
+        std::cerr << "Failed to load tablefile: " << tablefile << "\n";
+        return -1;
+    }
+*/
+    //table.update_custom_settings (custom);
+
+
+    char buf[1024];
+
+    while (1) {
+        std::cout << "Input:" << std::flush;
+        std::cin.getline (buf, 1023, '\n');
+
+        if (strncmp (buf, "quit", 4) == 0) break;
+
+        int len = parser->parse (validator, keys, poses,(const char *) buf);
+
+        std::cout << "Parsed " << len << " chars, " << keys->len << " keys:\n";
+
+        for (size_t i=0; i < keys->len; ++i){
+           PinyinKey * key = &g_array_index(keys, PinyinKey, i);
+            std::cout << key->get_key_string () << " ";
+       }
+
+       std::cout << std::endl;
+
+       for ( size_t i=0; i < poses->len; ++i){
+           PinyinKeyPos * pos = &g_array_index(poses, PinyinKeyPos, i);
+           std::cout << pos->get_pos() << " " << pos->get_length()<<" ";
+       }
+
+        std::cout << std::endl;
+
+        for (size_t i=0; i < keys->len; ++i){
+           PinyinKey * key = &g_array_index(keys, PinyinKey, i);
+            std::cout <<  key->get_key_zhuyin_string () << " ";
+       }
+
+        std::cout << std::endl;
+    }
+}
+
+/*
+vi:ts=4:nowrap:ai:expandtab
+*/
diff --git a/tests/storage/test_phrase_index.cpp b/tests/storage/test_phrase_index.cpp
new file mode 100644 (file)
index 0000000..d858ae2
--- /dev/null
@@ -0,0 +1,141 @@
+#include <stdio.h>
+#include <sys/time.h>
+#include <glib.h>
+#include "memory_chunk.h"
+#include "pinyin_base.h"
+#include "phrase_index.h"
+
+// Number of get_phrase_item() iterations timed in main().
+size_t bench_times = 100000;
+
+/**
+ * Returns the current wall-clock time in microseconds, reduced to 32 bits.
+ * The value wraps, so it is only meaningful as the start of a short
+ * interval handed to print_time().
+ */
+guint32 record_time ()
+{
+    timeval now;
+    gettimeofday (&now, NULL);
+
+    // Seconds scaled to microseconds in unsigned 32-bit arithmetic.
+    guint32 seconds_in_us = (guint32) now.tv_sec * 1000000;
+    return seconds_in_us + now.tv_usec;
+}
+
+/**
+ * Prints the time elapsed since old_time (a value from record_time())
+ * together with the per-operation cost and rate for `times` operations.
+ */
+void print_time (guint32 old_time, guint32 times)
+{
+    timeval tv;
+    gettimeofday (&tv, NULL);
+
+    guint32 wasted = (guint32) tv.tv_sec * 1000000 + tv.tv_usec - old_time;
+
+    // Both counters are unsigned 32-bit values, hence %u rather than %d.
+    printf("Spent %u us for %u operations, %f us/op, %f times/s.\n\n" , wasted , times , ((double) wasted)/times , times * 1000000.0/wasted );
+}
+
+
+int main(int argc, char * argv[]){
+    PhraseItem phrase_item;
+    utf16_t string1 = 2;
+    PinyinKey key1 = PinyinKey((PinyinInitial)3,(PinyinFinal)3,(PinyinTone)3);
+    PinyinKey key2 = PinyinKey((PinyinInitial)4,(PinyinFinal)4,(PinyinTone)4);
+
+
+    phrase_item.set_phrase_string(1, &string1);
+    phrase_item.append_pronunciation(&key1, 100);
+    phrase_item.append_pronunciation(&key2, 300);
+
+    assert(phrase_item.get_phrase_length() == 1);
+
+    PinyinKey key3;
+    guint32 freq;
+    phrase_item.get_nth_pronunciation(0, &key3, freq);
+    assert(key3 == key1);
+    assert(freq == 100);
+    phrase_item.get_nth_pronunciation(1, &key3, freq);
+    assert(key3 == key2);
+    assert(freq == 300);
+
+    PinyinCustomSettings custom;
+    gfloat poss = phrase_item.get_pinyin_possibility(custom, &key1);
+    printf("pinyin possiblitiy:%f\n", poss);
+
+    assert(phrase_item.get_unigram_frequency() == 0);
+
+    utf16_t string2;
+    phrase_item.get_phrase_string(&string2);
+    assert(string1 == string2);
+
+    FacadePhraseIndex phrase_index;
+    assert(phrase_index.add_phrase_item(1, &phrase_item));
+
+    MemoryChunk* chunk = new MemoryChunk;
+    assert(phrase_index.store(0, chunk));
+    assert(phrase_index.load(0, chunk));
+
+    PhraseItem item2;
+    guint32 time = record_time();
+    for ( int i = 0; i < bench_times; ++i){
+       phrase_index.get_phrase_item(1, item2);
+       assert(item2.get_unigram_frequency() == 0);
+       assert(item2.get_n_pronunciation() == 2);
+       assert(item2.get_phrase_length() == 1);
+       assert(item2.get_pinyin_possibility(custom, &key2) == 0.75);
+    }
+    print_time(time, bench_times);
+
+    {
+    PhraseItem item3;
+    phrase_index.get_phrase_item(1, item3);
+    item3.increase_pinyin_possibility(custom, &key1, 200);
+    assert(item3.get_pinyin_possibility(custom, &key1) == 0.5) ;
+    }
+
+    {
+    PhraseItem item5;
+    phrase_index.get_phrase_item(1, item5);
+    gfloat poss = item5.get_pinyin_possibility(custom, &key1);
+    printf("pinyin poss:%f\n", poss);
+    assert(poss == 0.5);
+    }
+
+    FacadePhraseIndex phrase_index_load;
+
+    FILE* infile = fopen("../../data/gb_char.table", "r");
+    if ( NULL == infile ){
+       printf("open gb_char.table failed!\n");
+       exit(1);
+    }
+
+    phrase_index_load.load_text(1, infile);
+    fclose(infile);
+
+    infile = fopen("../../data/gbk_char.table", "r");
+    if ( NULL == infile ){
+       printf("open gbk_char.table failed!\n");
+       exit(1);
+    }
+
+    phrase_index_load.load_text(2, infile);
+    fclose(infile);
+
+    MemoryChunk* store1 = new MemoryChunk;
+    phrase_index_load.store(1, store1);
+    phrase_index_load.load(1, store1);
+
+    MemoryChunk* store2 = new MemoryChunk;
+    phrase_index_load.store(2, store2);
+    phrase_index_load.load(2, store2);
+
+    phrase_index_load.get_phrase_item(16870555, item2);
+    assert( item2.get_phrase_length() == 14);
+    assert( item2.get_n_pronunciation() == 1);
+
+    gunichar2 buf[1024];
+    item2.get_phrase_string(buf);
+    char * string = g_utf16_to_utf8( buf, 14, NULL, NULL, NULL);
+    printf("%s\n", string);
+    g_free(string);
+
+    guint32 delta = 3;
+    phrase_index_load.add_unigram_frequency(16870555, delta);
+    phrase_index_load.get_phrase_item(16870555, item2);
+    assert( item2.get_unigram_frequency() == 3);
+
+    phrase_index_load.get_phrase_item(16777222, item2);
+    assert(item2.get_phrase_length() == 1);
+    assert(item2.get_n_pronunciation() == 5);
+
+    return 0;
+}
diff --git a/tests/storage/test_pinyin_index.cpp b/tests/storage/test_pinyin_index.cpp
new file mode 100644 (file)
index 0000000..e79eb3b
--- /dev/null
@@ -0,0 +1,148 @@
+#include <string.h>
+#include <stdio.h>
+#include <sys/time.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+#include "phrase_index.h"
+
+// Number of largetable.search() calls timed for each input line in main().
+size_t bench_times = 1000;
+
+/**
+ * Returns the current wall-clock time in microseconds, reduced to 32 bits.
+ * The value wraps, so it is only meaningful as the start of a short
+ * interval handed to print_time().
+ */
+guint32 record_time ()
+{
+    timeval now;
+    gettimeofday (&now, NULL);
+
+    // Seconds scaled to microseconds in unsigned 32-bit arithmetic.
+    guint32 seconds_in_us = (guint32) now.tv_sec * 1000000;
+    return seconds_in_us + now.tv_usec;
+}
+
+/**
+ * Prints the time elapsed since old_time (a value from record_time())
+ * together with the per-operation cost and rate for `times` operations.
+ */
+void print_time (guint32 old_time, guint32 times)
+{
+    timeval tv;
+    gettimeofday (&tv, NULL);
+
+    guint32 wasted = (guint32) tv.tv_sec * 1000000 + tv.tv_usec - old_time;
+
+    // Both counters are unsigned 32-bit values, hence %u rather than %d.
+    printf("Spent %u us for %u operations, %f us/op, %f times/s.\n\n" , wasted , times , ((double) wasted)/times , times * 1000000.0/wasted );
+}
+
+
+int main( int argc, char * argv[]){
+
+    PinyinCustomSettings custom;
+    PinyinLargeTable largetable(&custom);
+
+    FILE * gbfile = fopen("../../data/gb_char.table", "r");
+    if ( gbfile == NULL) {
+       printf("open gb_char.table failed!");
+       return 1;
+    }
+    FILE * gbkfile = fopen("../../data/gbk_char.table","r");
+    if ( gbkfile == NULL) {
+       printf("open gb_char.table failed!");
+       return 1;
+    }
+    
+    largetable.load_text(gbfile);
+    fclose(gbfile);
+    largetable.load_text(gbkfile);
+    fclose(gbkfile);
+
+    FacadePhraseIndex phrase_index;
+
+    FILE* infile = fopen("../../data/gb_char.table", "r");
+    if ( NULL == infile ){
+       printf("open gb_char.table failed!\n");
+       exit(1);
+    }
+
+    phrase_index.load_text(1, infile);
+    fclose(infile);
+
+    infile = fopen("../../data/gbk_char.table", "r");
+    if ( NULL == infile ){
+       printf("open gbk_char.table failed!\n");
+       exit(1);
+    }
+
+    phrase_index.load_text(2, infile);
+    fclose(infile);
+
+    MemoryChunk* new_chunk = new MemoryChunk;
+    largetable.store(new_chunk);
+    largetable.load(new_chunk);
+    
+    char* linebuf = (char *)malloc ( 1024 * sizeof (char) );
+    size_t size = 1024;
+    while( getline(&linebuf, &size, stdin) ){
+        linebuf[strlen(linebuf)-1] = '\0';
+       if ( strcmp ( linebuf, "quit" ) == 0)
+           break;
+       
+       PinyinDefaultParser parser;
+       NullPinyinValidator validator;
+       PinyinKeyVector keys;
+       PinyinKeyPosVector poses;
+       
+       keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
+       poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
+       parser.parse(validator, keys, poses, linebuf);
+       
+       guint32 start = record_time();
+
+       PhraseIndexRanges ranges;
+       for( int i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
+           ranges[i] = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange));
+       }
+       for ( int i = 0 ; i < bench_times; ++i){
+           largetable.search(keys->len, (PinyinKey *)keys->data, ranges);
+       }
+       
+       for( int i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
+           GArray * range = ranges[i];
+           g_array_set_size( range, 0);
+       }
+       print_time(start, bench_times);
+
+       largetable.search(keys->len, (PinyinKey *)keys->data, ranges);
+       for( int i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
+           GArray * range = ranges[i];
+           if ( range ){
+               for (int k = 0; k < range->len; ++k){
+                   PhraseIndexRange* onerange = &g_array_index(range, PhraseIndexRange, k);
+                   printf("start:%ld\tend:%ld\n", onerange->m_range_begin, onerange->m_range_end); 
+                   PhraseItem item;
+                   for ( phrase_token_t token = onerange->m_range_begin; token != onerange->m_range_end; ++token){
+                       phrase_index.get_phrase_item( token, item);
+                       gunichar2 bufstr[1024];
+                       item.get_phrase_string(bufstr);
+                       char * string = g_utf16_to_utf8
+                           ( bufstr, item.get_phrase_length(), 
+                             NULL, NULL, NULL);
+                       printf("%s\t", string);
+                       g_free(string);
+                       PinyinKey pinyin_buffer[1024];
+                       size_t npron = item.get_n_pronunciation();
+                       guint32 freq;
+                       for ( size_t n = 0; n < npron; ++n){
+                           item.get_nth_pronunciation(n, pinyin_buffer, freq);
+                           for ( size_t o = 0; o < item.get_phrase_length(); ++o){
+                               printf("%s'", pinyin_buffer[o].get_key_string());
+                           }
+                           printf("\b \t %d", freq);
+                       }
+                       printf("\n");
+                   }
+               }
+               if ( range->len)
+                   printf("range items number:%d\n", range->len);
+           }
+           g_array_set_size( range, 0);
+       }
+
+       g_array_free(keys, TRUE);
+       g_array_free(poses, TRUE);
+    }
+    free(linebuf);
+}
diff --git a/utils/Makefile.am b/utils/Makefile.am
new file mode 100644 (file)
index 0000000..1f0d85d
--- /dev/null
@@ -0,0 +1,25 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+AUTOMAKE_OPTIONS       = gnu
+SUBDIRS                = storage
+
+MAINTAINERCLEANFILES   = Makefile.in 
+
+CLEANFILES             = *.bak 
+
+ACLOCAL                        = aclocal -I $(ac_aux_dir)
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am
new file mode 100644 (file)
index 0000000..9328174
--- /dev/null
@@ -0,0 +1,30 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+INCLUDES                = -I$(top_srcdir)/src/include \
+                         -I$(top_srcdir)/src/storage \
+                         @GLIB2_CPPFLAGS@
+
+noinst_PROGRAMS          = gen_pinyin_table gen_binary_files
+
+gen_pinyin_table_SOURCES    = gen_pinyin_table.cpp
+
+gen_pinyin_table_LDADD      = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@
+
+gen_binary_files_SOURCES    = gen_binary_files.cpp
+
+gen_binary_files_LDADD      = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@
diff --git a/utils/storage/gen_binary_files.cpp b/utils/storage/gen_binary_files.cpp
new file mode 100644 (file)
index 0000000..7386106
--- /dev/null
@@ -0,0 +1,68 @@
+#include <stdio.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+#include "phrase_index.h"
+
+int main(int argc, char * argv[]){
+    /* generate pinyin index*/
+    PinyinCustomSettings custom;
+    PinyinLargeTable largetable(&custom);
+
+    FILE * gbfile = fopen("../../data/gb_char.table", "r");
+    if ( gbfile == NULL) {
+       printf("open gb_char.table failed!");
+       return 1;
+    }
+    FILE * gbkfile = fopen("../../data/gbk_char.table","r");
+    if ( gbkfile == NULL) {
+       printf("open gb_char.table failed!");
+       return 1;
+    }
+    
+    largetable.load_text(gbfile);
+    fclose(gbfile);
+    largetable.load_text(gbkfile);
+    fclose(gbkfile);
+
+    MemoryChunk * new_chunk = new MemoryChunk;
+    largetable.store(new_chunk);
+    new_chunk->save("../../data/pinyin_index.bin");
+    largetable.load(new_chunk);
+    
+
+    /* generate phrase index*/
+    FacadePhraseIndex phrase_index;
+
+    FILE* infile = fopen("../../data/gb_char.table", "r");
+    if ( NULL == infile ){
+       printf("open gb_char.table failed!\n");
+       exit(1);
+    }
+
+    phrase_index.load_text(1, infile);
+    fclose(infile);
+
+    infile = fopen("../../data/gbk_char.table", "r");
+    if ( NULL == infile ){
+       printf("open gbk_char.table failed!\n");
+       exit(1);
+    }
+
+    phrase_index.load_text(2, infile);
+    fclose(infile);
+
+    new_chunk = new MemoryChunk;
+    phrase_index.store(1, new_chunk);
+    new_chunk->save("../../data/gb_char.bin");
+    phrase_index.load(1, new_chunk);
+
+    new_chunk = new MemoryChunk;
+    phrase_index.store(2, new_chunk);
+    new_chunk->save("../../data/gbk_char.bin");
+    phrase_index.load(2, new_chunk);
+    
+    return 0;
+}
diff --git a/utils/storage/gen_pinyin_table.cpp b/utils/storage/gen_pinyin_table.cpp
new file mode 100644 (file)
index 0000000..38e6a27
--- /dev/null
@@ -0,0 +1,248 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+
+
+GTree * g_pinyin_tree;                         /* phrase_item* key -> GArray of pinyin_and_freq_item; created in main(), filled by feed_line() */
+GArray * g_item_array[MAX_PHRASE_LENGTH + 1];  /* arrays of item indexed by phrase length; slots 1..MAX_PHRASE_LENGTH are created by store_in_item_array() */
+
+struct phrase_item{          /* A phrase as UCS-4 text; used as the key of g_pinyin_tree. */
+    size_t length;           /* number of characters in the phrase */
+    gunichar * uniphrase;    /* UCS-4 text obtained from g_utf8_to_ucs4() */
+};
+
+struct pinyin_and_freq_item{ /* One pronunciation of a phrase together with its frequency. */
+    GArray * pinyin;         /* array of PinyinKey; feed_line() requires one key per phrase character */
+    guint32 freq;            /* frequency; feed_line() adds to it when the same pronunciation is fed again */
+};
+
+struct item{                 /* One g_pinyin_tree entry copied into g_item_array for sorting and output. */
+    phrase_item * phrase;    /* the tree key */
+    GArray * pinyin_and_freq_array;       /* Array of pinyin_and_freq_item. */
+};
+
+void feed_file(const char * filename);   /* read "phrase pinyin freq" records from a file and pass each to feed_line() */
+
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq);   /* record one pronunciation of a phrase in g_pinyin_tree */
+
+void store_in_item_array();   /* copy every tree entry into g_item_array, bucketed by phrase length */
+
+void sort_item_array();   /* sort each bucket by the pinyin keys of the first pronunciation */
+
+void gen_phrase_file(const char * outfilename, int phrase_index);   /* write the sorted entries, with generated tokens, to outfilename */
+
+/* Write the command-line synopsis to stdout, then terminate with status 1. */
+void print_help(){
+    fputs("Usage: gen_pinyin_table -t <PHRASE_INDEX> "
+          "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n", stdout);
+    fputs("<OUTPUTFILE> the result output file\n", stdout);
+    fputs("<FILEi> input pinyin files\n", stdout);
+    fputs("<PHRASE_INDEX> phrase index identifier\n", stdout);
+    exit(1);
+}
+
+gint phrase_item_compare(gconstpointer a, gconstpointer b){
+    phrase_item * itema = (phrase_item *) a;
+    phrase_item * itemb = (phrase_item *) b;
+    if ( itema->length != itemb->length )
+       return itema->length - itemb->length;
+    else 
+       return memcmp(itema->uniphrase, itemb->uniphrase, 
+                     sizeof(gunichar) * itema->length);
+}
+
+int main(int argc, char * argv[]){  
+    char outfilename[1024]="temp.out";
+    int phrase_index = 0;
+    int i = 1;
+
+    g_pinyin_tree = g_tree_new(phrase_item_compare);
+
+    setlocale(LC_ALL,"");
+    while (  i < argc ){
+       if ( strcmp("--help", argv[i] ) == 0) {
+               print_help();
+       }else if ( strcmp("-t", argv[i] ) == 0){
+           if ( ++i >= argc )
+               print_help();
+           phrase_index = atoi(argv[i]);
+       }else if ( strcmp("-o", argv[i] ) == 0 ){
+           if ( ++i >= argc )
+               print_help();
+           strcpy( outfilename, argv[i]);
+       } else {
+           feed_file(argv[i]);
+       }
+       ++i;
+    }
+    
+    printf("nnodes: %d\n", g_tree_nnodes(g_pinyin_tree));
+
+    store_in_item_array();
+    sort_item_array();
+    gen_phrase_file(outfilename, phrase_index);
+
+    return 0;
+}
+
+
+void feed_file ( const char * filename){
+    char phrase[1024], pinyin[1024];
+    guint32 n_freq;
+    FILE * infile = fopen(filename, "r");
+    if ( NULL == infile ){
+        fprintf(stderr, "Can't open file %s.\n", filename);
+        exit(1);
+    }
+    while ( !feof(infile)){
+       fscanf(infile, "%s", phrase);
+       fscanf(infile, "%s", pinyin);
+       fscanf(infile, "%u", &n_freq);
+       if (feof(infile))
+               break;
+       feed_line(phrase, pinyin, n_freq);
+    }
+    fclose(infile);
+}
+
+void feed_line (const char * phrase, const char * pinyin, const guint32 freq){
+    phrase_item * new_phrase_ptr = (phrase_item *)
+       malloc( sizeof(phrase_item));     
+    new_phrase_ptr->length = g_utf8_strlen(phrase, -1);
+       /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
+        *      where is the code which I don't want to touch. :-)
+        */
+       if (new_phrase_ptr->length >= MAX_PHRASE_LENGTH ) {
+               printf("too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq);
+               free(new_phrase_ptr);
+               return;
+       }
+    new_phrase_ptr->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
+    
+    PinyinDefaultParser parser;
+    NullPinyinValidator validator;
+    PinyinKeyVector keys;
+    PinyinKeyPosVector poses;
+    
+    keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
+    poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
+    parser.parse(validator, keys, poses, pinyin);
+
+    GArray * array = (GArray *)g_tree_lookup(g_pinyin_tree, new_phrase_ptr);
+
+    pinyin_and_freq_item value_item;
+    value_item.pinyin = keys;
+    value_item.freq = freq;
+    
+    if(new_phrase_ptr->length != value_item.pinyin->len){
+       printf("error:phrase:%s\tpinyin:%s\n", phrase, pinyin);
+       return;
+    }
+
+    if ( array == NULL){
+       array = g_array_new(FALSE, TRUE, sizeof(pinyin_and_freq_item));
+       g_array_append_val(array, value_item);
+       g_tree_insert(g_pinyin_tree, new_phrase_ptr, array);
+       return;
+    }
+    bool found = false;
+    for ( int i = 0; i < array->len ; ++i){
+       pinyin_and_freq_item * old_value_item = &g_array_index(array, pinyin_and_freq_item, i);
+       int result = pinyin_exact_compare((PinyinKey *)value_item.pinyin->data, 
+                                         (PinyinKey *)old_value_item->pinyin->data , value_item.pinyin->len);
+       if ( result == 0 ){
+           printf("Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n", 
+                  phrase, pinyin, freq);
+           old_value_item->freq += freq;
+           found = true;
+       }
+    }
+
+    g_array_free(poses, TRUE);
+    
+    if ( !found ){
+       g_array_append_val(array, value_item);
+       g_tree_insert(g_pinyin_tree, new_phrase_ptr, array);
+    }else
+       g_array_free(keys, TRUE);
+
+    free(new_phrase_ptr);
+    //g_array_free(keys, TRUE);
+}
+
+/*
+ * g_tree_foreach() callback: append the (key, value) pair, wrapped in an
+ * item, to the g_item_array bucket selected by the phrase length.
+ * data is unused.  Returns FALSE so that the traversal continues.
+ */
+gboolean store_one_item (gpointer key, gpointer value, gpointer data){
+    phrase_item * phrase = (phrase_item *)key;
+    const int bucket = phrase->length;
+
+    item entry;
+    entry.phrase = phrase;
+    entry.pinyin_and_freq_array = (GArray *)value;
+
+    g_array_append_val(g_item_array[bucket], entry);
+    return FALSE;
+}
+
+/*
+ * Create one empty item array for each phrase length 1..MAX_PHRASE_LENGTH,
+ * then walk g_pinyin_tree and file every entry into the matching array.
+ */
+void store_in_item_array(){
+    int length = 1;
+    while ( length < MAX_PHRASE_LENGTH + 1 ){
+        g_item_array[length] = g_array_new(FALSE, TRUE, sizeof(item));
+        ++length;
+    }
+    g_tree_foreach(g_pinyin_tree, store_one_item, NULL);
+}
+
+/*
+ * Comparison callback for g_array_sort_with_data() over arrays of item.
+ * Two items are ordered by pinyin_exact_compare() on the keys of their
+ * first pronunciation; user_data points to an int holding the number of
+ * keys to compare.
+ */
+gint phrase_array_compare ( gconstpointer a, gconstpointer b, gpointer user_data){
+    item * lhs = (item *) a;
+    item * rhs = (item *) b;
+    int nkeys = *((int *) user_data);
+
+    pinyin_and_freq_item * lhs_first =
+        &g_array_index(lhs->pinyin_and_freq_array, pinyin_and_freq_item, 0);
+    pinyin_and_freq_item * rhs_first =
+        &g_array_index(rhs->pinyin_and_freq_array, pinyin_and_freq_item, 0);
+
+    PinyinKey * lhs_keys = (PinyinKey *) lhs_first->pinyin->data;
+    PinyinKey * rhs_keys = (PinyinKey *) rhs_first->pinyin->data;
+    return pinyin_exact_compare(lhs_keys, rhs_keys, nkeys);
+}
+
+/*
+ * Sort every per-length item array with phrase_array_compare(); the phrase
+ * length of the array is passed to the comparator as user data.
+ */
+void sort_item_array(){
+    int length = 1;
+    while ( length < MAX_PHRASE_LENGTH + 1 ){
+        GArray * bucket = g_item_array[length];
+        g_array_sort_with_data(bucket, phrase_array_compare, &length);
+        ++length;
+    }
+}
+
+void gen_phrase_file(const char * outfilename, int phrase_index){
+    FILE * outfile = fopen(outfilename, "w");
+    if (NULL == outfile ) {
+        fprintf(stderr, "Can't write file %s.\n", outfilename);
+        exit(1);
+    }
+    phrase_token_t token = 1;
+    char pinyin_buffer[4096];
+    //phrase length
+    for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
+       GArray * item_array = g_item_array[i];
+       //item array
+       for( int m = 0; m < item_array->len; ++m){
+           item* oneitem = & g_array_index(item_array, item, m);
+           phrase_item * phrase = oneitem->phrase;
+           GArray * pinyin_and_freqs = oneitem->pinyin_and_freq_array;
+           const char * phrase_buffer = g_ucs4_to_utf8(phrase->uniphrase,
+                                                phrase->length, 
+                                                NULL, NULL, NULL);
+           //each pinyin
+           for( int n = 0 ; n < pinyin_and_freqs->len; ++n){
+               pinyin_and_freq_item * pinyin_and_freq = &g_array_index(pinyin_and_freqs, pinyin_and_freq_item, n);
+               GArray * pinyin = pinyin_and_freq->pinyin;
+               PinyinKey * key = &g_array_index(pinyin, PinyinKey, 0);
+               strcpy(pinyin_buffer,key->get_key_string());
+               for (size_t k = 1; k < pinyin->len; ++k){
+                   strcat(pinyin_buffer, "'");
+                   PinyinKey * key = &g_array_index(pinyin, PinyinKey, k);
+                   strcat(pinyin_buffer, key->get_key_string ());
+               }
+               guint32 freq = pinyin_and_freq -> freq;
+               if ( freq < 3 ) 
+                   freq = 3;
+               fprintf( outfile, "%s\t%s\t%d\t%d\n", 
+                        pinyin_buffer, phrase_buffer, 
+                        PHRASE_INDEX_MAKE_TOKEN(phrase_index, token),
+                        freq);
+           }
+           token++;
+       }
+    }
+    fclose(outfile);
+}