2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
56 #include "../sim_fmerge.h"
58 #include "idngram_merge.h"
62 WriteOut(FILE* out, std::map<CSIM_Idngram<N>, unsigned int> & map)
64 typedef typename std::map<CSIM_Idngram<N>,
65 unsigned int>::iterator TMapIterator;
66 TMapIterator its = map.begin(), ite = map.end();
67 for (; its != ite; ++its) {
68 fwrite(its->first.ids, sizeof(TSIMWordId), N, out);
69 fwrite(&(its->second), sizeof(unsigned int), 1, out);
76 ProcessingRead(FILE *fp,
78 std::vector<long>& para_offsets,
81 typedef CSIM_Idngram<N> TNgram;
82 typedef typename std::map<CSIM_Idngram<N>, unsigned int> TMap;
87 TSIMWordId* ids = ngram.ids;
88 fread(ids, sizeof(TSIMWordId), N - 1, fp);
89 while (fread(ids + N - 1, sizeof(TSIMWordId), 1, fp) == 1) {
90 assert(map[ngram] < UINT_MAX);
92 if (map.size() >= paraMax) {
93 printf("."); fflush(stdout);
95 para_offsets.push_back(ftell(swap));
97 for (int i = 0; i < N - 1; ++i) ids[i] = ids[i + 1];
100 printf("."); fflush(stdout);
102 para_offsets.push_back(ftell(swap));
// Long-option table for getopt_long(): every option takes an argument and is
// reported as the short option letter in the last column. The all-zero entry
// terminates the table.
static struct option long_options[] =
{
    { "NMax", required_argument, NULL, 'n' },
    { "out",  required_argument, NULL, 'o' },
    { "swap", required_argument, NULL, 's' },
    { "para", required_argument, NULL, 'p' },
    { NULL,   0,                 NULL, 0   }
};

// Settings filled in by getParameters().
static int   N        = 0;      // n-gram order, option -n
static int   paraMax  = 0;      // max n-gram items per paragraph, option -p
static char* output   = NULL;   // result file name, option -o
static char* swapfile = NULL;   // temporary swap file name, option -s
/**
 * Print the command-line help (usage, description, input format, options and
 * an example invocation) to stdout, then terminate the process with exit
 * status 100.
 */
void
ShowUsage()
{
    // One literal, assembled by the compiler from the individual help lines.
    // None of them contains a '%', so fputs() emits exactly the same bytes as
    // printing each piece with printf() would.
    static const char help_text[] =
        "Usage:\n\tids2ngram options idsfile[ idsfile...]\n"
        "\nDescription\n"
        " This program generate idngram file, which is a sorted [id1,..idN,freq] array, from binary id stream files.\n"
        "\nInput:\n"
        "\tBinary id stream files looks like [id0,...,idX]\n"
        "\nOptions:\n"
        "\t -n N # N-gram\n"
        "\t -s swapfile # intermedia temporary file\n"
        "\t -o outputfile # result idngram file [id1, ... idN, freq]*\n"
        "\t -p para_size # maxium ngram-items per para\n"
        "\nExample:\n"
        " Following example will use three input idstream file idsfile[1,2,3] to generate the idngram file all.id3gram. Each para (internal map size or hash size) would be 1024000, using swap file for temp result. All temp para result would final be merged to got the final result.\n"
        "\tids2idngram -n 3 -s /tmp/swap -o all.id3gram -p 1024000 idsfile1 idsfile2 idsfile3\n\n";

    fputs(help_text, stdout);
    exit(100);
}
144 getParameters(int argc, char* const argv[])
146 int option_index = 0;
149 getopt_long(argc, argv, "p:n:s:o:", long_options,
150 &option_index)) != -1) {
153 N = atoi(strdup(optarg));
156 paraMax = atoi(strdup(optarg));
159 output = strdup(optarg);
162 swapfile = strdup(optarg);
168 if (N < 1 || N > 3 || paraMax < 1024 || output == NULL || swapfile == NULL)
172 static std::vector<long> para_offsets;
175 main(int argc, char* argv[])
177 getParameters(argc, argv);
178 FILE *swap = fopen(swapfile, "wb+");
179 FILE *out = fopen(output, "wb+");
180 if (optind >= argc) ShowUsage();
181 while (optind < argc) {
182 printf("Processing %s:", argv[optind]); fflush(stdout);
183 FILE *fp = fopen(argv[optind], "rb");
186 ProcessingRead<1>(fp, swap, para_offsets, paraMax);
189 ProcessingRead<2>(fp, swap, para_offsets, paraMax);
192 ProcessingRead<3>(fp, swap, para_offsets, paraMax);
196 printf("\n"); fflush(stdout);
199 printf("Merging..."); fflush(stdout);
202 ProcessingIdngramMerge<1>(swap, out, para_offsets);
205 ProcessingIdngramMerge<2>(swap, out, para_offsets);
208 ProcessingIdngramMerge<3>(swap, out, para_offsets);
211 printf("Done\n"); fflush(stdout);