Upstream version 9.38.198.0
[platform/framework/web/crosswalk.git] / src / third_party / cld_2 / src / internal / cld2_dynamic_data_extractor.cc
1 // Copyright 2014 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "cld2_dynamic_data.h"
16 #include "cld2_dynamic_data_extractor.h"
17 #include "cld2_dynamic_data_loader.h" // for verifying the written data
18 #include "integral_types.h"
19 #include "cld2tablesummary.h"
20 #include "utf8statetable.h"
21
22 using namespace std;
23 namespace CLD2DynamicDataExtractor {
// File-local verbosity flag. Any non-zero value makes the functions in this
// file print progress details to stdout.
static int DEBUG = 0;

// Sets the verbosity flag; pass 0 to silence the diagnostic output.
void setDebug(int debug) {
  DEBUG = debug;
}
28
29 int advance(FILE* f, CLD2::uint32 position) {
30   const char ZERO = 0;
31   int pad = position - ftell(f);
32   if (DEBUG) fprintf(stdout, "  Adding %d bytes of padding\n", pad);
33   while (pad-- > 0) {
34     fwrite(&ZERO,1,1,f);
35   }
36   return pad;
37 }
38
39 void writeChunk(FILE *f, const void* data, CLD2::uint32 startAt, CLD2::uint32 length) {
40   if (DEBUG) fprintf(stdout, "  Write chunk @%d, len=%d\n", startAt, length);
41   advance(f, startAt);
42   if (DEBUG) fprintf(stdout, "  Writing %d bytes of data", length);;
43   fwrite(data, 1, length, f);
44 }
45
46 void writeDataFile(const CLD2::ScoringTables* data,
47                    const CLD2DynamicData::Supplement* supplement,
48                    const char* fileName) {
49   // The order here is hardcoded and MUST NOT BE CHANGED, else you will de-sync
50   // with the reading code.
51   const char ZERO = 0;
52   const int NUM_TABLES = 7;
53   const CLD2::CLD2TableSummary* tableSummaries[NUM_TABLES];
54   tableSummaries[0] = data->unigram_compat_obj;
55   tableSummaries[1] = data->deltabi_obj;
56   tableSummaries[2] = data->distinctbi_obj;
57   tableSummaries[3] = data->quadgram_obj;
58   tableSummaries[4] = data->quadgram_obj2;
59   tableSummaries[5] = data->deltaocta_obj;
60   tableSummaries[6] = data->distinctocta_obj;
61
62   CLD2DynamicData::TableHeader tableHeaders[NUM_TABLES];
63   CLD2DynamicData::FileHeader fileHeader;
64   fileHeader.numTablesEncoded = NUM_TABLES;
65   fileHeader.tableHeaders = tableHeaders;
66   initUtf8Headers(&fileHeader, data->unigram_obj);
67   initDeltaHeaders(&fileHeader, supplement->lengthOf_kAvgDeltaOctaScore);
68   initTableHeaders(tableSummaries, NUM_TABLES, supplement, tableHeaders);
69   alignAll(&fileHeader, 16); // Align all sections to 128-bit boundaries
70
71   // We are ready to rock.
72   for (int x=0; x<CLD2DynamicData::DATA_FILE_MARKER_LENGTH; x++)
73     fileHeader.sanityString[x] = CLD2DynamicData::DATA_FILE_MARKER[x];
74   FILE* outFile = fopen(fileName, "w");
75   fwrite(fileHeader.sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, outFile);
76   fwrite(&(fileHeader.totalFileSizeBytes), 4, 1, outFile);
77   fwrite(&(fileHeader.utf8PropObj_state0), 4, 1, outFile);
78   fwrite(&(fileHeader.utf8PropObj_state0_size), 4, 1, outFile);
79   fwrite(&(fileHeader.utf8PropObj_total_size), 4, 1, outFile);
80   fwrite(&(fileHeader.utf8PropObj_max_expand), 4, 1, outFile);
81   fwrite(&(fileHeader.utf8PropObj_entry_shift), 4, 1, outFile);
82   fwrite(&(fileHeader.utf8PropObj_bytes_per_entry), 4, 1, outFile);
83   fwrite(&(fileHeader.utf8PropObj_losub), 4, 1, outFile);
84   fwrite(&(fileHeader.utf8PropObj_hiadd), 4, 1, outFile);
85   fwrite(&(fileHeader.startOf_utf8PropObj_state_table), 4, 1, outFile);
86   fwrite(&(fileHeader.lengthOf_utf8PropObj_state_table), 4, 1, outFile);
87   fwrite(&(fileHeader.startOf_utf8PropObj_remap_base), 4, 1, outFile);
88   fwrite(&(fileHeader.lengthOf_utf8PropObj_remap_base), 4, 1, outFile);
89   fwrite(&(fileHeader.startOf_utf8PropObj_remap_string), 4, 1, outFile);
90   fwrite(&(fileHeader.lengthOf_utf8PropObj_remap_string), 4, 1, outFile);
91   fwrite(&(fileHeader.startOf_utf8PropObj_fast_state), 4, 1, outFile);
92   fwrite(&(fileHeader.lengthOf_utf8PropObj_fast_state), 4, 1, outFile);
93   fwrite(&(fileHeader.startOf_kAvgDeltaOctaScore), 4, 1, outFile);
94   fwrite(&(fileHeader.lengthOf_kAvgDeltaOctaScore), 4, 1, outFile);
95   fwrite(&(fileHeader.numTablesEncoded), 4, 1, outFile);
96   for (int x=0; x<NUM_TABLES; x++) {
97     CLD2DynamicData::TableHeader& tHeader = fileHeader.tableHeaders[x];
98     fwrite(&(tHeader.kCLDTableSizeOne), 4, 1, outFile);
99     fwrite(&(tHeader.kCLDTableSize), 4, 1, outFile);
100     fwrite(&(tHeader.kCLDTableKeyMask), 4, 1, outFile);
101     fwrite(&(tHeader.kCLDTableBuildDate), 4, 1, outFile);
102     fwrite(&(tHeader.startOf_kCLDTable), 4, 1, outFile);
103     fwrite(&(tHeader.lengthOf_kCLDTable), 4, 1, outFile);
104     fwrite(&(tHeader.startOf_kCLDTableInd), 4, 1, outFile);
105     fwrite(&(tHeader.lengthOf_kCLDTableInd), 4, 1, outFile);
106     fwrite(&(tHeader.startOf_kRecognizedLangScripts), 4, 1, outFile);
107     fwrite(&(tHeader.lengthOf_kRecognizedLangScripts), 4, 1, outFile);
108   }
109
110   // Write data blob
111   // 1. UTF8 Object
112   writeChunk(outFile,
113     data->unigram_obj->state_table,
114     fileHeader.startOf_utf8PropObj_state_table,
115     fileHeader.lengthOf_utf8PropObj_state_table);
116   // FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure
117   writeChunk(outFile,
118     data->unigram_obj->remap_base,
119     fileHeader.startOf_utf8PropObj_remap_base,
120     fileHeader.lengthOf_utf8PropObj_remap_base);
121   writeChunk(outFile,
122     data->unigram_obj->remap_string,
123     fileHeader.startOf_utf8PropObj_remap_string,
124     fileHeader.lengthOf_utf8PropObj_remap_string - 1);
125   fwrite(&ZERO,1,1,outFile); // null terminator
126   if (fileHeader.startOf_utf8PropObj_fast_state > 0) {
127     writeChunk(outFile,
128       data->unigram_obj->fast_state,
129       fileHeader.startOf_utf8PropObj_fast_state,
130       fileHeader.lengthOf_utf8PropObj_fast_state - 1);
131     fwrite(&ZERO,1,1,outFile); // null terminator
132   }
133
134   // 2. kAvgDeltaOctaScore array
135   writeChunk(outFile,
136     data->kExpectedScore,
137     fileHeader.startOf_kAvgDeltaOctaScore,
138     fileHeader.lengthOf_kAvgDeltaOctaScore);
139
140   // 3. Each table
141   for (int x=0; x<NUM_TABLES; x++) {
142     const CLD2::CLD2TableSummary* summary = tableSummaries[x];
143     CLD2DynamicData::TableHeader& tHeader = fileHeader.tableHeaders[x];
144     // NB: Safe to directly write IndirectProbBucket4 as it is just an alias for CLD2::uint32
145     writeChunk(outFile,
146       summary->kCLDTable,
147       tHeader.startOf_kCLDTable,
148       tHeader.lengthOf_kCLDTable);
149     writeChunk(outFile,
150       summary->kCLDTableInd,
151       tHeader.startOf_kCLDTableInd,
152       tHeader.lengthOf_kCLDTableInd);
153     writeChunk(outFile,
154       summary->kRecognizedLangScripts,
155       tHeader.startOf_kRecognizedLangScripts,
156       tHeader.lengthOf_kRecognizedLangScripts - 1);
157     fwrite(&ZERO,1,1,outFile); // null terminator
158   }
159   fclose(outFile);
160 }
161
162 void initTableHeaders(const CLD2::CLD2TableSummary** summaries,
163                       const int numSummaries,
164                       const CLD2DynamicData::Supplement* supplement,
165                       CLD2DynamicData::TableHeader* tableHeaders) {
166   // Important: As documented in the .h, we assume that the Supplement data
167   // structure contains exactly one entry in indirectTableSizes for each
168   // CLD2TableSummary, in the same order.
169   for (int x=0; x<numSummaries; x++) {
170     const CLD2::CLD2TableSummary* summary = summaries[x];
171     CLD2DynamicData::TableHeader& tableHeader = tableHeaders[x];
172
173     // Copy the primitive bits
174     tableHeader.kCLDTableSizeOne = summary->kCLDTableSizeOne;
175     tableHeader.kCLDTableSize = summary->kCLDTableSize;
176     tableHeader.kCLDTableKeyMask = summary->kCLDTableKeyMask;
177     tableHeader.kCLDTableBuildDate = summary->kCLDTableBuildDate;
178
179     // Calculate size information
180     CLD2::uint32 bytesPerBucket = sizeof(CLD2::IndirectProbBucket4);
181     CLD2::uint32 numBuckets = summary->kCLDTableSize;
182     CLD2::uint32 tableSizeBytes = bytesPerBucket * numBuckets;
183     CLD2::uint32 indirectTableSizeBytes =
184       supplement->indirectTableSizes[x] * sizeof(CLD2::uint32);
185     CLD2::uint32 recognizedScriptsSizeBytes =
186       strlen(summary->kRecognizedLangScripts) + 1; // note null terminator
187
188     // Place size information into header. We'll align on byte boundaries later.
189     tableHeader.lengthOf_kCLDTable = tableSizeBytes;
190     tableHeader.lengthOf_kCLDTableInd = indirectTableSizeBytes;
191     tableHeader.lengthOf_kRecognizedLangScripts =
192       recognizedScriptsSizeBytes; // null terminator counted above
193   }
194 }
195
196 // Assuming that all fields have been set in the specified header, re-align
197 // the starting positions of all data chunks to be aligned along 64-bit
198 // boundaries for maximum efficiency.
199 void alignAll(CLD2DynamicData::FileHeader* header, const int alignment) {
200   CLD2::uint32 totalPadding = 0;
201   if (DEBUG) { fprintf(stdout, "Align for %d bits.\n", (alignment*8)); }
202   CLD2::uint32 headerSize = CLD2DynamicData::calculateHeaderSize(
203     header->numTablesEncoded);
204   CLD2::uint32 offset = headerSize;
205
206   { // scoping block
207     int stateTablePad = alignment - (offset % alignment);
208     if (stateTablePad == alignment) stateTablePad = 0;
209     totalPadding += stateTablePad;
210     if (DEBUG) { fprintf(stdout, "Alignment for stateTable adjusted by %d\n", stateTablePad); }
211     offset += stateTablePad;
212     header->startOf_utf8PropObj_state_table = offset;
213     offset += header->lengthOf_utf8PropObj_state_table;
214   }
215
216   { // scoping block
217     int remapPad = alignment - (offset % alignment);
218     if (remapPad == alignment) remapPad = 0;
219     totalPadding += remapPad;
220     if (DEBUG) { fprintf(stdout, "Alignment for remap adjusted by %d\n", remapPad); }
221     offset += remapPad;
222     header->startOf_utf8PropObj_remap_base = offset;
223     offset += header->lengthOf_utf8PropObj_remap_base;
224   }
225
226   { // scoping block
227     int remapStringPad = alignment - (offset % alignment);
228     if (remapStringPad == alignment) remapStringPad = 0;
229     totalPadding += remapStringPad;
230     if (DEBUG) { fprintf(stdout, "Alignment for remapString adjusted by %d\n", remapStringPad); }
231     offset += remapStringPad;
232     header->startOf_utf8PropObj_remap_string = offset;
233     offset += header->lengthOf_utf8PropObj_remap_string; // null terminator already counted in initUtf8Headers
234   }
235
236   { // scoping block
237     int fastStatePad = alignment - (offset % alignment);
238     if (fastStatePad == alignment) fastStatePad = 0;
239     totalPadding += fastStatePad;
240     if (DEBUG) { fprintf(stdout, "Alignment for fastState adjusted by %d\n", fastStatePad); }
241     offset += fastStatePad;
242     if (header->lengthOf_utf8PropObj_fast_state > 0) {
243       header->startOf_utf8PropObj_fast_state = offset;
244       offset += header->lengthOf_utf8PropObj_fast_state; // null terminator already counted in initUtf8Headers
245     } else {
246       header->startOf_utf8PropObj_fast_state = 0;
247     }
248   }
249
250   { // scoping block
251     int deltaOctaPad = alignment - (offset % alignment);
252     if (deltaOctaPad == alignment) deltaOctaPad = 0;
253     totalPadding += deltaOctaPad;
254     if (DEBUG) { fprintf(stdout, "Alignment for deltaOctaScore adjusted by %d\n", deltaOctaPad); }
255     offset += deltaOctaPad;
256     header->startOf_kAvgDeltaOctaScore = offset;
257     offset += header->lengthOf_kAvgDeltaOctaScore;
258   }
259   
260   for (int x=0; x<header->numTablesEncoded; x++) {
261     CLD2DynamicData::TableHeader& tableHeader = header->tableHeaders[x];
262     int tablePad = alignment - (offset % alignment);
263     if (tablePad == alignment) tablePad = 0;
264     totalPadding += tablePad;
265     if (DEBUG) { fprintf(stdout, "Alignment for table %d adjusted by %d\n", x, tablePad); }
266     offset += tablePad;
267     tableHeader.startOf_kCLDTable = offset;
268     offset += tableHeader.lengthOf_kCLDTable;
269
270     int indirectPad = alignment - (offset % alignment);
271     if (indirectPad == alignment) indirectPad = 0;
272     totalPadding += indirectPad;
273     if (DEBUG) { fprintf(stdout, "Alignment for tableInd %d adjusted by %d\n", x, indirectPad); }
274     offset += indirectPad;
275     tableHeader.startOf_kCLDTableInd = offset;
276     offset += tableHeader.lengthOf_kCLDTableInd;
277
278     int scriptsPad = alignment - (offset % alignment);
279     if (scriptsPad == alignment) scriptsPad = 0;
280     totalPadding += scriptsPad;
281     if (DEBUG) { fprintf(stdout, "Alignment for scriptsPad %d adjusted by %d", x, scriptsPad); }
282     offset += scriptsPad;
283     tableHeader.startOf_kRecognizedLangScripts = offset;
284     offset += tableHeader.lengthOf_kRecognizedLangScripts; // null terminator already counted in initTableHeaders
285   }
286
287   // Now that we know exactly how much data we have written, store it in the
288   // header as a sanity check
289   header->totalFileSizeBytes = offset;
290
291   if (DEBUG) {
292     fprintf(stdout, "Data aligned.\n");
293     fprintf(stdout, "Header size:  %d bytes\n", headerSize);
294     fprintf(stdout, "Data size:    %d bytes\n", (offset - totalPadding));
295     fprintf(stdout, "Padding size: %d bytes\n", totalPadding);
296     fprintf(stdout, "  cld_generated_CjkUni_obj: %d bytes\n", (
297         header->lengthOf_utf8PropObj_state_table +
298         header->lengthOf_utf8PropObj_remap_string +
299         header->lengthOf_utf8PropObj_fast_state));
300     fprintf(stdout, "  kAvgDeltaOctaScore:       %d bytes\n",
301         header->lengthOf_kAvgDeltaOctaScore);
302     fprintf(stdout, "  kCjkCompat_obj:           %d bytes\n", (
303         header->tableHeaders[0].lengthOf_kCLDTable +
304         header->tableHeaders[0].lengthOf_kCLDTableInd +
305         header->tableHeaders[0].lengthOf_kRecognizedLangScripts + 1));
306     fprintf(stdout, "  kCjkDeltaBi_obj:          %d bytes\n", (
307         header->tableHeaders[1].lengthOf_kCLDTable +
308         header->tableHeaders[1].lengthOf_kCLDTableInd +
309         header->tableHeaders[1].lengthOf_kRecognizedLangScripts + 1));
310     fprintf(stdout, "  kDistinctBiTable_obj:     %d bytes\n", (
311         header->tableHeaders[2].lengthOf_kCLDTable +
312         header->tableHeaders[2].lengthOf_kCLDTableInd +
313         header->tableHeaders[2].lengthOf_kRecognizedLangScripts + 1));
314     fprintf(stdout, "  kQuad_obj:                %d bytes\n", (
315         header->tableHeaders[3].lengthOf_kCLDTable +
316         header->tableHeaders[3].lengthOf_kCLDTableInd +
317         header->tableHeaders[3].lengthOf_kRecognizedLangScripts + 1));
318     fprintf(stdout, "  kQuad_obj2:               %d bytes\n", (
319         header->tableHeaders[4].lengthOf_kCLDTable +
320         header->tableHeaders[4].lengthOf_kCLDTableInd +
321         header->tableHeaders[4].lengthOf_kRecognizedLangScripts + 1));
322     fprintf(stdout, "  kDeltaOcta_obj:           %d bytes\n", (
323         header->tableHeaders[5].lengthOf_kCLDTable +
324         header->tableHeaders[5].lengthOf_kCLDTableInd +
325         header->tableHeaders[5].lengthOf_kRecognizedLangScripts + 1));
326     fprintf(stdout, "  kDistinctOcta_obj:        %d bytes\n", (
327         header->tableHeaders[6].lengthOf_kCLDTable +
328         header->tableHeaders[6].lengthOf_kCLDTableInd +
329         header->tableHeaders[6].lengthOf_kRecognizedLangScripts + 1));
330   }
331 }
332
333 void initDeltaHeaders(CLD2DynamicData::FileHeader* header, const CLD2::uint32 deltaLength) {
334   header->startOf_kAvgDeltaOctaScore = 0;
335   header->lengthOf_kAvgDeltaOctaScore = deltaLength;
336 }
337
338 void initUtf8Headers(CLD2DynamicData::FileHeader* header, const CLD2::UTF8PropObj* utf8Object) {
339   header->utf8PropObj_state0 = utf8Object->state0;
340   header->utf8PropObj_state0_size = utf8Object->state0_size;
341   header->utf8PropObj_total_size = utf8Object->total_size;
342   header->utf8PropObj_max_expand = utf8Object->max_expand;
343   header->utf8PropObj_entry_shift = utf8Object->entry_shift;
344   header->utf8PropObj_bytes_per_entry = utf8Object->bytes_per_entry;
345   header->utf8PropObj_losub = utf8Object->losub;
346   header->utf8PropObj_hiadd = utf8Object->hiadd;
347   header->lengthOf_utf8PropObj_state_table = utf8Object->total_size;
348   header->lengthOf_utf8PropObj_remap_base = sizeof(CLD2::RemapEntry); // TODO: Can this ever have more than one entry?
349   header->lengthOf_utf8PropObj_remap_string = strlen(
350     reinterpret_cast<const char*>(utf8Object->remap_string)) + 1; // note null terminator
351   if (utf8Object->fast_state == NULL) {
352     header->lengthOf_utf8PropObj_fast_state = 0; // not applicable
353   } else {
354     header->lengthOf_utf8PropObj_fast_state = strlen(
355       reinterpret_cast<const char*>(utf8Object->fast_state)) + 1; // note null terminator
356   }
357 }
358 } // End namespace CLD2DynamicDataExtractor