1 // Copyright 2014 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "cld2_dynamic_data.h"
16 #include "cld2_dynamic_data_extractor.h"
17 #include "cld2_dynamic_data_loader.h" // for verifying the written data
18 #include "integral_types.h"
19 #include "cld2tablesummary.h"
20 #include "utf8statetable.h"
23 namespace CLD2DynamicDataExtractor {
// Sets the debug level for this namespace from |debug|.
// NOTE(review): only the signature is visible here. Presumably the body
// stores |debug| in the DEBUG flag that the fprintf diagnostics in this file
// test -- confirm against the full source.
25 void setDebug(int debug) {
// Works out how many bytes separate the current position of |f| from the
// absolute file offset |position|, and logs that count when DEBUG is set.
// NOTE(review): the statements that actually emit the padding and produce the
// return value are not visible here; presumably |pad| zero bytes are written
// so that the stream ends up at |position| -- confirm against the full source.
29 int advance(FILE* f, CLD2::uint32 position) {
// Distance from the current stream position to the requested one. ftell()
// returns a long (and -1L on failure), so the subtraction is narrowed to int
// here and a failed ftell() is not detected.
31 int pad = position - ftell(f);
32 if (DEBUG) fprintf(stdout, " Adding %d bytes of padding\n", pad);
// Writes |length| bytes starting at |data| to |f| as the chunk that the file
// header places at offset |startAt|.
// NOTE(review): a statement between the two debug prints is not visible here;
// presumably it calls advance(f, startAt) to pad the stream up to |startAt|
// before the data is written -- confirm against the full source.
39 void writeChunk(FILE *f, const void* data, CLD2::uint32 startAt, CLD2::uint32 length) {
40 if (DEBUG) fprintf(stdout, " Write chunk @%d, len=%d\n", startAt, length);
42 if (DEBUG) fprintf(stdout, " Writing %d bytes of data", length);;
// Raw byte copy; the return value of fwrite (items written) is not checked.
43 fwrite(data, 1, length, f);
// Serializes the scoring tables in |data| into the file named |fileName|.
//
// Steps visible in this function:
//   1. Collect the seven CLD2TableSummary pointers in a fixed order.
//   2. Fill a FileHeader and its TableHeaders with field values and chunk
//      lengths (initUtf8Headers, initDeltaHeaders, initTableHeaders), then
//      let alignAll assign every chunk's start offset.
//   3. Write the marker string and each header field to the file.
//   4. Write the data chunks at the offsets recorded in the header.
//
// |supplement| supplies lengthOf_kAvgDeltaOctaScore and is passed on to
// initTableHeaders.
//
// NOTE(review): the first line of each data-chunk call, the declaration of
// ZERO and the end of the function are not visible here. The three-line
// argument groups below are presumably the tails of writeChunk(outFile, ...)
// calls -- confirm against the full source.
46 void writeDataFile(const CLD2::ScoringTables* data,
47 const CLD2DynamicData::Supplement* supplement,
48 const char* fileName) {
49 // The order here is hardcoded and MUST NOT BE CHANGED, else you will de-sync
50 // with the reading code.
52 const int NUM_TABLES = 7;
53 const CLD2::CLD2TableSummary* tableSummaries[NUM_TABLES];
54 tableSummaries[0] = data->unigram_compat_obj;
55 tableSummaries[1] = data->deltabi_obj;
56 tableSummaries[2] = data->distinctbi_obj;
57 tableSummaries[3] = data->quadgram_obj;
58 tableSummaries[4] = data->quadgram_obj2;
59 tableSummaries[5] = data->deltaocta_obj;
60 tableSummaries[6] = data->distinctocta_obj;
// Build the header in memory first: lengths come from the init* helpers,
// start offsets and the total file size from alignAll.
62 CLD2DynamicData::TableHeader tableHeaders[NUM_TABLES];
63 CLD2DynamicData::FileHeader fileHeader;
64 fileHeader.numTablesEncoded = NUM_TABLES;
65 fileHeader.tableHeaders = tableHeaders;
66 initUtf8Headers(&fileHeader, data->unigram_obj);
67 initDeltaHeaders(&fileHeader, supplement->lengthOf_kAvgDeltaOctaScore);
68 initTableHeaders(tableSummaries, NUM_TABLES, supplement, tableHeaders);
69 alignAll(&fileHeader, 16); // Align all sections to 128-bit boundaries
71 // We are ready to rock.
// Copy the file marker into the header so it can be written as the first
// bytes of the file.
72 for (int x=0; x<CLD2DynamicData::DATA_FILE_MARKER_LENGTH; x++)
73 fileHeader.sanityString[x] = CLD2DynamicData::DATA_FILE_MARKER[x];
// NOTE(review): mode "w" opens the stream in text mode; on platforms that
// translate line endings this can alter binary data, so "wb" looks safer.
// The result is also not checked for NULL before the writes below.
74 FILE* outFile = fopen(fileName, "w");
75 fwrite(fileHeader.sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, outFile);
// Each header field is written as 4 raw bytes taken straight from memory
// (assumes every field is 4 bytes wide -- confirm in cld2_dynamic_data.h).
// The field order below is the on-disk header layout.
76 fwrite(&(fileHeader.totalFileSizeBytes), 4, 1, outFile);
77 fwrite(&(fileHeader.utf8PropObj_state0), 4, 1, outFile);
78 fwrite(&(fileHeader.utf8PropObj_state0_size), 4, 1, outFile);
79 fwrite(&(fileHeader.utf8PropObj_total_size), 4, 1, outFile);
80 fwrite(&(fileHeader.utf8PropObj_max_expand), 4, 1, outFile);
81 fwrite(&(fileHeader.utf8PropObj_entry_shift), 4, 1, outFile);
82 fwrite(&(fileHeader.utf8PropObj_bytes_per_entry), 4, 1, outFile);
83 fwrite(&(fileHeader.utf8PropObj_losub), 4, 1, outFile);
84 fwrite(&(fileHeader.utf8PropObj_hiadd), 4, 1, outFile);
85 fwrite(&(fileHeader.startOf_utf8PropObj_state_table), 4, 1, outFile);
86 fwrite(&(fileHeader.lengthOf_utf8PropObj_state_table), 4, 1, outFile);
87 fwrite(&(fileHeader.startOf_utf8PropObj_remap_base), 4, 1, outFile);
88 fwrite(&(fileHeader.lengthOf_utf8PropObj_remap_base), 4, 1, outFile);
89 fwrite(&(fileHeader.startOf_utf8PropObj_remap_string), 4, 1, outFile);
90 fwrite(&(fileHeader.lengthOf_utf8PropObj_remap_string), 4, 1, outFile);
91 fwrite(&(fileHeader.startOf_utf8PropObj_fast_state), 4, 1, outFile);
92 fwrite(&(fileHeader.lengthOf_utf8PropObj_fast_state), 4, 1, outFile);
93 fwrite(&(fileHeader.startOf_kAvgDeltaOctaScore), 4, 1, outFile);
94 fwrite(&(fileHeader.lengthOf_kAvgDeltaOctaScore), 4, 1, outFile);
95 fwrite(&(fileHeader.numTablesEncoded), 4, 1, outFile);
// One fixed-layout record per table follows the file-level fields.
96 for (int x=0; x<NUM_TABLES; x++) {
97 CLD2DynamicData::TableHeader& tHeader = fileHeader.tableHeaders[x];
98 fwrite(&(tHeader.kCLDTableSizeOne), 4, 1, outFile);
99 fwrite(&(tHeader.kCLDTableSize), 4, 1, outFile);
100 fwrite(&(tHeader.kCLDTableKeyMask), 4, 1, outFile);
101 fwrite(&(tHeader.kCLDTableBuildDate), 4, 1, outFile);
102 fwrite(&(tHeader.startOf_kCLDTable), 4, 1, outFile);
103 fwrite(&(tHeader.lengthOf_kCLDTable), 4, 1, outFile);
104 fwrite(&(tHeader.startOf_kCLDTableInd), 4, 1, outFile);
105 fwrite(&(tHeader.lengthOf_kCLDTableInd), 4, 1, outFile);
106 fwrite(&(tHeader.startOf_kRecognizedLangScripts), 4, 1, outFile);
107 fwrite(&(tHeader.lengthOf_kRecognizedLangScripts), 4, 1, outFile);
// UTF8 property object, state table chunk: (data pointer, start, length).
113 data->unigram_obj->state_table,
114 fileHeader.startOf_utf8PropObj_state_table,
115 fileHeader.lengthOf_utf8PropObj_state_table);
116 // FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure
// UTF8 property object, remap base chunk.
118 data->unigram_obj->remap_base,
119 fileHeader.startOf_utf8PropObj_remap_base,
120 fileHeader.lengthOf_utf8PropObj_remap_base);
// Remap string chunk: the recorded length includes the terminator, so the
// chunk is written one byte short and the zero byte is appended explicitly.
122 data->unigram_obj->remap_string,
123 fileHeader.startOf_utf8PropObj_remap_string,
124 fileHeader.lengthOf_utf8PropObj_remap_string - 1);
125 fwrite(&ZERO,1,1,outFile); // null terminator
// The fast state is optional: a start offset of 0 means there is none.
126 if (fileHeader.startOf_utf8PropObj_fast_state > 0) {
128 data->unigram_obj->fast_state,
129 fileHeader.startOf_utf8PropObj_fast_state,
130 fileHeader.lengthOf_utf8PropObj_fast_state - 1);
131 fwrite(&ZERO,1,1,outFile); // null terminator
134 // 2. kAvgDeltaOctaScore array
136 data->kExpectedScore,
137 fileHeader.startOf_kAvgDeltaOctaScore,
138 fileHeader.lengthOf_kAvgDeltaOctaScore);
// Per-table data, in the same order as the table headers written above.
141 for (int x=0; x<NUM_TABLES; x++) {
142 const CLD2::CLD2TableSummary* summary = tableSummaries[x];
143 CLD2DynamicData::TableHeader& tHeader = fileHeader.tableHeaders[x];
144 // NB: Safe to directly write IndirectProbBucket4 as it is just an alias for CLD2::uint32
147 tHeader.startOf_kCLDTable,
148 tHeader.lengthOf_kCLDTable);
// Indirect table chunk for this table.
150 summary->kCLDTableInd,
151 tHeader.startOf_kCLDTableInd,
152 tHeader.lengthOf_kCLDTableInd);
// Recognized-scripts string: written without its terminator, then the zero
// byte is appended explicitly (the recorded length covers both).
154 summary->kRecognizedLangScripts,
155 tHeader.startOf_kRecognizedLangScripts,
156 tHeader.lengthOf_kRecognizedLangScripts - 1);
157 fwrite(&ZERO,1,1,outFile); // null terminator
162 void initTableHeaders(const CLD2::CLD2TableSummary** summaries,
163 const int numSummaries,
164 const CLD2DynamicData::Supplement* supplement,
165 CLD2DynamicData::TableHeader* tableHeaders) {
166 // Important: As documented in the .h, we assume that the Supplement data
167 // structure contains exactly one entry in indirectTableSizes for each
168 // CLD2TableSummary, in the same order.
169 for (int x=0; x<numSummaries; x++) {
170 const CLD2::CLD2TableSummary* summary = summaries[x];
171 CLD2DynamicData::TableHeader& tableHeader = tableHeaders[x];
173 // Copy the primitive bits
174 tableHeader.kCLDTableSizeOne = summary->kCLDTableSizeOne;
175 tableHeader.kCLDTableSize = summary->kCLDTableSize;
176 tableHeader.kCLDTableKeyMask = summary->kCLDTableKeyMask;
177 tableHeader.kCLDTableBuildDate = summary->kCLDTableBuildDate;
179 // Calculate size information
180 CLD2::uint32 bytesPerBucket = sizeof(CLD2::IndirectProbBucket4);
181 CLD2::uint32 numBuckets = summary->kCLDTableSize;
182 CLD2::uint32 tableSizeBytes = bytesPerBucket * numBuckets;
183 CLD2::uint32 indirectTableSizeBytes =
184 supplement->indirectTableSizes[x] * sizeof(CLD2::uint32);
185 CLD2::uint32 recognizedScriptsSizeBytes =
186 strlen(summary->kRecognizedLangScripts) + 1; // note null terminator
188 // Place size information into header. We'll align on byte boundaries later.
189 tableHeader.lengthOf_kCLDTable = tableSizeBytes;
190 tableHeader.lengthOf_kCLDTableInd = indirectTableSizeBytes;
191 tableHeader.lengthOf_kRecognizedLangScripts =
192 recognizedScriptsSizeBytes; // null terminator counted above
196 // Assuming that all fields have been set in the specified header, re-align
197 // the starting positions of all data chunks to be aligned along 64-bit
198 // boundaries for maximum efficiency.
199 void alignAll(CLD2DynamicData::FileHeader* header, const int alignment) {
200 CLD2::uint32 totalPadding = 0;
201 if (DEBUG) { fprintf(stdout, "Align for %d bits.\n", (alignment*8)); }
202 CLD2::uint32 headerSize = CLD2DynamicData::calculateHeaderSize(
203 header->numTablesEncoded);
204 CLD2::uint32 offset = headerSize;
207 int stateTablePad = alignment - (offset % alignment);
208 if (stateTablePad == alignment) stateTablePad = 0;
209 totalPadding += stateTablePad;
210 if (DEBUG) { fprintf(stdout, "Alignment for stateTable adjusted by %d\n", stateTablePad); }
211 offset += stateTablePad;
212 header->startOf_utf8PropObj_state_table = offset;
213 offset += header->lengthOf_utf8PropObj_state_table;
217 int remapPad = alignment - (offset % alignment);
218 if (remapPad == alignment) remapPad = 0;
219 totalPadding += remapPad;
220 if (DEBUG) { fprintf(stdout, "Alignment for remap adjusted by %d\n", remapPad); }
222 header->startOf_utf8PropObj_remap_base = offset;
223 offset += header->lengthOf_utf8PropObj_remap_base;
227 int remapStringPad = alignment - (offset % alignment);
228 if (remapStringPad == alignment) remapStringPad = 0;
229 totalPadding += remapStringPad;
230 if (DEBUG) { fprintf(stdout, "Alignment for remapString adjusted by %d\n", remapStringPad); }
231 offset += remapStringPad;
232 header->startOf_utf8PropObj_remap_string = offset;
233 offset += header->lengthOf_utf8PropObj_remap_string; // null terminator already counted in initUtf8Headers
237 int fastStatePad = alignment - (offset % alignment);
238 if (fastStatePad == alignment) fastStatePad = 0;
239 totalPadding += fastStatePad;
240 if (DEBUG) { fprintf(stdout, "Alignment for fastState adjusted by %d\n", fastStatePad); }
241 offset += fastStatePad;
242 if (header->lengthOf_utf8PropObj_fast_state > 0) {
243 header->startOf_utf8PropObj_fast_state = offset;
244 offset += header->lengthOf_utf8PropObj_fast_state; // null terminator already counted in initUtf8Headers
246 header->startOf_utf8PropObj_fast_state = 0;
251 int deltaOctaPad = alignment - (offset % alignment);
252 if (deltaOctaPad == alignment) deltaOctaPad = 0;
253 totalPadding += deltaOctaPad;
254 if (DEBUG) { fprintf(stdout, "Alignment for deltaOctaScore adjusted by %d\n", deltaOctaPad); }
255 offset += deltaOctaPad;
256 header->startOf_kAvgDeltaOctaScore = offset;
257 offset += header->lengthOf_kAvgDeltaOctaScore;
260 for (int x=0; x<header->numTablesEncoded; x++) {
261 CLD2DynamicData::TableHeader& tableHeader = header->tableHeaders[x];
262 int tablePad = alignment - (offset % alignment);
263 if (tablePad == alignment) tablePad = 0;
264 totalPadding += tablePad;
265 if (DEBUG) { fprintf(stdout, "Alignment for table %d adjusted by %d\n", x, tablePad); }
267 tableHeader.startOf_kCLDTable = offset;
268 offset += tableHeader.lengthOf_kCLDTable;
270 int indirectPad = alignment - (offset % alignment);
271 if (indirectPad == alignment) indirectPad = 0;
272 totalPadding += indirectPad;
273 if (DEBUG) { fprintf(stdout, "Alignment for tableInd %d adjusted by %d\n", x, indirectPad); }
274 offset += indirectPad;
275 tableHeader.startOf_kCLDTableInd = offset;
276 offset += tableHeader.lengthOf_kCLDTableInd;
278 int scriptsPad = alignment - (offset % alignment);
279 if (scriptsPad == alignment) scriptsPad = 0;
280 totalPadding += scriptsPad;
281 if (DEBUG) { fprintf(stdout, "Alignment for scriptsPad %d adjusted by %d", x, scriptsPad); }
282 offset += scriptsPad;
283 tableHeader.startOf_kRecognizedLangScripts = offset;
284 offset += tableHeader.lengthOf_kRecognizedLangScripts; // null terminator already counted in initTableHeaders
287 // Now that we know exactly how much data we have written, store it in the
288 // header as a sanity check
289 header->totalFileSizeBytes = offset;
292 fprintf(stdout, "Data aligned.\n");
293 fprintf(stdout, "Header size: %d bytes\n", headerSize);
294 fprintf(stdout, "Data size: %d bytes\n", (offset - totalPadding));
295 fprintf(stdout, "Padding size: %d bytes\n", totalPadding);
296 fprintf(stdout, " cld_generated_CjkUni_obj: %d bytes\n", (
297 header->lengthOf_utf8PropObj_state_table +
298 header->lengthOf_utf8PropObj_remap_string +
299 header->lengthOf_utf8PropObj_fast_state));
300 fprintf(stdout, " kAvgDeltaOctaScore: %d bytes\n",
301 header->lengthOf_kAvgDeltaOctaScore);
302 fprintf(stdout, " kCjkCompat_obj: %d bytes\n", (
303 header->tableHeaders[0].lengthOf_kCLDTable +
304 header->tableHeaders[0].lengthOf_kCLDTableInd +
305 header->tableHeaders[0].lengthOf_kRecognizedLangScripts + 1));
306 fprintf(stdout, " kCjkDeltaBi_obj: %d bytes\n", (
307 header->tableHeaders[1].lengthOf_kCLDTable +
308 header->tableHeaders[1].lengthOf_kCLDTableInd +
309 header->tableHeaders[1].lengthOf_kRecognizedLangScripts + 1));
310 fprintf(stdout, " kDistinctBiTable_obj: %d bytes\n", (
311 header->tableHeaders[2].lengthOf_kCLDTable +
312 header->tableHeaders[2].lengthOf_kCLDTableInd +
313 header->tableHeaders[2].lengthOf_kRecognizedLangScripts + 1));
314 fprintf(stdout, " kQuad_obj: %d bytes\n", (
315 header->tableHeaders[3].lengthOf_kCLDTable +
316 header->tableHeaders[3].lengthOf_kCLDTableInd +
317 header->tableHeaders[3].lengthOf_kRecognizedLangScripts + 1));
318 fprintf(stdout, " kQuad_obj2: %d bytes\n", (
319 header->tableHeaders[4].lengthOf_kCLDTable +
320 header->tableHeaders[4].lengthOf_kCLDTableInd +
321 header->tableHeaders[4].lengthOf_kRecognizedLangScripts + 1));
322 fprintf(stdout, " kDeltaOcta_obj: %d bytes\n", (
323 header->tableHeaders[5].lengthOf_kCLDTable +
324 header->tableHeaders[5].lengthOf_kCLDTableInd +
325 header->tableHeaders[5].lengthOf_kRecognizedLangScripts + 1));
326 fprintf(stdout, " kDistinctOcta_obj: %d bytes\n", (
327 header->tableHeaders[6].lengthOf_kCLDTable +
328 header->tableHeaders[6].lengthOf_kCLDTableInd +
329 header->tableHeaders[6].lengthOf_kRecognizedLangScripts + 1));
333 void initDeltaHeaders(CLD2DynamicData::FileHeader* header, const CLD2::uint32 deltaLength) {
334 header->startOf_kAvgDeltaOctaScore = 0;
335 header->lengthOf_kAvgDeltaOctaScore = deltaLength;
338 void initUtf8Headers(CLD2DynamicData::FileHeader* header, const CLD2::UTF8PropObj* utf8Object) {
339 header->utf8PropObj_state0 = utf8Object->state0;
340 header->utf8PropObj_state0_size = utf8Object->state0_size;
341 header->utf8PropObj_total_size = utf8Object->total_size;
342 header->utf8PropObj_max_expand = utf8Object->max_expand;
343 header->utf8PropObj_entry_shift = utf8Object->entry_shift;
344 header->utf8PropObj_bytes_per_entry = utf8Object->bytes_per_entry;
345 header->utf8PropObj_losub = utf8Object->losub;
346 header->utf8PropObj_hiadd = utf8Object->hiadd;
347 header->lengthOf_utf8PropObj_state_table = utf8Object->total_size;
348 header->lengthOf_utf8PropObj_remap_base = sizeof(CLD2::RemapEntry); // TODO: Can this ever have more than one entry?
349 header->lengthOf_utf8PropObj_remap_string = strlen(
350 reinterpret_cast<const char*>(utf8Object->remap_string)) + 1; // note null terminator
351 if (utf8Object->fast_state == NULL) {
352 header->lengthOf_utf8PropObj_fast_state = 0; // not applicable
354 header->lengthOf_utf8PropObj_fast_state = strlen(
355 reinterpret_cast<const char*>(utf8Object->fast_state)) + 1; // note null terminator
358 } // End namespace CLD2DynamicDataExtractor