2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
6 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
22 #include "cld2_dynamic_compat.h" // for win32/posix compatibility
23 #include "cld2_dynamic_data.h"
24 #include "cld2_dynamic_data_loader.h"
25 #include "integral_types.h"
26 #include "cld2tablesummary.h"
27 #include "utf8statetable.h"
28 #include "scoreonescriptspan.h"
30 namespace CLD2DynamicDataLoader {
33 CLD2DynamicData::FileHeader* loadHeaderFromFile(const char* fileName) {
34 FILE* inFile = fopen(fileName, "r");
38 return loadInternal(inFile, NULL, -1);
41 CLD2DynamicData::FileHeader* loadHeaderFromRaw(const void* basePointer,
42 const uint32_t length) {
43 return loadInternal(NULL, basePointer, length);
47 #define CLD2_READINT(field) \
49 bytesRead += 4 * fread(&(header->field), 4, 1, inFile);\
51 memcpy(&(header->field), (((char*)(basePointer)) + bytesRead), 4);\
54 CLD2DynamicData::FileHeader* loadInternal(FILE* inFile, const void* basePointer, const uint32_t length) {
55 const bool sourceIsFile = (inFile != NULL);
57 CLD2DynamicData::FileHeader* header = new CLD2DynamicData::FileHeader;
59 // TODO: force null-terminate char* strings for safety
61 bytesRead += fread(header->sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, inFile);
63 memcpy(header->sanityString, basePointer, CLD2DynamicData::DATA_FILE_MARKER_LENGTH);
64 bytesRead += CLD2DynamicData::DATA_FILE_MARKER_LENGTH;
67 if (!CLD2DynamicData::mem_compare(
69 CLD2DynamicData::DATA_FILE_MARKER,
70 CLD2DynamicData::DATA_FILE_MARKER_LENGTH)) {
71 fprintf(stderr, "Malformed header: bad file marker!\n");
76 CLD2_READINT(totalFileSizeBytes);
77 CLD2_READINT(utf8PropObj_state0);
78 CLD2_READINT(utf8PropObj_state0_size);
79 CLD2_READINT(utf8PropObj_total_size);
80 CLD2_READINT(utf8PropObj_max_expand);
81 CLD2_READINT(utf8PropObj_entry_shift);
82 CLD2_READINT(utf8PropObj_bytes_per_entry);
83 CLD2_READINT(utf8PropObj_losub);
84 CLD2_READINT(utf8PropObj_hiadd);
85 CLD2_READINT(startOf_utf8PropObj_state_table);
86 CLD2_READINT(lengthOf_utf8PropObj_state_table);
87 CLD2_READINT(startOf_utf8PropObj_remap_base);
88 CLD2_READINT(lengthOf_utf8PropObj_remap_base);
89 CLD2_READINT(startOf_utf8PropObj_remap_string);
90 CLD2_READINT(lengthOf_utf8PropObj_remap_string);
91 CLD2_READINT(startOf_utf8PropObj_fast_state);
92 CLD2_READINT(lengthOf_utf8PropObj_fast_state);
93 CLD2_READINT(startOf_kAvgDeltaOctaScore);
94 CLD2_READINT(lengthOf_kAvgDeltaOctaScore);
95 CLD2_READINT(numTablesEncoded);
97 CLD2DynamicData::TableHeader* tableHeaders = new CLD2DynamicData::TableHeader[header->numTablesEncoded];
98 header->tableHeaders = tableHeaders;
99 for (int x=0; x<header->numTablesEncoded; x++) {
100 CLD2DynamicData::TableHeader *header = &(tableHeaders[x]);
101 CLD2_READINT(kCLDTableSizeOne);
102 CLD2_READINT(kCLDTableSize);
103 CLD2_READINT(kCLDTableKeyMask);
104 CLD2_READINT(kCLDTableBuildDate);
105 CLD2_READINT(startOf_kCLDTable);
106 CLD2_READINT(lengthOf_kCLDTable);
107 CLD2_READINT(startOf_kCLDTableInd);
108 CLD2_READINT(lengthOf_kCLDTableInd);
109 CLD2_READINT(startOf_kRecognizedLangScripts);
110 CLD2_READINT(lengthOf_kRecognizedLangScripts);
113 // Confirm header size is correct.
114 int expectedHeaderSize = CLD2DynamicData::calculateHeaderSize(header->numTablesEncoded);
115 if (expectedHeaderSize != bytesRead) {
116 fprintf(stderr, "Header size mismatch! Expected %d, but read %d\n", expectedHeaderSize, bytesRead);
118 delete[] tableHeaders;
124 // Confirm file size is correct.
125 fseek(inFile, 0, SEEK_END);
126 actualSize = ftell(inFile);
132 if (actualSize != header->totalFileSizeBytes) {
133 fprintf(stderr, "File size mismatch! Expected %d, but found %d\n", header->totalFileSizeBytes, actualSize);
135 delete[] tableHeaders;
141 void unloadDataFile(CLD2::ScoringTables** scoringTables,
142 void** mmapAddress, uint32_t* mmapLength) {
144 // See https://code.google.com/p/cld2/issues/detail?id=20
145 fprintf(stderr, "dynamic data unloading from file is not currently supported on win32, use raw mode instead.");
147 #else // i.e., is POSIX (no support for Mac prior to OSX)
148 CLD2DynamicDataLoader::unloadDataRaw(scoringTables);
149 munmap(*mmapAddress, *mmapLength);
152 #endif // ifdef _WIN32
155 void unloadDataRaw(CLD2::ScoringTables** scoringTables) {
156 free(const_cast<CLD2::UTF8PropObj*>((*scoringTables)->unigram_obj));
157 (*scoringTables)->unigram_obj = NULL;
158 delete((*scoringTables)->unigram_compat_obj); // tableSummaries[0] from loadDataFile
159 (*scoringTables)->unigram_compat_obj = NULL;
160 delete(*scoringTables);
161 *scoringTables = NULL;
164 CLD2::ScoringTables* loadDataFile(const char* fileName,
165 void** mmapAddressOut, uint32_t* mmapLengthOut) {
168 // See https://code.google.com/p/cld2/issues/detail?id=20
169 fprintf(stderr, "dynamic data loading from file is not currently supported on win32, use raw mode instead.");
171 #else // i.e., is POSIX (no support for Mac prior to OSX)
172 CLD2DynamicData::FileHeader* header = loadHeaderFromFile(fileName);
173 if (header == NULL) {
177 // Initialize the memory map
178 int inFileHandle = OPEN(fileName, O_RDONLY);
179 void* mapped = mmap(NULL, header->totalFileSizeBytes,
180 PROT_READ, MAP_PRIVATE, inFileHandle, 0);
181 // Record the map address. This allows callers to unmap
182 *mmapAddressOut=mapped;
183 *mmapLengthOut=header->totalFileSizeBytes;
186 return loadDataInternal(header, mapped, header->totalFileSizeBytes);
187 #endif // ifdef _WIN32
190 CLD2::ScoringTables* loadDataRaw(const void* basePointer, const uint32_t length) {
191 CLD2DynamicData::FileHeader* header = loadHeaderFromRaw(basePointer, length);
192 return loadDataInternal(header, basePointer, length);
195 CLD2::ScoringTables* loadDataInternal(CLD2DynamicData::FileHeader* header, const void* basePointer, const uint32_t length) {
197 const CLD2::uint8* state_table = static_cast<const CLD2::uint8*>(basePointer) +
198 header->startOf_utf8PropObj_state_table;
199 // FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure
200 const CLD2::RemapEntry* remap_base =
201 reinterpret_cast<const CLD2::RemapEntry*>(
202 static_cast<const CLD2::uint8*>(basePointer) +
203 header->startOf_utf8PropObj_remap_base);
204 const CLD2::uint8* remap_string = static_cast<const CLD2::uint8*>(basePointer) +
205 header->startOf_utf8PropObj_remap_string;
206 const CLD2::uint8* fast_state =
207 header->startOf_utf8PropObj_fast_state == 0 ? 0 :
208 static_cast<const CLD2::uint8*>(basePointer) +
209 header->startOf_utf8PropObj_fast_state;
211 // Populate intermediate object. Horrible casting required because the struct
212 // is all read-only integers, and doesn't have a constructor. Yikes.
213 // TODO: It might actually be less horrible to memcpy the data in <shudder>
214 const CLD2::UTF8PropObj* unigram_obj = reinterpret_cast<CLD2::UTF8PropObj*>(malloc(sizeof(CLD2::UTF8PropObj)));
215 *const_cast<CLD2::uint32*>(&unigram_obj->state0) = header->utf8PropObj_state0;
216 *const_cast<CLD2::uint32*>(&unigram_obj->state0_size) = header->utf8PropObj_state0_size;
217 *const_cast<CLD2::uint32*>(&unigram_obj->total_size) = header->utf8PropObj_total_size;
218 *const_cast<int*>(&unigram_obj->max_expand) = header->utf8PropObj_max_expand;
219 *const_cast<int*>(&unigram_obj->entry_shift) = header->utf8PropObj_entry_shift;
220 *const_cast<int*>(&unigram_obj->bytes_per_entry) = header->utf8PropObj_bytes_per_entry;
221 *const_cast<CLD2::uint32*>(&unigram_obj->losub) = header->utf8PropObj_losub;
222 *const_cast<CLD2::uint32*>(&unigram_obj->hiadd) = header->utf8PropObj_hiadd;
223 *const_cast<const CLD2::uint8**>(&unigram_obj->state_table) = state_table;
224 *const_cast<const CLD2::RemapEntry**>(&unigram_obj->remap_base) = remap_base;
225 *const_cast<const CLD2::uint8**>(&unigram_obj->remap_string) = remap_string;
226 *const_cast<const CLD2::uint8**>(&unigram_obj->fast_state) = fast_state;
228 // 2. kAvgDeltaOctaScore array
229 const short* read_kAvgDeltaOctaScore = reinterpret_cast<const short*>(
230 static_cast<const CLD2::uint8*>(basePointer) +
231 header->startOf_kAvgDeltaOctaScore);
234 CLD2::CLD2TableSummary* tableSummaries = new CLD2::CLD2TableSummary[header->numTablesEncoded];
235 for (int x=0; x<header->numTablesEncoded; x++) {
236 CLD2::CLD2TableSummary &summary = tableSummaries[x];
237 CLD2DynamicData::TableHeader& tHeader = header->tableHeaders[x];
238 const CLD2::IndirectProbBucket4* kCLDTable =
239 reinterpret_cast<const CLD2::IndirectProbBucket4*>(
240 static_cast<const CLD2::uint8*>(basePointer) + tHeader.startOf_kCLDTable);
241 const CLD2::uint32* kCLDTableInd =
242 reinterpret_cast<const CLD2::uint32*>(
243 static_cast<const CLD2::uint8*>(basePointer) + tHeader.startOf_kCLDTableInd);
244 const char* kRecognizedLangScripts =
245 static_cast<const char*>(basePointer) + tHeader.startOf_kRecognizedLangScripts;
247 summary.kCLDTable = kCLDTable;
248 summary.kCLDTableInd = kCLDTableInd;
249 summary.kCLDTableSizeOne = tHeader.kCLDTableSizeOne;
250 summary.kCLDTableSize = tHeader.kCLDTableSize;
251 summary.kCLDTableKeyMask = tHeader.kCLDTableKeyMask;
252 summary.kCLDTableBuildDate = tHeader.kCLDTableBuildDate;
253 summary.kRecognizedLangScripts = kRecognizedLangScripts;
256 // Tie everything together
257 CLD2::ScoringTables* result = new CLD2::ScoringTables;
258 result->unigram_obj = unigram_obj;
259 result->unigram_compat_obj = &tableSummaries[0];
260 result->deltabi_obj = &tableSummaries[1];
261 result->distinctbi_obj = &tableSummaries[2];
262 result->quadgram_obj = &tableSummaries[3];
263 result->quadgram_obj2 = &tableSummaries[4];
264 result->deltaocta_obj = &tableSummaries[5];
265 result->distinctocta_obj = &tableSummaries[6];
266 result->kExpectedScore = read_kAvgDeltaOctaScore;
267 delete[] header->tableHeaders;
272 } // namespace CLD2DynamicDataLoader