8764c3e5ac55c9ad21095b52928e9eba2b19c1cc
[platform/framework/web/crosswalk.git] / src / third_party / cld_2 / src / internal / cld2_dynamic_data_loader.cc
1 //
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 //
6 //     http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13
14 #include <assert.h>
15 #include <stdint.h>
16 #include <stdio.h>
17 #include <fstream>
18 #include <fcntl.h>
19 #include <stdlib.h>
20 #include <string.h>
21
22 #include "cld2_dynamic_compat.h" // for win32/posix compatibility
23 #include "cld2_dynamic_data.h"
24 #include "cld2_dynamic_data_loader.h"
25 #include "integral_types.h"
26 #include "cld2tablesummary.h"
27 #include "utf8statetable.h"
28 #include "scoreonescriptspan.h"
29
30 namespace CLD2DynamicDataLoader {
// File-local debug flag, initialised to 0. Nothing in this file reads it.
static int DEBUG = 0;
32
33 CLD2DynamicData::FileHeader* loadHeaderFromFile(const char* fileName) {
34   FILE* inFile = fopen(fileName, "r");
35   if (inFile == NULL) {
36     return NULL;
37   }
38   return loadInternal(inFile, NULL, -1);
39 }
40
41 CLD2DynamicData::FileHeader* loadHeaderFromRaw(const void* basePointer,
42                                                const uint32_t length) {
43   return loadInternal(NULL, basePointer, length);
44 }
45
46
47 #define CLD2_READINT(field) \
48   if (sourceIsFile) {\
49     bytesRead += 4 * fread(&(header->field), 4, 1, inFile);\
50   } else {\
51     memcpy(&(header->field), (((char*)(basePointer)) + bytesRead), 4);\
52     bytesRead += 4;\
53   }
54 CLD2DynamicData::FileHeader* loadInternal(FILE* inFile, const void* basePointer, const uint32_t length) {
55   const bool sourceIsFile = (inFile != NULL);
56   int bytesRead = 0;
57   CLD2DynamicData::FileHeader* header = new CLD2DynamicData::FileHeader;
58
59   // TODO: force null-terminate char* strings for safety
60   if (sourceIsFile) {
61     bytesRead += fread(header->sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, inFile);
62   } else {
63     memcpy(header->sanityString, basePointer, CLD2DynamicData::DATA_FILE_MARKER_LENGTH);
64     bytesRead += CLD2DynamicData::DATA_FILE_MARKER_LENGTH;
65   }
66
67   if (!CLD2DynamicData::mem_compare(
68                                     header->sanityString,
69                                     CLD2DynamicData::DATA_FILE_MARKER,
70                                     CLD2DynamicData::DATA_FILE_MARKER_LENGTH)) {
71     fprintf(stderr, "Malformed header: bad file marker!\n");
72     delete header;
73     return NULL;
74   }
75
76   CLD2_READINT(totalFileSizeBytes);
77   CLD2_READINT(utf8PropObj_state0);
78   CLD2_READINT(utf8PropObj_state0_size);
79   CLD2_READINT(utf8PropObj_total_size);
80   CLD2_READINT(utf8PropObj_max_expand);
81   CLD2_READINT(utf8PropObj_entry_shift);
82   CLD2_READINT(utf8PropObj_bytes_per_entry);
83   CLD2_READINT(utf8PropObj_losub);
84   CLD2_READINT(utf8PropObj_hiadd);
85   CLD2_READINT(startOf_utf8PropObj_state_table);
86   CLD2_READINT(lengthOf_utf8PropObj_state_table);
87   CLD2_READINT(startOf_utf8PropObj_remap_base);
88   CLD2_READINT(lengthOf_utf8PropObj_remap_base);
89   CLD2_READINT(startOf_utf8PropObj_remap_string);
90   CLD2_READINT(lengthOf_utf8PropObj_remap_string);
91   CLD2_READINT(startOf_utf8PropObj_fast_state);
92   CLD2_READINT(lengthOf_utf8PropObj_fast_state);
93   CLD2_READINT(startOf_kAvgDeltaOctaScore);
94   CLD2_READINT(lengthOf_kAvgDeltaOctaScore);
95   CLD2_READINT(numTablesEncoded);
96
97   CLD2DynamicData::TableHeader* tableHeaders = new CLD2DynamicData::TableHeader[header->numTablesEncoded];
98   header->tableHeaders = tableHeaders;
99   for (int x=0; x<header->numTablesEncoded; x++) {
100     CLD2DynamicData::TableHeader *header = &(tableHeaders[x]);
101     CLD2_READINT(kCLDTableSizeOne);
102     CLD2_READINT(kCLDTableSize);
103     CLD2_READINT(kCLDTableKeyMask);
104     CLD2_READINT(kCLDTableBuildDate);
105     CLD2_READINT(startOf_kCLDTable);
106     CLD2_READINT(lengthOf_kCLDTable);
107     CLD2_READINT(startOf_kCLDTableInd);
108     CLD2_READINT(lengthOf_kCLDTableInd);
109     CLD2_READINT(startOf_kRecognizedLangScripts);
110     CLD2_READINT(lengthOf_kRecognizedLangScripts);
111   }
112
113   // Confirm header size is correct.
114   int expectedHeaderSize = CLD2DynamicData::calculateHeaderSize(header->numTablesEncoded);
115   if (expectedHeaderSize != bytesRead) {
116     fprintf(stderr, "Header size mismatch! Expected %d, but read %d\n", expectedHeaderSize, bytesRead);
117     delete header;
118     delete[] tableHeaders;
119     return NULL;
120   }
121
122   int actualSize = 0;
123   if (sourceIsFile) {
124     // Confirm file size is correct.
125     fseek(inFile, 0, SEEK_END);
126     actualSize = ftell(inFile);
127     fclose(inFile);
128   } else {
129     actualSize = length;
130   }
131
132   if (actualSize != header->totalFileSizeBytes) {
133     fprintf(stderr, "File size mismatch! Expected %d, but found %d\n", header->totalFileSizeBytes, actualSize);
134     delete header;
135     delete[] tableHeaders;
136     return NULL;
137   }
138   return header;
139 }
140
141 void unloadDataFile(CLD2::ScoringTables** scoringTables,
142                     void** mmapAddress, uint32_t* mmapLength) {
143 #ifdef _WIN32
144   // See https://code.google.com/p/cld2/issues/detail?id=20
145   fprintf(stderr, "dynamic data unloading from file is not currently supported on win32, use raw mode instead.");
146   return;
147 #else // i.e., is POSIX (no support for Mac prior to OSX)
148   CLD2DynamicDataLoader::unloadDataRaw(scoringTables);
149   munmap(*mmapAddress, *mmapLength);
150   *mmapAddress = NULL;
151   *mmapLength = 0;
152 #endif // ifdef _WIN32
153 }
154
155 void unloadDataRaw(CLD2::ScoringTables** scoringTables) {
156   free(const_cast<CLD2::UTF8PropObj*>((*scoringTables)->unigram_obj));
157   (*scoringTables)->unigram_obj = NULL;
158   delete((*scoringTables)->unigram_compat_obj); // tableSummaries[0] from loadDataFile
159   (*scoringTables)->unigram_compat_obj = NULL;
160   delete(*scoringTables);
161   *scoringTables = NULL;
162 }
163
164 CLD2::ScoringTables* loadDataFile(const char* fileName,
165                                   void** mmapAddressOut, uint32_t* mmapLengthOut) {
166
167 #ifdef _WIN32
168   // See https://code.google.com/p/cld2/issues/detail?id=20
169   fprintf(stderr, "dynamic data loading from file is not currently supported on win32, use raw mode instead.");
170   return NULL;
171 #else // i.e., is POSIX (no support for Mac prior to OSX)
172   CLD2DynamicData::FileHeader* header = loadHeaderFromFile(fileName);
173   if (header == NULL) {
174     return NULL;
175   }
176
177   // Initialize the memory map
178   int inFileHandle = OPEN(fileName, O_RDONLY);
179   void* mapped = mmap(NULL, header->totalFileSizeBytes,
180     PROT_READ, MAP_PRIVATE, inFileHandle, 0);
181   // Record the map address. This allows callers to unmap 
182   *mmapAddressOut=mapped;
183   *mmapLengthOut=header->totalFileSizeBytes;
184   CLOSE(inFileHandle);
185
186   return loadDataInternal(header, mapped, header->totalFileSizeBytes);
187 #endif // ifdef _WIN32
188 }
189
190 CLD2::ScoringTables* loadDataRaw(const void* basePointer, const uint32_t length) {
191   CLD2DynamicData::FileHeader* header = loadHeaderFromRaw(basePointer, length);
192   return loadDataInternal(header, basePointer, length);
193 }
194
195 CLD2::ScoringTables* loadDataInternal(CLD2DynamicData::FileHeader* header, const void* basePointer, const uint32_t length) {
196   // 1. UTF8 Object
197   const CLD2::uint8* state_table = static_cast<const CLD2::uint8*>(basePointer) +
198     header->startOf_utf8PropObj_state_table;
199   // FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure
200   const CLD2::RemapEntry* remap_base =
201     reinterpret_cast<const CLD2::RemapEntry*>(
202       static_cast<const CLD2::uint8*>(basePointer) +
203       header->startOf_utf8PropObj_remap_base);
204   const CLD2::uint8* remap_string = static_cast<const CLD2::uint8*>(basePointer) +
205     header->startOf_utf8PropObj_remap_string;
206   const CLD2::uint8* fast_state =
207     header->startOf_utf8PropObj_fast_state == 0 ? 0 :
208       static_cast<const CLD2::uint8*>(basePointer) +
209       header->startOf_utf8PropObj_fast_state;
210
211   // Populate intermediate object. Horrible casting required because the struct
212   // is all read-only integers, and doesn't have a constructor. Yikes.
213   // TODO: It might actually be less horrible to memcpy the data in <shudder>
214   const CLD2::UTF8PropObj* unigram_obj = reinterpret_cast<CLD2::UTF8PropObj*>(malloc(sizeof(CLD2::UTF8PropObj)));
215   *const_cast<CLD2::uint32*>(&unigram_obj->state0) = header->utf8PropObj_state0;
216   *const_cast<CLD2::uint32*>(&unigram_obj->state0_size) = header->utf8PropObj_state0_size;
217   *const_cast<CLD2::uint32*>(&unigram_obj->total_size) = header->utf8PropObj_total_size;
218   *const_cast<int*>(&unigram_obj->max_expand) = header->utf8PropObj_max_expand;
219   *const_cast<int*>(&unigram_obj->entry_shift) = header->utf8PropObj_entry_shift;
220   *const_cast<int*>(&unigram_obj->bytes_per_entry) = header->utf8PropObj_bytes_per_entry;
221   *const_cast<CLD2::uint32*>(&unigram_obj->losub) = header->utf8PropObj_losub;
222   *const_cast<CLD2::uint32*>(&unigram_obj->hiadd) = header->utf8PropObj_hiadd;
223   *const_cast<const CLD2::uint8**>(&unigram_obj->state_table) = state_table;
224   *const_cast<const CLD2::RemapEntry**>(&unigram_obj->remap_base) = remap_base;
225   *const_cast<const CLD2::uint8**>(&unigram_obj->remap_string) = remap_string;
226   *const_cast<const CLD2::uint8**>(&unigram_obj->fast_state) = fast_state;
227
228   // 2. kAvgDeltaOctaScore array
229   const short* read_kAvgDeltaOctaScore = reinterpret_cast<const short*>(
230     static_cast<const CLD2::uint8*>(basePointer) +
231     header->startOf_kAvgDeltaOctaScore);
232
233   // 3. Each table
234   CLD2::CLD2TableSummary* tableSummaries = new CLD2::CLD2TableSummary[header->numTablesEncoded];
235   for (int x=0; x<header->numTablesEncoded; x++) {
236     CLD2::CLD2TableSummary &summary = tableSummaries[x];
237     CLD2DynamicData::TableHeader& tHeader = header->tableHeaders[x];
238     const CLD2::IndirectProbBucket4* kCLDTable =
239       reinterpret_cast<const CLD2::IndirectProbBucket4*>(
240         static_cast<const CLD2::uint8*>(basePointer) + tHeader.startOf_kCLDTable);
241     const CLD2::uint32* kCLDTableInd =
242       reinterpret_cast<const CLD2::uint32*>(
243         static_cast<const CLD2::uint8*>(basePointer) + tHeader.startOf_kCLDTableInd);
244     const char* kRecognizedLangScripts =
245       static_cast<const char*>(basePointer) + tHeader.startOf_kRecognizedLangScripts;
246
247     summary.kCLDTable = kCLDTable;
248     summary.kCLDTableInd = kCLDTableInd;
249     summary.kCLDTableSizeOne = tHeader.kCLDTableSizeOne;
250     summary.kCLDTableSize = tHeader.kCLDTableSize;
251     summary.kCLDTableKeyMask = tHeader.kCLDTableKeyMask;
252     summary.kCLDTableBuildDate = tHeader.kCLDTableBuildDate;
253     summary.kRecognizedLangScripts = kRecognizedLangScripts;
254   }
255
256   // Tie everything together
257   CLD2::ScoringTables* result = new CLD2::ScoringTables;
258   result->unigram_obj = unigram_obj;
259   result->unigram_compat_obj = &tableSummaries[0];
260   result->deltabi_obj = &tableSummaries[1];
261   result->distinctbi_obj = &tableSummaries[2];
262   result->quadgram_obj = &tableSummaries[3];
263   result->quadgram_obj2 = &tableSummaries[4];
264   result->deltaocta_obj = &tableSummaries[5];
265   result->distinctocta_obj = &tableSummaries[6];
266   result->kExpectedScore = read_kAvgDeltaOctaScore;
267   delete[] header->tableHeaders;
268   delete header;
269   return result;
270 }
271
272 } // namespace CLD2DynamicDataLoader