1 /* Writing Qt .qm files.
2 Copyright (C) 2003, 2005-2007, 2009, 2015 Free Software Foundation,
4 Written by Bruno Haible <bruno@clisp.org>, 2003.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
36 #include "po-charset.h"
37 #include "msgl-iconv.h"
38 #include "hash-string.h"
43 #include "binary-io.h"
44 #include "fwriteerror.h"
/* Shorthand for translating a user-visible message via gettext.  */
#define _(str) gettext (str)
49 /* Qt .qm files are read by the QTranslator::load() function and written
50 by the Qt QTranslator::save() function.
52 The Qt tool 'msg2qm' uses the latter function and can convert PO files
53 to .qm files. But since 'msg2qm' is marked as an "old" tool in Qt 3.0.5's
54 i18n.html documentation and therefore likely to disappear, we provide the
55 same functionality here.
57 The format of .qm files, as reverse engineered from the functions
58 QTranslator::save(const QString& filename, SaveMode mode)
59 QTranslator::squeeze(SaveMode mode)
60 QTranslatorMessage::write(QDataStream& stream, bool strip, Prefix prefix)
61 elfHash(const char* name)
62 in qt-3.0.5, is as follows:
64 It's a binary data format. Elements are u8 (byte), u16, u32. They are
65 written in big-endian order.
67 The file starts with a magic string of 16 bytes:
68 3C B8 64 18 CA EF 9C 95 CD 21 1C BF 60 A1 BD DD
   Then come three sections.  Each of the three sections is optional.  Each
   has this structure:
     struct {
       u8 section_type; // 0x42 = hashes, 0x69 = messages, 0x2f = contexts
       u32 length; // number of bytes of the data
       u8 data[length];
     };
   In the first section, the hashes section, the data has the following
   structure:
     It's a sorted array of
       struct {
         u32 hashcode; // elfHash of the concatenation of msgid and
                       // disambiguating-comment
         u32 offset; // offset within the data[] of the messages section
       };
     It's sorted in ascending order by hashcode as primary sorting criterion
     and - when the hashcodes are the same - by offset as secondary criterion.
   In the second section, the messages section, the data has the following
   structure:
     It's a sequence of records, each representing a message, in no
     particular order.  Each record is a sequence of subsections, each
     introduced by a particular subsection tag.  The possible subsection tags
     are (and they usually occur in this order):
       - 03: Translation.  Followed by the msgstr in UCS-2 or UTF-16 format:
             struct {
               u32 length; // number of bytes
               u16 chars[length/2];
             };
       - 08: Disambiguating-comment.  Followed by the NUL-terminated,
             ISO-8859-1 encoded, disambiguating-comment string:
             struct {
               u32 length; // number of bytes including the NUL at the end
               u8 chars[length];
             };
       - 06: SourceText, i.e. msgid.  Followed by the NUL-terminated,
             ISO-8859-1 encoded, msgid:
             struct {
               u32 length; // number of bytes including the NUL at the end
               u8 chars[length];
             };
       - 02: SourceText16, i.e. msgid.  Encoded as UCS-2, but must actually
             be ISO-8859-1.
             struct {
               u32 length;
               u16 chars[length/2];
             };
             This subsection tag is obsoleted by SourceText.
       - 07: Context.  Followed by the NUL-terminated, ISO-8859-1 encoded,
             context string (usually a C++ class name or empty):
             struct {
               u32 length; // number of bytes including the NUL at the end
               u8 chars[length];
             };
       - 04: Context16.  Encoded as UCS-2, but must actually be ISO-8859-1.
             struct {
               u32 length;
               u16 chars[length/2];
             };
             This subsection tag is obsoleted by Context.
       - 05: Hash.  Followed by
             struct {
               u32 hashcode; // elfHash of the concatenation of msgid and
                             // disambiguating-comment
             };
       - 01: End.  Designates the end of the record.  No further data.
     Usually the following subsections are written, but some of them are
     optional:
       - 03: Translation.
       - 08: Disambiguating-comment (optional).
       - 06: SourceText (optional).
       - 07: Context (optional).
       - 05: Hash.
       - 01: End.
     A subsection can be omitted if the value to be output is the same as
     for the previous record.
   The third section, the contexts section, contains the set of all occurring
   context strings.  This section is optional; it is used to speed up the
   search.  The data is a hash table with the following structure:
     struct {
       u16 table_size;
       u16 buckets[table_size];
       u8 pool[...];
     };
   pool[...] contains:
     u16 zero;
     for i = 0, ..., table_size-1:
       if there are context strings with elfHash(context)%table_size == i:
         for all context strings with elfHash(context)%table_size == i:
           len := min(length(context),255); // truncated to length 255
           struct {
             u8 len;
             u8 chars[len];
           };
         struct {
           u8 zero[1]; // signals the end of this bucket
           u8 padding[0 or 1]; // padding for even number of bytes
         };
   buckets[i] is 0 for an empty bucket, or the offset in pool[] where
   the context strings for this bucket start, divided by 2.
   This context section must not be used
     - if the empty context is used, or
     - if a context of length > 255 is used, or
     - if the context pool's size would be > 2^17.
177 The elfHash function is the same as our hash_string function, except that
178 at the end it maps a hash code of 0x00000000 to 0x00000001.
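   When we convert from PO file format, all disambiguating-comments are
   empty.  The msgctxt of a message, if any, is written as its context
   (an absent msgctxt becomes the empty context).  The contexts section is
   written only if every emitted message has a nonempty context of at most
   255 bytes and the resulting context pool is small enough; otherwise it
   is omitted.  */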
/* Write a u8 (a single byte) to the output stream.
   A write failure is not reported here; it stays recorded in the stream's
   error indicator.  */
static inline void
write_u8 (FILE *output_file, unsigned char value)
{
  putc (value, output_file);
}
/* Write a u16 (two bytes) to the output stream, most significant byte
   first.  A write failure is not reported here; it stays recorded in the
   stream's error indicator.  */
static inline void
write_u16 (FILE *output_file, unsigned short value)
{
  unsigned char data[2];

  data[0] = (value >> 8) & 0xff;
  data[1] = value & 0xff;

  fwrite (data, 2, 1, output_file);
}
/* Write a u32 (four bytes) to the output stream, most significant byte
   first.  A write failure is not reported here; it stays recorded in the
   stream's error indicator.  */
static inline void
write_u32 (FILE *output_file, unsigned int value)
{
  unsigned char data[4];

  data[0] = (value >> 24) & 0xff;
  data[1] = (value >> 16) & 0xff;
  data[2] = (value >> 8) & 0xff;
  data[3] = value & 0xff;

  fwrite (data, 4, 1, output_file);
}
/* Allocation and deallocation functions used by the obstacks in this
   file.  */
#define obstack_chunk_alloc xmalloc
#define obstack_chunk_free free
/* Add a u8 (a single byte) to the object being grown in an obstack.  */
static inline void
append_u8 (struct obstack *mempool, unsigned char value)
{
  unsigned char data[1];

  /* The byte must be stored before it is copied into the obstack;
     otherwise an indeterminate value would be appended.  */
  data[0] = value;

  obstack_grow (mempool, data, 1);
}
/* Add a u16 (two bytes, most significant byte first) to the object being
   grown in an obstack.  */
static inline void
append_u16 (struct obstack *mempool, unsigned short value)
{
  unsigned char data[2];

  data[0] = (value >> 8) & 0xff;
  data[1] = value & 0xff;

  obstack_grow (mempool, data, 2);
}
/* Add a u32 (four bytes, most significant byte first) to the object being
   grown in an obstack.  */
static inline void
append_u32 (struct obstack *mempool, unsigned int value)
{
  unsigned char data[4];

  data[0] = (value >> 24) & 0xff;
  data[1] = (value >> 16) & 0xff;
  data[2] = (value >> 8) & 0xff;
  data[3] = value & 0xff;

  obstack_grow (mempool, data, 4);
}
/* Add an ISO-8859-1 encoded string to an obstack.
   The string is emitted as a u32 byte count followed by the bytes of the
   string; both the count and the bytes include the terminating NUL.  */
static inline void
append_base_string (struct obstack *mempool, const char *string)
{
  size_t length = strlen (string) + 1;
  append_u32 (mempool, length);
  obstack_grow (mempool, string, length);
}
/* Add an UTF-16 encoded string to an obstack.
   STRING points to LENGTH UTF-16 units.  The string is emitted as a u32
   byte count (2 * LENGTH) followed by the units, each as a u16.  No
   terminator is written.  */
static inline void
append_unicode_string (struct obstack *mempool, const unsigned short *string,
                       size_t length)
{
  append_u32 (mempool, length * 2);
  for (; length > 0; string++, length--)
    append_u16 (mempool, *string);
}
/* Retrieve a 4-byte integer, stored most significant byte first, from
   memory.  */
static inline unsigned int
peek_u32 (const unsigned char *p)
{
  /* Convert each byte to unsigned int before shifting.  Otherwise the byte
     is promoted to (signed) int, and shifting a value >= 0x80 left by 24
     bits overflows, which is undefined behavior.  */
  return ((unsigned int) p[0] << 24)
         | ((unsigned int) p[1] << 16)
         | ((unsigned int) p[2] << 8)
         | (unsigned int) p[3];
}
284 /* Convert an UTF-8 string to ISO-8859-1, without error checking. */
286 conv_to_iso_8859_1 (const char *string)
288 size_t length = strlen (string);
289 const char *str = string;
290 const char *str_limit = string + length;
291 /* Conversion to ISO-8859-1 can only reduce the number of bytes. */
292 char *result = XNMALLOC (length + 1, char);
295 while (str < str_limit)
298 str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
299 /* It has already been verified that the string fits in ISO-8859-1. */
302 /* Store as ISO-8859-1. */
303 *q++ = (unsigned char) uc;
306 assert (q - result <= length);
311 /* Convert an UTF-8 string to UTF-16, returning its size (number of UTF-16
312 codepoints) in *SIZEP. */
313 static unsigned short *
314 conv_to_utf16 (const char *string, size_t *sizep)
316 size_t length = strlen (string);
317 const char *str = string;
318 const char *str_limit = string + length;
319 /* Conversion to UTF-16 can at most double the number of bytes. */
320 unsigned short *result = XNMALLOC (length, unsigned short);
321 unsigned short *q = result;
323 while (str < str_limit)
326 str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
328 /* UCS-2 character. */
329 *q++ = (unsigned short) uc;
332 /* UTF-16 surrogate. */
333 *q++ = 0xd800 + ((uc - 0x10000) >> 10);
334 *q++ = 0xdc00 + ((uc - 0x10000) & 0x3ff);
337 assert (q - result <= 2 * length);
/* Return the Qt hash code of a string.
   This is hash_string, except that a hash code of 0 is mapped to 1, as
   Qt's elfHash does.  */
static unsigned int
string_hashcode (const char *str)
{
  unsigned int h;

  h = hash_string (str);
  if (h == 0)
    h = 1;
  return h;
}
/* Compare two entries of the hashes section.
   Each entry consists of a u32 hashcode followed by a u32 offset.  Entries
   are ordered by hashcode first and by offset second.  Returns -1, 0 or 1,
   suitable for qsort.  */
static int
cmp_hashes (const void *va, const void *vb)
{
  const unsigned char *a = (const unsigned char *) va;
  const unsigned char *b = (const unsigned char *) vb;
  unsigned int a_hashcode = peek_u32 (a);
  unsigned int b_hashcode = peek_u32 (b);

  if (a_hashcode != b_hashcode)
    return (a_hashcode >= b_hashcode ? 1 : -1);
  else
    {
      unsigned int a_offset = peek_u32 (a + 4);
      unsigned int b_offset = peek_u32 (b + 4);

      if (a_offset != b_offset)
        return (a_offset >= b_offset ? 1 : -1);
      else
        return 0;
    }
}
/* Write a section to the output stream.
   TAG is the section type byte, DATA points to SIZE bytes of section
   contents.  Nothing at all is written for an empty section.  */
static void
write_section (FILE *output_file, unsigned char tag, void *data, size_t size)
{
  /* A section can be omitted if it is empty.  */
  if (size > 0)
    {
      write_u8 (output_file, tag);
      write_u32 (output_file, size);
      fwrite (data, size, 1, output_file);
    }
}
393 /* Write an entire .qm file. */
395 write_qm (FILE *output_file, message_list_ty *mlp)
397 static unsigned char magic[16] =
399 0x3C, 0xB8, 0x64, 0x18, 0xCA, 0xEF, 0x9C, 0x95,
400 0xCD, 0x21, 0x1C, 0xBF, 0x60, 0xA1, 0xBD, 0xDD
402 struct obstack hashes_pool;
403 struct obstack messages_pool;
406 obstack_init (&hashes_pool);
407 obstack_init (&messages_pool);
409 /* Prepare the hashes section and the messages section. */
410 for (j = 0; j < mlp->nitems; j++)
412 message_ty *mp = mlp->item[j];
414 /* No need to emit the header entry, it's not needed at runtime. */
417 char *msgctxt_as_iso_8859_1 =
418 conv_to_iso_8859_1 (mp->msgctxt != NULL ? mp->msgctxt : "");
419 char *msgid_as_iso_8859_1 = conv_to_iso_8859_1 (mp->msgid);
421 unsigned short *msgstr_as_utf16 =
422 conv_to_utf16 (mp->msgstr, &msgstr_len);
423 unsigned int hashcode = string_hashcode (msgid_as_iso_8859_1);
424 unsigned int offset = obstack_object_size (&messages_pool);
426 /* Add a record to the hashes section. */
427 append_u32 (&hashes_pool, hashcode);
428 append_u32 (&hashes_pool, offset);
430 /* Add a record to the messages section. */
432 append_u8 (&messages_pool, 0x03);
433 append_unicode_string (&messages_pool, msgstr_as_utf16, msgstr_len);
435 append_u8 (&messages_pool, 0x08);
436 append_base_string (&messages_pool, "");
438 append_u8 (&messages_pool, 0x06);
439 append_base_string (&messages_pool, msgid_as_iso_8859_1);
441 append_u8 (&messages_pool, 0x07);
442 append_base_string (&messages_pool, msgctxt_as_iso_8859_1);
444 append_u8 (&messages_pool, 0x05);
445 append_u32 (&messages_pool, hashcode);
447 append_u8 (&messages_pool, 0x01);
449 free (msgstr_as_utf16);
450 free (msgid_as_iso_8859_1);
451 free (msgctxt_as_iso_8859_1);
455 /* Sort the hashes section. */
457 size_t nstrings = obstack_object_size (&hashes_pool) / 8;
459 qsort (obstack_base (&hashes_pool), nstrings, 8, cmp_hashes);
462 /* Write the magic number. */
463 fwrite (magic, sizeof (magic), 1, output_file);
465 /* Write the hashes section. */
466 write_section (output_file, 0x42, obstack_base (&hashes_pool),
467 obstack_object_size (&hashes_pool));
469 /* Write the messages section. */
470 write_section (output_file, 0x69, obstack_base (&messages_pool),
471 obstack_object_size (&messages_pool));
473 /* Decide whether to write a contexts section. */
475 bool can_write_contexts = true;
477 for (j = 0; j < mlp->nitems; j++)
479 message_ty *mp = mlp->item[j];
482 if (mp->msgctxt == NULL || mp->msgctxt[0] == '\0'
483 || strlen (mp->msgctxt) > 255)
485 can_write_contexts = false;
490 if (can_write_contexts)
492 hash_table all_contexts;
494 unsigned long table_size;
496 /* Collect the contexts, removing duplicates. */
497 hash_init (&all_contexts, 10);
498 for (j = 0; j < mlp->nitems; j++)
500 message_ty *mp = mlp->item[j];
503 hash_insert_entry (&all_contexts,
504 mp->msgctxt, strlen (mp->msgctxt) + 1,
508 /* Compute the number of different contexts. */
509 num_contexts = all_contexts.size;
511 /* Compute a suitable hash table size. */
512 table_size = next_prime (num_contexts * 1.7);
513 if (table_size >= 0x10000)
516 /* Put the contexts into a hash table of size table_size. */
518 struct list_cell { const char *context; struct list_cell *next; };
519 struct list_cell *list_memory =
520 XNMALLOC (table_size, struct list_cell);
521 struct list_cell *freelist;
522 struct bucket { struct list_cell *head; struct list_cell **tail; };
523 struct bucket *buckets = XNMALLOC (table_size, struct bucket);
526 freelist = list_memory;
528 for (i = 0; i < table_size; i++)
530 buckets[i].head = NULL;
531 buckets[i].tail = &buckets[i].head;
541 while (hash_iterate (&all_contexts, &iter, &key, &keylen, &null)
544 const char *context = (const char *)key;
545 i = string_hashcode (context) % table_size;
546 freelist->context = context;
547 freelist->next = NULL;
548 *buckets[i].tail = freelist;
549 buckets[i].tail = &freelist->next;
554 /* Determine the total context pool size. */
559 for (i = 0; i < table_size; i++)
560 if (buckets[i].head != NULL)
562 const struct list_cell *p;
564 for (p = buckets[i].head; p != NULL; p = p->next)
565 pool_size += 1 + strlen (p->context);
567 if ((pool_size % 2) != 0)
570 if (pool_size <= 0x20000)
572 /* Prepare the contexts section. */
573 struct obstack contexts_pool;
576 obstack_init (&contexts_pool);
578 append_u16 (&contexts_pool, table_size);
580 for (i = 0; i < table_size; i++)
581 if (buckets[i].head != NULL)
583 const struct list_cell *p;
585 append_u16 (&contexts_pool, pool_offset / 2);
586 for (p = buckets[i].head; p != NULL; p = p->next)
587 pool_offset += 1 + strlen (p->context);
589 if ((pool_offset % 2) != 0)
593 append_u16 (&contexts_pool, 0);
594 if (!(pool_offset == pool_size))
597 append_u16 (&contexts_pool, 0);
599 for (i = 0; i < table_size; i++)
600 if (buckets[i].head != NULL)
602 const struct list_cell *p;
604 for (p = buckets[i].head; p != NULL; p = p->next)
606 append_u8 (&contexts_pool, strlen (p->context));
607 obstack_grow (&contexts_pool,
608 p->context, strlen (p->context));
609 pool_offset += 1 + strlen (p->context);
611 append_u8 (&contexts_pool, 0);
613 if ((pool_offset % 2) != 0)
615 append_u8 (&contexts_pool, 0);
619 if (!(pool_offset == pool_size))
622 if (!(obstack_object_size (&contexts_pool)
623 == 2 + 2 * table_size + pool_size))
626 /* Write the contexts section. */
627 write_section (output_file, 0x2f, obstack_base (&contexts_pool),
628 obstack_object_size (&contexts_pool));
630 obstack_free (&contexts_pool, NULL);
638 hash_destroy (&all_contexts);
642 obstack_free (&messages_pool, NULL);
643 obstack_free (&hashes_pool, NULL);
648 msgdomain_write_qt (message_list_ty *mlp, const char *canon_encoding,
649 const char *domain_name, const char *file_name)
653 /* If no entry for this domain don't even create the file. */
654 if (mlp->nitems != 0)
656 /* Determine whether mlp has plural entries. */
662 for (j = 0; j < mlp->nitems; j++)
663 if (mlp->item[j]->msgid_plural != NULL)
667 multiline_error (xstrdup (""),
669 message catalog has plural form translations\n\
670 but the Qt message catalog format doesn't support plural handling\n")));
675 /* Convert the messages to Unicode. */
676 iconv_message_list (mlp, canon_encoding, po_charset_utf8, NULL);
678 /* Determine whether mlp has non-ISO-8859-1 msgctxt entries. */
682 for (j = 0; j < mlp->nitems; j++)
684 const char *string = mlp->item[j]->msgctxt;
688 /* An UTF-8 encoded string fits in ISO-8859-1 if and only if
689 all its bytes are < 0xc4. */
690 for (; *string; string++)
691 if ((unsigned char) *string >= 0xc4)
693 multiline_error (xstrdup (""),
695 message catalog has msgctxt strings containing characters outside ISO-8859-1\n\
696 but the Qt message catalog format supports Unicode only in the translated\n\
697 strings, not in the context strings\n")));
704 /* Determine whether mlp has non-ISO-8859-1 msgid entries. */
708 for (j = 0; j < mlp->nitems; j++)
710 const char *string = mlp->item[j]->msgid;
712 /* An UTF-8 encoded string fits in ISO-8859-1 if and only if all
713 its bytes are < 0xc4. */
714 for (; *string; string++)
715 if ((unsigned char) *string >= 0xc4)
717 multiline_error (xstrdup (""),
719 message catalog has msgid strings containing characters outside ISO-8859-1\n\
720 but the Qt message catalog format supports Unicode only in the translated\n\
721 strings, not in the untranslated strings\n")));
727 if (strcmp (domain_name, "-") == 0)
729 output_file = stdout;
730 SET_BINARY (fileno (output_file));
734 output_file = fopen (file_name, "wb");
735 if (output_file == NULL)
737 error (0, errno, _("error while opening \"%s\" for writing"),
743 if (output_file != NULL)
745 write_qm (output_file, mlp);
747 /* Make sure nothing went wrong. */
748 if (fwriteerror (output_file))
749 error (EXIT_FAILURE, errno, _("error while writing \"%s\" file"),