1 /* Writing Qt .qm files.
2 Copyright (C) 2003, 2005-2007, 2009 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "po-charset.h"
#include "msgl-iconv.h"
#include "hash-string.h"
#include "binary-io.h"
#include "fwriteerror.h"
/* Shorthand for marking and translating user-visible messages.  */
#define _(str) gettext (str)
48 /* Qt .qm files are read by the QTranslator::load() function and written
49 by the Qt QTranslator::save() function.
51 The Qt tool 'msg2qm' uses the latter function and can convert PO files
52 to .qm files. But since 'msg2qm' is marked as an "old" tool in Qt 3.0.5's
53 i18n.html documentation and therefore likely to disappear, we provide the
54 same functionality here.
56 The format of .qm files, as reverse engineered from the functions
57 QTranslator::save(const QString& filename, SaveMode mode)
58 QTranslator::squeeze(SaveMode mode)
59 QTranslatorMessage::write(QDataStream& stream, bool strip, Prefix prefix)
60 elfHash(const char* name)
61 in qt-3.0.5, is as follows:
63 It's a binary data format. Elements are u8 (byte), u16, u32. They are
64 written in big-endian order.
66 The file starts with a magic string of 16 bytes:
67 3C B8 64 18 CA EF 9C 95 CD 21 1C BF 60 A1 BD DD
   Then come three sections.  Each of the three sections is optional.  Each
   has this structure:
     struct {
       u8 section_type; // 0x42 = hashes, 0x69 = messages, 0x2f = contexts
       u32 length;      // number of bytes of the data
       u8 data[length];
     };
   In the first section, the hashes section, the data has the following
   structure:
   It's a sorted array of
     struct {
       u32 hashcode; // elfHash of the concatenation of msgid and
                     // disambiguating-comment
       u32 offset;   // offset within the data[] of the messages section
     };
   It's sorted in ascending order by hashcode as the primary sorting criterion
   and - when the hashcodes are the same - by offset as the secondary criterion.
88 In the second section, the messages section, the data has the following
90 It's a sequence of records, each representing a message, in no
91 particular order. Each record is a sequence of subsections, each
92 introduced by a particular subsection tag. The possible subsection tags
93 are (and they usually occur in this order):
94 - 03: Translation. Followed by the msgstr in UCS-2 or UTF-16 format:
99 - 08: Disambiguating-comment. Followed by the NUL-terminated,
100 ISO-8859-1 encoded, disambiguating-comment string:
102 u32 length; // number of bytes including the NUL at the end
105 - 06: SourceText, i.e. msgid. Followed by the NUL-terminated,
106 ISO-8859-1 encoded, msgid:
108 u32 length; // number of bytes including the NUL at the end
111 - 02: SourceText16, i.e. msgid. Encoded as UCS-2, but must actually
117 This subsection tag is obsoleted by SourceText.
118 - 07: Context. Followed by the NUL-terminated, ISO-8859-1 encoded,
119 context string (usually a C++ class name or empty):
121 u32 length; // number of bytes including the NUL at the end
124 - 04: Context16. Encoded as UCS-2, but must actually be ISO-8859-1.
129 This subsection tag is obsoleted by Context.
130 - 05: Hash. Followed by
132 u32 hashcode; // elfHash of the concatenation of msgid and
133 // disambiguating-comment
135 - 01: End. Designates the end of the record. No further data.
136 Usually the following subsections are written, but some of them are
139 - 08: Disambiguating-comment (optional).
140 - 06: SourceText (optional).
141 - 07: Context (optional).
144 A subsection can be omitted if the value to be output is the same as
145 for the previous record.
147 The third section, the contexts section, contains the set of all occurring
148 context strings. This section is optional; it is used to speed up the
149 search. The data is a hash table with the following structure:
152 u16 buckets[table_size];
         for i = 0, ..., table_size - 1:
158 if there are context strings with elfHash(context)%table_size == i:
159 for all context strings with elfHash(context)%table_size == i:
160 len := min(length(context),255); // truncated to length 255
166 u8 zero[1]; // signals the end of this bucket
167 u8 padding[0 or 1]; // padding for even number of bytes
169 buckets[i] is 0 for an empty bucket, or the offset in pool[] where
170 the context strings for this bucket start, divided by 2.
171 This context section must not be used
172 - if the empty context is used, or
173 - if a context of length > 255 is used, or
174 - if the context pool's size would be > 2^17.
176 The elfHash function is the same as our hash_string function, except that
177 at the end it maps a hash code of 0x00000000 to 0x00000001.
179 When we convert from PO file format, all disambiguating-comments and
180 contexts are empty, and therefore the contexts section can be omitted. */
/* Write a u8 (a single byte) to the output stream OUTPUT_FILE.
   The result of putc is not examined here; a failed write is left for the
   caller to detect on the stream afterwards.  */
static void
write_u8 (FILE *output_file, unsigned char value)
{
  putc (value, output_file);
}
/* Write a u16 (two bytes) to the output stream OUTPUT_FILE, most
   significant byte first (big-endian), as the .qm format requires.
   The result of fwrite is not examined here; a failed write is left for
   the caller to detect on the stream afterwards.  */
static void
write_u16 (FILE *output_file, unsigned short value)
{
  unsigned char data[2];

  data[0] = (value >> 8) & 0xff;
  data[1] = value & 0xff;

  fwrite (data, 2, 1, output_file);
}
/* Write a u32 (four bytes) to the output stream OUTPUT_FILE, most
   significant byte first (big-endian), as the .qm format requires.
   The result of fwrite is not examined here; a failed write is left for
   the caller to detect on the stream afterwards.  */
static void
write_u32 (FILE *output_file, unsigned int value)
{
  unsigned char data[4];

  data[0] = (value >> 24) & 0xff;
  data[1] = (value >> 16) & 0xff;
  data[2] = (value >> 8) & 0xff;
  data[3] = value & 0xff;

  fwrite (data, 4, 1, output_file);
}
/* Allocation and deallocation functions used by the obstack macros.  */
#define obstack_chunk_alloc xmalloc
#define obstack_chunk_free free
/* Add a u8 (a single byte) to the object being grown in the obstack
   MEMPOOL.  */
static void
append_u8 (struct obstack *mempool, unsigned char value)
{
  unsigned char data[1];

  data[0] = value;

  obstack_grow (mempool, data, 1);
}
/* Add a u16 (two bytes, big-endian) to the object being grown in the
   obstack MEMPOOL.  */
static void
append_u16 (struct obstack *mempool, unsigned short value)
{
  unsigned char data[2];

  data[0] = (value >> 8) & 0xff;
  data[1] = value & 0xff;

  obstack_grow (mempool, data, 2);
}
/* Add a u32 (four bytes, big-endian) to the object being grown in the
   obstack MEMPOOL.  */
static void
append_u32 (struct obstack *mempool, unsigned int value)
{
  unsigned char data[4];

  data[0] = (value >> 24) & 0xff;
  data[1] = (value >> 16) & 0xff;
  data[2] = (value >> 8) & 0xff;
  data[3] = value & 0xff;

  obstack_grow (mempool, data, 4);
}
/* Add an ISO-8859-1 encoded string to an obstack.
   The string is emitted as a u32 byte count followed by the bytes; both the
   count and the bytes include the terminating NUL.  */
static void
append_base_string (struct obstack *mempool, const char *string)
{
  size_t length = strlen (string) + 1;
  append_u32 (mempool, length);
  obstack_grow (mempool, string, length);
}
/* Add an UTF-16 encoded string to an obstack.
   STRING points to LENGTH 16-bit units.  The string is emitted as a u32
   byte count (LENGTH * 2) followed by each unit as a big-endian u16; no
   terminator is added.  */
static void
append_unicode_string (struct obstack *mempool, const unsigned short *string,
                       size_t length)
{
  append_u32 (mempool, length * 2);
  for (; length > 0; string++, length--)
    append_u16 (mempool, *string);
}
/* Retrieve a 4-byte big-endian integer from memory at P.
   Each byte is converted to unsigned int before shifting: shifting the
   int-promoted value of a byte >= 0x80 left by 24 would overflow a signed
   int, which is undefined behavior.  */
static inline unsigned int
peek_u32 (const unsigned char *p)
{
  return ((unsigned int) p[0] << 24)
         | ((unsigned int) p[1] << 16)
         | ((unsigned int) p[2] << 8)
         | (unsigned int) p[3];
}
283 /* Convert an UTF-8 string to ISO-8859-1, without error checking. */
285 conv_to_iso_8859_1 (const char *string)
287 size_t length = strlen (string);
288 const char *str = string;
289 const char *str_limit = string + length;
290 /* Conversion to ISO-8859-1 can only reduce the number of bytes. */
291 char *result = XNMALLOC (length + 1, char);
294 while (str < str_limit)
297 str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
298 /* It has already been verified that the string fits in ISO-8859-1. */
301 /* Store as ISO-8859-1. */
302 *q++ = (unsigned char) uc;
305 assert (q - result <= length);
310 /* Convert an UTF-8 string to UTF-16, returning its size (number of UTF-16
311 codepoints) in *SIZEP. */
312 static unsigned short *
313 conv_to_utf16 (const char *string, size_t *sizep)
315 size_t length = strlen (string);
316 const char *str = string;
317 const char *str_limit = string + length;
318 /* Conversion to UTF-16 can at most double the number of bytes. */
319 unsigned short *result = XNMALLOC (length, unsigned short);
320 unsigned short *q = result;
322 while (str < str_limit)
325 str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
327 /* UCS-2 character. */
328 *q++ = (unsigned short) uc;
331 /* UTF-16 surrogate. */
332 *q++ = 0xd800 + ((uc - 0x10000) >> 10);
333 *q++ = 0xdc00 + ((uc - 0x10000) & 0x3ff);
336 assert (q - result <= 2 * length);
/* Return the Qt hash code of a string.
   This is hash_string, except that a result of 0 is mapped to 1, as Qt's
   elfHash does.  */
static unsigned int
string_hashcode (const char *str)
{
  unsigned int h;

  h = hash_string (str);
  if (h == 0)
    h = 1;
  return h;
}
/* Compare two entries of the hashes section, for use with qsort.
   Each entry is 8 bytes: a big-endian u32 hashcode followed by a big-endian
   u32 offset.  Entries are ordered by hashcode first and by offset second.
   Returns -1, 0 or 1.  */
static int
cmp_hashes (const void *va, const void *vb)
{
  const unsigned char *a = (const unsigned char *) va;
  const unsigned char *b = (const unsigned char *) vb;
  unsigned int a_hashcode = peek_u32 (a);
  unsigned int b_hashcode = peek_u32 (b);

  if (a_hashcode != b_hashcode)
    return (a_hashcode >= b_hashcode ? 1 : -1);
  else
    {
      unsigned int a_offset = peek_u32 (a + 4);
      unsigned int b_offset = peek_u32 (b + 4);

      if (a_offset != b_offset)
        return (a_offset >= b_offset ? 1 : -1);
      else
        return 0;
    }
}
/* Write a section to the output stream.
   A section consists of the one-byte TAG, the u32 SIZE and then SIZE bytes
   of DATA.  Nothing at all is written when SIZE is 0.  */
static void
write_section (FILE *output_file, unsigned char tag, void *data, size_t size)
{
  /* A section can be omitted if it is empty.  */
  if (size > 0)
    {
      write_u8 (output_file, tag);
      write_u32 (output_file, size);
      fwrite (data, size, 1, output_file);
    }
}
392 /* Write an entire .qm file. */
394 write_qm (FILE *output_file, message_list_ty *mlp)
396 static unsigned char magic[16] =
398 0x3C, 0xB8, 0x64, 0x18, 0xCA, 0xEF, 0x9C, 0x95,
399 0xCD, 0x21, 0x1C, 0xBF, 0x60, 0xA1, 0xBD, 0xDD
401 struct obstack hashes_pool;
402 struct obstack messages_pool;
405 obstack_init (&hashes_pool);
406 obstack_init (&messages_pool);
408 /* Prepare the hashes section and the messages section. */
409 for (j = 0; j < mlp->nitems; j++)
411 message_ty *mp = mlp->item[j];
413 /* No need to emit the header entry, it's not needed at runtime. */
416 char *msgctxt_as_iso_8859_1 =
417 conv_to_iso_8859_1 (mp->msgctxt != NULL ? mp->msgctxt : "");
418 char *msgid_as_iso_8859_1 = conv_to_iso_8859_1 (mp->msgid);
420 unsigned short *msgstr_as_utf16 =
421 conv_to_utf16 (mp->msgstr, &msgstr_len);
422 unsigned int hashcode = string_hashcode (msgid_as_iso_8859_1);
423 unsigned int offset = obstack_object_size (&messages_pool);
425 /* Add a record to the hashes section. */
426 append_u32 (&hashes_pool, hashcode);
427 append_u32 (&hashes_pool, offset);
429 /* Add a record to the messages section. */
431 append_u8 (&messages_pool, 0x03);
432 append_unicode_string (&messages_pool, msgstr_as_utf16, msgstr_len);
434 append_u8 (&messages_pool, 0x08);
435 append_base_string (&messages_pool, "");
437 append_u8 (&messages_pool, 0x06);
438 append_base_string (&messages_pool, msgid_as_iso_8859_1);
440 append_u8 (&messages_pool, 0x07);
441 append_base_string (&messages_pool, msgctxt_as_iso_8859_1);
443 append_u8 (&messages_pool, 0x05);
444 append_u32 (&messages_pool, hashcode);
446 append_u8 (&messages_pool, 0x01);
448 free (msgstr_as_utf16);
449 free (msgid_as_iso_8859_1);
450 free (msgctxt_as_iso_8859_1);
454 /* Sort the hashes section. */
456 size_t nstrings = obstack_object_size (&hashes_pool) / 8;
458 qsort (obstack_base (&hashes_pool), nstrings, 8, cmp_hashes);
461 /* Write the magic number. */
462 fwrite (magic, sizeof (magic), 1, output_file);
464 /* Write the hashes section. */
465 write_section (output_file, 0x42, obstack_base (&hashes_pool),
466 obstack_object_size (&hashes_pool));
468 /* Write the messages section. */
469 write_section (output_file, 0x69, obstack_base (&messages_pool),
470 obstack_object_size (&messages_pool));
472 /* Decide whether to write a contexts section. */
474 bool can_write_contexts = true;
476 for (j = 0; j < mlp->nitems; j++)
478 message_ty *mp = mlp->item[j];
481 if (mp->msgctxt == NULL || mp->msgctxt[0] == '\0'
482 || strlen (mp->msgctxt) > 255)
484 can_write_contexts = false;
489 if (can_write_contexts)
491 hash_table all_contexts;
493 unsigned long table_size;
495 /* Collect the contexts, removing duplicates. */
496 hash_init (&all_contexts, 10);
497 for (j = 0; j < mlp->nitems; j++)
499 message_ty *mp = mlp->item[j];
502 hash_insert_entry (&all_contexts,
503 mp->msgctxt, strlen (mp->msgctxt) + 1,
507 /* Compute the number of different contexts. */
508 num_contexts = all_contexts.size;
510 /* Compute a suitable hash table size. */
511 table_size = next_prime (num_contexts * 1.7);
512 if (table_size >= 0x10000)
515 /* Put the contexts into a hash table of size table_size. */
517 struct list_cell { const char *context; struct list_cell *next; };
518 struct list_cell *list_memory =
519 XNMALLOC (table_size, struct list_cell);
520 struct list_cell *freelist;
521 struct bucket { struct list_cell *head; struct list_cell **tail; };
522 struct bucket *buckets = XNMALLOC (table_size, struct bucket);
525 freelist = list_memory;
527 for (i = 0; i < table_size; i++)
529 buckets[i].head = NULL;
530 buckets[i].tail = &buckets[i].head;
540 while (hash_iterate (&all_contexts, &iter, &key, &keylen, &null)
543 const char *context = (const char *)key;
544 i = string_hashcode (context) % table_size;
545 freelist->context = context;
546 freelist->next = NULL;
547 *buckets[i].tail = freelist;
548 buckets[i].tail = &freelist->next;
553 /* Determine the total context pool size. */
558 for (i = 0; i < table_size; i++)
559 if (buckets[i].head != NULL)
561 const struct list_cell *p;
563 for (p = buckets[i].head; p != NULL; p = p->next)
564 pool_size += 1 + strlen (p->context);
566 if ((pool_size % 2) != 0)
569 if (pool_size <= 0x20000)
571 /* Prepare the contexts section. */
572 struct obstack contexts_pool;
575 obstack_init (&contexts_pool);
577 append_u16 (&contexts_pool, table_size);
579 for (i = 0; i < table_size; i++)
580 if (buckets[i].head != NULL)
582 const struct list_cell *p;
584 append_u16 (&contexts_pool, pool_offset / 2);
585 for (p = buckets[i].head; p != NULL; p = p->next)
586 pool_offset += 1 + strlen (p->context);
588 if ((pool_offset % 2) != 0)
592 append_u16 (&contexts_pool, 0);
593 if (!(pool_offset == pool_size))
596 append_u16 (&contexts_pool, 0);
598 for (i = 0; i < table_size; i++)
599 if (buckets[i].head != NULL)
601 const struct list_cell *p;
603 for (p = buckets[i].head; p != NULL; p = p->next)
605 append_u8 (&contexts_pool, strlen (p->context));
606 obstack_grow (&contexts_pool,
607 p->context, strlen (p->context));
608 pool_offset += 1 + strlen (p->context);
610 append_u8 (&contexts_pool, 0);
612 if ((pool_offset % 2) != 0)
614 append_u8 (&contexts_pool, 0);
618 if (!(pool_offset == pool_size))
621 if (!(obstack_object_size (&contexts_pool)
622 == 2 + 2 * table_size + pool_size))
625 /* Write the contexts section. */
626 write_section (output_file, 0x2f, obstack_base (&contexts_pool),
627 obstack_object_size (&contexts_pool));
629 obstack_free (&contexts_pool, NULL);
637 hash_destroy (&all_contexts);
641 obstack_free (&messages_pool, NULL);
642 obstack_free (&hashes_pool, NULL);
647 msgdomain_write_qt (message_list_ty *mlp, const char *canon_encoding,
648 const char *domain_name, const char *file_name)
652 /* If no entry for this domain don't even create the file. */
653 if (mlp->nitems != 0)
655 /* Determine whether mlp has plural entries. */
661 for (j = 0; j < mlp->nitems; j++)
662 if (mlp->item[j]->msgid_plural != NULL)
666 multiline_error (xstrdup (""),
668 message catalog has plural form translations\n\
669 but the Qt message catalog format doesn't support plural handling\n")));
674 /* Convert the messages to Unicode. */
675 iconv_message_list (mlp, canon_encoding, po_charset_utf8, NULL);
677 /* Determine whether mlp has non-ISO-8859-1 msgctxt entries. */
681 for (j = 0; j < mlp->nitems; j++)
683 const char *string = mlp->item[j]->msgctxt;
687 /* An UTF-8 encoded string fits in ISO-8859-1 if and only if
688 all its bytes are < 0xc4. */
689 for (; *string; string++)
690 if ((unsigned char) *string >= 0xc4)
692 multiline_error (xstrdup (""),
694 message catalog has msgctxt strings containing characters outside ISO-8859-1\n\
695 but the Qt message catalog format supports Unicode only in the translated\n\
696 strings, not in the context strings\n")));
703 /* Determine whether mlp has non-ISO-8859-1 msgid entries. */
707 for (j = 0; j < mlp->nitems; j++)
709 const char *string = mlp->item[j]->msgid;
711 /* An UTF-8 encoded string fits in ISO-8859-1 if and only if all
712 its bytes are < 0xc4. */
713 for (; *string; string++)
714 if ((unsigned char) *string >= 0xc4)
716 multiline_error (xstrdup (""),
718 message catalog has msgid strings containing characters outside ISO-8859-1\n\
719 but the Qt message catalog format supports Unicode only in the translated\n\
720 strings, not in the untranslated strings\n")));
726 if (strcmp (domain_name, "-") == 0)
728 output_file = stdout;
729 SET_BINARY (fileno (output_file));
733 output_file = fopen (file_name, "wb");
734 if (output_file == NULL)
736 error (0, errno, _("error while opening \"%s\" for writing"),
742 if (output_file != NULL)
744 write_qm (output_file, mlp);
746 /* Make sure nothing went wrong. */
747 if (fwriteerror (output_file))
748 error (EXIT_FAILURE, errno, _("error while writing \"%s\" file"),