1 /* Message list concatenation and duplicate handling.
2 Copyright (C) 2001-2003, 2005-2008, 2012 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
35 #include "xvasprintf.h"
37 #include "read-catalog.h"
38 #include "po-charset.h"
39 #include "msgl-ascii.h"
40 #include "msgl-equal.h"
41 #include "msgl-iconv.h"
/* Shorthand: _(str) expands to gettext (str). */
48 #define _(str) gettext (str)
51 /* These variables control which messages are selected. */
55 /* If true, use the first available translation.
56 If false, merge all available translations into one and fuzzy it. */
/* NOTE(review): the definition this comment belongs to is not visible in
   this listing; code further down tests a flag named use_first, which is
   presumably the variable meant here -- confirm. */
59 /* If true, merge like msgcomm.
60 If false, merge like msgcat and msguniq. */
61 bool msgcomm_mode = false;
63 /* If true, omit the header entry.
64 If false, keep the header entry present in the input. */
/* Read by is_message_selected: a header entry is selected exactly when
   this flag is false. */
65 bool omit_header = false;
/* Tell whether the message TMP is selected.
   A header entry is selected exactly when omit_header is false.
   Any other message is selected when the magnitude of TMP->used lies
   strictly between more_than and less_than.
   NOTE(review): more_than and less_than are defined outside this listing,
   as are the return type and braces of this function. */
69 is_message_selected (const message_ty *tmp)
/* TMP->used may be negative (comments elsewhere in this file say weak
   translations are counted as negative values); compare its absolute
   value only. */
71 int used = (tmp->used >= 0 ? tmp->used : - tmp->used);
73 return (is_header (tmp)
74 ? !omit_header /* keep the header entry */
75 : (used > more_than && used < less_than));
/* Tell whether the input message MP is needed, based on the entry MP->tmp
   it points to.
   NOTE(review): the first line of the condition below is missing from this
   listing; only its continuation is visible, so the complete test for a
   "weak" translation cannot be confirmed from here. */
80 is_message_needed (const message_ty *mp)
/* Visible part of the weak-translation test: MP is a fuzzy non-header
   message, or its msgstr starts with a NUL byte (empty translation). */
83 && ((!is_header (mp) && mp->is_fuzzy) || mp->msgstr[0] == '\0'))
84 /* Weak translation. Needed if there are only weak translations. */
/* A negative MP->tmp->used is what this branch treats as the "only weak
   translations" case; the entry must additionally be selected. */
85 return mp->tmp->used < 0 && is_message_selected (mp->tmp);
87 /* Good translation. */
/* Needed exactly when the entry MP->tmp is selected. */
88 return is_message_selected (mp->tmp);
92 /* The use_first logic. */
/* Predicate on an input message MP, handed to message_list_remove_if_not
   further down in this file.
   When the entry MP->tmp is still flagged obsolete and MP is needed
   (see is_message_needed), the flag is cleared.
   NOTE(review): the return statements are not visible in this listing;
   presumably the function returns true only in that case, so that only the
   first needed message per entry passes -- confirm. */
94 is_message_first_needed (const message_ty *mp)
96 if (mp->tmp->obsolete && is_message_needed (mp))
/* Side effect: MP itself is const, but the entry it points to is modified.
   Once the flag is cleared, the condition above is false for any later
   message pointing to the same entry. */
98 mp->tmp->obsolete = false;
107 catenate_msgdomain_list (string_list_ty *file_list,
108 catalog_input_format_ty input_syntax,
111 const char * const *files = file_list->item;
112 size_t nfiles = file_list->nitems;
113 msgdomain_list_ty **mdlps;
114 const char ***canon_charsets;
115 const char ***identifications;
116 msgdomain_list_ty *total_mdlp;
117 const char *canon_to_code;
120 /* Read input files. */
121 mdlps = XNMALLOC (nfiles, msgdomain_list_ty *);
122 for (n = 0; n < nfiles; n++)
123 mdlps[n] = read_catalog_file (files[n], input_syntax);
125 /* Determine the canonical name of each input file's encoding. */
126 canon_charsets = XNMALLOC (nfiles, const char **);
127 for (n = 0; n < nfiles; n++)
129 msgdomain_list_ty *mdlp = mdlps[n];
132 canon_charsets[n] = XNMALLOC (mdlp->nitems, const char *);
133 for (k = 0; k < mdlp->nitems; k++)
135 message_list_ty *mlp = mdlp->item[k]->messages;
136 const char *canon_from_code = NULL;
140 for (j = 0; j < mlp->nitems; j++)
141 if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete)
143 const char *header = mlp->item[j]->msgstr;
147 const char *charsetstr = c_strstr (header, "charset=");
149 if (charsetstr != NULL)
153 const char *canon_charset;
155 charsetstr += strlen ("charset=");
156 len = strcspn (charsetstr, " \t\n");
157 charset = (char *) xmalloca (len + 1);
158 memcpy (charset, charsetstr, len);
161 canon_charset = po_charset_canonicalize (charset);
162 if (canon_charset == NULL)
164 /* Don't give an error for POT files, because
165 POT files usually contain only ASCII
167 const char *filename = files[n];
168 size_t filenamelen = strlen (filename);
171 && memcmp (filename + filenamelen - 4,
173 && strcmp (charset, "CHARSET") == 0)
174 canon_charset = po_charset_ascii;
176 error (EXIT_FAILURE, 0,
178 present charset \"%s\" is not a portable encoding name"),
184 if (canon_from_code == NULL)
185 canon_from_code = canon_charset;
186 else if (canon_from_code != canon_charset)
187 error (EXIT_FAILURE, 0,
189 two different charsets \"%s\" and \"%s\" in input file"),
190 canon_from_code, canon_charset);
194 if (canon_from_code == NULL)
196 if (is_ascii_message_list (mlp))
197 canon_from_code = po_charset_ascii;
198 else if (mdlp->encoding != NULL)
199 canon_from_code = mdlp->encoding;
203 error (EXIT_FAILURE, 0, _("\
204 input file '%s' doesn't contain a header entry with a charset specification"),
207 error (EXIT_FAILURE, 0, _("\
208 domain \"%s\" in input file '%s' doesn't contain a header entry with a charset specification"),
209 mdlp->item[k]->domain, files[n]);
213 canon_charsets[n][k] = canon_from_code;
217 /* Determine textual identifications of each file/domain combination. */
218 identifications = XNMALLOC (nfiles, const char **);
219 for (n = 0; n < nfiles; n++)
221 const char *filename = basename (files[n]);
222 msgdomain_list_ty *mdlp = mdlps[n];
225 identifications[n] = XNMALLOC (mdlp->nitems, const char *);
226 for (k = 0; k < mdlp->nitems; k++)
228 const char *domain = mdlp->item[k]->domain;
229 message_list_ty *mlp = mdlp->item[k]->messages;
230 char *project_id = NULL;
232 for (j = 0; j < mlp->nitems; j++)
233 if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete)
235 const char *header = mlp->item[j]->msgstr;
239 const char *cp = c_strstr (header, "Project-Id-Version:");
245 cp += sizeof ("Project-Id-Version:") - 1;
247 endp = strchr (cp, '\n');
249 endp = cp + strlen (cp);
251 while (cp < endp && *cp == ' ')
256 size_t len = endp - cp;
257 project_id = XNMALLOC (len + 1, char);
258 memcpy (project_id, cp, len);
259 project_id[len] = '\0';
266 identifications[n][k] =
268 ? (k > 0 ? xasprintf ("%s:%s (%s)", filename, domain, project_id)
269 : xasprintf ("%s (%s)", filename, project_id))
270 : (k > 0 ? xasprintf ("%s:%s", filename, domain)
271 : xasprintf ("%s", filename)));
275 /* Create list of resulting messages, but don't fill it. Only count
276 the number of translations for each message.
277 If for a message, there is at least one non-fuzzy, non-empty translation,
278 use only the non-fuzzy, non-empty translations. Otherwise use the
279 fuzzy or empty translations as well. */
280 total_mdlp = msgdomain_list_alloc (true);
281 for (n = 0; n < nfiles; n++)
283 msgdomain_list_ty *mdlp = mdlps[n];
286 for (k = 0; k < mdlp->nitems; k++)
288 const char *domain = mdlp->item[k]->domain;
289 message_list_ty *mlp = mdlp->item[k]->messages;
290 message_list_ty *total_mlp;
292 total_mlp = msgdomain_list_sublist (total_mdlp, domain, true);
294 for (j = 0; j < mlp->nitems; j++)
296 message_ty *mp = mlp->item[j];
300 tmp = message_list_search (total_mlp, mp->msgctxt, mp->msgid);
303 tmp = message_alloc (mp->msgctxt, mp->msgid, mp->msgid_plural,
305 tmp->is_fuzzy = true; /* may be set to false later */
306 for (i = 0; i < NFORMATS; i++)
307 tmp->is_format[i] = undecided; /* may be set to yes/no later */
308 tmp->range.min = - INT_MAX;
309 tmp->range.max = - INT_MAX;
310 tmp->do_wrap = yes; /* may be set to no later */
311 tmp->obsolete = true; /* may be set to false later */
312 tmp->alternative_count = 0;
313 tmp->alternative = NULL;
314 message_list_append (total_mlp, tmp);
318 && ((!is_header (mp) && mp->is_fuzzy)
319 || mp->msgstr[0] == '\0'))
320 /* Weak translation. Counted as negative tmp->used. */
326 /* Good translation. Counted as positive tmp->used. */
337 /* Remove messages that are not used and need not be converted. */
338 for (n = 0; n < nfiles; n++)
340 msgdomain_list_ty *mdlp = mdlps[n];
343 for (k = 0; k < mdlp->nitems; k++)
345 message_list_ty *mlp = mdlp->item[k]->messages;
347 message_list_remove_if_not (mlp,
349 ? is_message_first_needed
350 : is_message_needed);
352 /* If no messages are remaining, drop the charset. */
353 if (mlp->nitems == 0)
354 canon_charsets[n][k] = NULL;
360 for (k = 0; k < total_mdlp->nitems; k++)
362 message_list_ty *mlp = total_mdlp->item[k]->messages;
364 message_list_remove_if_not (mlp, is_message_selected);
368 /* Determine the common known a-priori encoding, if any. */
371 bool all_same_encoding = true;
373 for (n = 1; n < nfiles; n++)
374 if (mdlps[n]->encoding != mdlps[0]->encoding)
376 all_same_encoding = false;
380 if (all_same_encoding)
381 total_mdlp->encoding = mdlps[0]->encoding;
384 /* Determine the target encoding for the remaining messages. */
387 /* Canonicalize target encoding. */
388 canon_to_code = po_charset_canonicalize (to_code);
389 if (canon_to_code == NULL)
390 error (EXIT_FAILURE, 0,
391 _("target charset \"%s\" is not a portable encoding name."),
396 /* No target encoding was specified. Test whether the messages are
397 all in a single encoding. If so, conversion is not needed. */
398 const char *first = NULL;
399 const char *second = NULL;
400 bool with_ASCII = false;
401 bool with_UTF8 = false;
402 bool all_ASCII_compatible = true;
404 for (n = 0; n < nfiles; n++)
406 msgdomain_list_ty *mdlp = mdlps[n];
409 for (k = 0; k < mdlp->nitems; k++)
410 if (canon_charsets[n][k] != NULL)
412 if (canon_charsets[n][k] == po_charset_ascii)
417 first = canon_charsets[n][k];
418 else if (canon_charsets[n][k] != first && second == NULL)
419 second = canon_charsets[n][k];
421 if (strcmp (canon_charsets[n][k], "UTF-8") == 0)
424 if (!po_charset_ascii_compatible (canon_charsets[n][k]))
425 all_ASCII_compatible = false;
430 if (with_ASCII && !all_ASCII_compatible)
432 /* assert (first != NULL); */
434 second = po_charset_ascii;
439 /* A conversion is needed. Warn the user since he hasn't asked
440 for it and might be surprised. */
442 multiline_warning (xasprintf (_("warning: ")),
444 Input files contain messages in different encodings, UTF-8 among others.\n\
445 Converting the output to UTF-8.\n\
448 multiline_warning (xasprintf (_("warning: ")),
450 Input files contain messages in different encodings, %s and %s among others.\n\
451 Converting the output to UTF-8.\n\
452 To select a different output encoding, use the --to-code option.\n\
454 canon_to_code = po_charset_utf8;
456 else if (first != NULL && with_ASCII && all_ASCII_compatible)
458 /* The conversion is a no-op conversion. Don't warn the user,
459 but still perform the conversion, in order to check that the
460 input was really ASCII. */
461 canon_to_code = first;
465 /* No conversion needed. */
466 canon_to_code = NULL;
470 /* Now convert the remaining messages to to_code. */
471 if (canon_to_code != NULL)
472 for (n = 0; n < nfiles; n++)
474 msgdomain_list_ty *mdlp = mdlps[n];
477 for (k = 0; k < mdlp->nitems; k++)
478 if (canon_charsets[n][k] != NULL)
479 /* If the user hasn't given a to_code, don't bother doing a noop
480 conversion that would only replace the charset name in the
481 header entry with its canonical equivalent. */
482 if (!(to_code == NULL && canon_charsets[n][k] == canon_to_code))
483 if (iconv_message_list (mdlp->item[k]->messages,
484 canon_charsets[n][k], canon_to_code,
487 multiline_error (xstrdup (""),
489 Conversion of file %s from %s encoding to %s encoding\n\
490 changes some msgids or msgctxts.\n\
491 Either change all msgids and msgctxts to be pure ASCII, or ensure they are\n\
492 UTF-8 encoded from the beginning, i.e. already in your source code files.\n"),
493 files[n], canon_charsets[n][k],
499 /* Fill the resulting messages. */
500 for (n = 0; n < nfiles; n++)
502 msgdomain_list_ty *mdlp = mdlps[n];
505 for (k = 0; k < mdlp->nitems; k++)
507 message_list_ty *mlp = mdlp->item[k]->messages;
509 for (j = 0; j < mlp->nitems; j++)
511 message_ty *mp = mlp->item[j];
512 message_ty *tmp = mp->tmp;
515 /* No need to discard unneeded weak translations here;
516 they have already been filtered out above. */
517 if (use_first || tmp->used == 1 || tmp->used == -1)
519 /* Copy mp, as only message, into tmp. */
520 tmp->msgstr = mp->msgstr;
521 tmp->msgstr_len = mp->msgstr_len;
524 for (i = 0; i < mp->comment->nitems; i++)
525 message_comment_append (tmp, mp->comment->item[i]);
527 for (i = 0; i < mp->comment_dot->nitems; i++)
528 message_comment_dot_append (tmp,
529 mp->comment_dot->item[i]);
530 for (i = 0; i < mp->filepos_count; i++)
531 message_comment_filepos (tmp, mp->filepos[i].file_name,
532 mp->filepos[i].line_number);
533 tmp->is_fuzzy = mp->is_fuzzy;
534 for (i = 0; i < NFORMATS; i++)
535 tmp->is_format[i] = mp->is_format[i];
536 tmp->range = mp->range;
537 tmp->do_wrap = mp->do_wrap;
538 tmp->prev_msgctxt = mp->prev_msgctxt;
539 tmp->prev_msgid = mp->prev_msgid;
540 tmp->prev_msgid_plural = mp->prev_msgid_plural;
541 tmp->obsolete = mp->obsolete;
543 else if (msgcomm_mode)
545 /* Copy mp, as only message, into tmp. */
546 if (tmp->msgstr == NULL)
548 tmp->msgstr = mp->msgstr;
549 tmp->msgstr_len = mp->msgstr_len;
551 tmp->is_fuzzy = mp->is_fuzzy;
552 tmp->prev_msgctxt = mp->prev_msgctxt;
553 tmp->prev_msgid = mp->prev_msgid;
554 tmp->prev_msgid_plural = mp->prev_msgid_plural;
556 if (mp->comment && tmp->comment == NULL)
557 for (i = 0; i < mp->comment->nitems; i++)
558 message_comment_append (tmp, mp->comment->item[i]);
559 if (mp->comment_dot && tmp->comment_dot == NULL)
560 for (i = 0; i < mp->comment_dot->nitems; i++)
561 message_comment_dot_append (tmp,
562 mp->comment_dot->item[i]);
563 for (i = 0; i < mp->filepos_count; i++)
564 message_comment_filepos (tmp, mp->filepos[i].file_name,
565 mp->filepos[i].line_number);
566 for (i = 0; i < NFORMATS; i++)
567 if (tmp->is_format[i] == undecided)
568 tmp->is_format[i] = mp->is_format[i];
569 if (tmp->range.min == - INT_MAX
570 && tmp->range.max == - INT_MAX)
571 tmp->range = mp->range;
572 else if (has_range_p (mp->range) && has_range_p (tmp->range))
574 if (mp->range.min < tmp->range.min)
575 tmp->range.min = mp->range.min;
576 if (mp->range.max > tmp->range.max)
577 tmp->range.max = mp->range.max;
584 if (tmp->do_wrap == undecided)
585 tmp->do_wrap = mp->do_wrap;
586 tmp->obsolete = false;
590 /* Copy mp, among others, into tmp. */
591 char *id = xasprintf ("#-#-#-#-# %s #-#-#-#-#",
592 identifications[n][k]);
595 if (tmp->alternative_count == 0)
598 i = tmp->alternative_count;
599 nbytes = (i + 1) * sizeof (struct altstr);
600 tmp->alternative = xrealloc (tmp->alternative, nbytes);
601 tmp->alternative[i].msgstr = mp->msgstr;
602 tmp->alternative[i].msgstr_len = mp->msgstr_len;
603 tmp->alternative[i].msgstr_end =
604 tmp->alternative[i].msgstr + tmp->alternative[i].msgstr_len;
605 tmp->alternative[i].comment = mp->comment;
606 tmp->alternative[i].comment_dot = mp->comment_dot;
607 tmp->alternative[i].id = id;
608 tmp->alternative_count = i + 1;
610 for (i = 0; i < mp->filepos_count; i++)
611 message_comment_filepos (tmp, mp->filepos[i].file_name,
612 mp->filepos[i].line_number);
614 tmp->is_fuzzy = false;
615 for (i = 0; i < NFORMATS; i++)
616 if (mp->is_format[i] == yes)
617 tmp->is_format[i] = yes;
618 else if (mp->is_format[i] == no
619 && tmp->is_format[i] == undecided)
620 tmp->is_format[i] = no;
621 if (tmp->range.min == - INT_MAX
622 && tmp->range.max == - INT_MAX)
623 tmp->range = mp->range;
624 else if (has_range_p (mp->range) && has_range_p (tmp->range))
626 if (mp->range.min < tmp->range.min)
627 tmp->range.min = mp->range.min;
628 if (mp->range.max > tmp->range.max)
629 tmp->range.max = mp->range.max;
636 if (mp->do_wrap == no)
638 /* Don't fill tmp->prev_msgid in this case. */
640 tmp->obsolete = false;
648 for (k = 0; k < total_mdlp->nitems; k++)
650 message_list_ty *mlp = total_mdlp->item[k]->messages;
652 for (j = 0; j < mlp->nitems; j++)
654 message_ty *tmp = mlp->item[j];
656 if (tmp->alternative_count > 0)
658 /* Test whether all alternative translations are equal. */
659 struct altstr *first = &tmp->alternative[0];
662 for (i = 0; i < tmp->alternative_count; i++)
663 if (!(tmp->alternative[i].msgstr_len == first->msgstr_len
664 && memcmp (tmp->alternative[i].msgstr, first->msgstr,
665 first->msgstr_len) == 0))
668 if (i == tmp->alternative_count)
670 /* All alternatives are equal. */
671 tmp->msgstr = first->msgstr;
672 tmp->msgstr_len = first->msgstr_len;
676 /* Concatenate the alternative msgstrs into a single one,
677 separated by markers. */
685 for (i = 0; i < tmp->alternative_count; i++)
687 size_t id_len = strlen (tmp->alternative[i].id);
689 len += tmp->alternative[i].msgstr_len;
691 p = tmp->alternative[i].msgstr;
692 p_end = tmp->alternative[i].msgstr_end;
693 for (; p < p_end; p += strlen (p) + 1)
697 new_msgstr = XNMALLOC (len, char);
701 /* Test whether there's one more plural form to
703 for (i = 0; i < tmp->alternative_count; i++)
704 if (tmp->alternative[i].msgstr
705 < tmp->alternative[i].msgstr_end)
707 if (i == tmp->alternative_count)
710 /* Process next plural form. */
711 for (i = 0; i < tmp->alternative_count; i++)
712 if (tmp->alternative[i].msgstr
713 < tmp->alternative[i].msgstr_end)
715 if (np > new_msgstr && np[-1] != '\0'
719 len = strlen (tmp->alternative[i].id);
720 memcpy (np, tmp->alternative[i].id, len);
724 len = strlen (tmp->alternative[i].msgstr);
725 memcpy (np, tmp->alternative[i].msgstr, len);
727 tmp->alternative[i].msgstr += len + 1;
730 /* Plural forms are separated by NUL bytes. */
733 tmp->msgstr = new_msgstr;
734 tmp->msgstr_len = np - new_msgstr;
736 tmp->is_fuzzy = true;
739 /* Test whether all alternative comments are equal. */
740 for (i = 0; i < tmp->alternative_count; i++)
741 if (tmp->alternative[i].comment == NULL
742 || !string_list_equal (tmp->alternative[i].comment,
746 if (i == tmp->alternative_count)
747 /* All alternatives are equal. */
748 tmp->comment = first->comment;
750 /* Concatenate the alternative comments into a single one,
751 separated by markers. */
752 for (i = 0; i < tmp->alternative_count; i++)
754 string_list_ty *slp = tmp->alternative[i].comment;
760 message_comment_append (tmp, tmp->alternative[i].id);
761 for (l = 0; l < slp->nitems; l++)
762 message_comment_append (tmp, slp->item[l]);
766 /* Test whether all alternative dot comments are equal. */
767 for (i = 0; i < tmp->alternative_count; i++)
768 if (tmp->alternative[i].comment_dot == NULL
769 || !string_list_equal (tmp->alternative[i].comment_dot,
773 if (i == tmp->alternative_count)
774 /* All alternatives are equal. */
775 tmp->comment_dot = first->comment_dot;
777 /* Concatenate the alternative dot comments into a single one,
778 separated by markers. */
779 for (i = 0; i < tmp->alternative_count; i++)
781 string_list_ty *slp = tmp->alternative[i].comment_dot;
787 message_comment_dot_append (tmp,
788 tmp->alternative[i].id);
789 for (l = 0; l < slp->nitems; l++)
790 message_comment_dot_append (tmp, slp->item[l]);