1 /* Message list concatenation and duplicate handling.
2 Copyright (C) 2001-2003, 2005-2008, 2012, 2015 Free Software
3 Foundation, Inc.
4 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
36 #include "xvasprintf.h"
38 #include "read-catalog.h"
39 #include "po-charset.h"
40 #include "msgl-ascii.h"
41 #include "msgl-equal.h"
42 #include "msgl-iconv.h"
49 #define _(str) gettext (str)
52 /* These variables control which messages are selected. */
/* NOTE(review): the declarations this comment introduces are not visible in
   this listing.  is_message_selected compares a message's use count against
   more_than and less_than, so those are presumably the variables meant
   here -- confirm against the complete file.  */
56 /* If true, use the first available translation.
57 If false, merge all available translations into one and fuzzy it. */
/* NOTE(review): this describes the use_first flag, which is read when the
   resulting messages are filled in catenate_msgdomain_list; its declaration
   is not visible in this listing.  */
60 /* If true, merge like msgcomm.
61 If false, merge like msgcat and msguniq. */
/* Read by catenate_msgdomain_list when it fills the resulting messages:
   for a message with more than one source, this flag selects the branch
   that copies one translation instead of collecting alternatives.  */
62 bool msgcomm_mode = false;
64 /* If true, omit the header entry.
65 If false, keep the header entry present in the input. */
/* Read by is_message_selected: a header entry is selected exactly when this
   flag is false.  */
66 bool omit_header = false;
/* Decide whether the merged message TMP is selected for the output.
   A header entry is selected unless omit_header is set.  Any other message
   is selected when the magnitude of its tmp->used counter lies strictly
   between more_than and less_than.
   NOTE(review): the return-type line and the braces of this definition are
   not visible in this listing; the returned expression is a truth value.  */
70 is_message_selected (const message_ty *tmp)
/* tmp->used can be negative (weak translations are counted as negative
   tmp->used in catenate_msgdomain_list), so work with its absolute value.  */
72 int used = (tmp->used >= 0 ? tmp->used : - tmp->used);
74 return (is_header (tmp)
75 ? !omit_header /* keep the header entry */
76 : (used > more_than && used < less_than));
/* Decide whether the input message MP is needed for the merge.  MP refers to
   its merged counterpart through mp->tmp, and the final decision is
   delegated to is_message_selected on that counterpart.
   NOTE(review): the line that opens the condition below and the branch
   keyword between the two return statements are missing from this listing,
   as are the return-type line and the braces.  Confirm the exact condition
   against the complete file.  */
81 is_message_needed (const message_ty *mp)
/* A translation counts as weak when it is a fuzzy non-header message or when
   its msgstr is empty.  The first operand of this && is not visible here.  */
84 && ((!is_header (mp) && mp->is_fuzzy) || mp->msgstr[0] == '\0'))
85 /* Weak translation. Needed if there are only weak translations. */
/* A negative mp->tmp->used presumably means that only weak translations were
   counted for this message -- verify against the counting loop in
   catenate_msgdomain_list.  */
86 return mp->tmp->used < 0 && is_message_selected (mp->tmp);
88 /* Good translation. */
89 return is_message_selected (mp->tmp);
93 /* The use_first logic. */
/* Variant of is_message_needed with a side effect on the merged message.
   It is one of the two predicates handed to message_list_remove_if_not in
   catenate_msgdomain_list.  When MP is needed and mp->tmp is still marked
   obsolete, the mark is cleared, so the same test fails for any later
   message sharing that mp->tmp.
   NOTE(review): the return statements, return-type line and braces are not
   visible in this listing; presumably true is returned only when the
   condition below holds -- confirm against the complete file.  */
95 is_message_first_needed (const message_ty *mp)
97 if (mp->tmp->obsolete && is_message_needed (mp))
/* tmp->obsolete starts out true when the merged message is created; clearing
   it here records that a message has claimed this entry.  */
99 mp->tmp->obsolete = false;
108 catenate_msgdomain_list (string_list_ty *file_list,
109 catalog_input_format_ty input_syntax,
112 const char * const *files = file_list->item;
113 size_t nfiles = file_list->nitems;
114 msgdomain_list_ty **mdlps;
115 const char ***canon_charsets;
116 const char ***identifications;
117 msgdomain_list_ty *total_mdlp;
118 const char *canon_to_code;
121 /* Read input files. */
122 mdlps = XNMALLOC (nfiles, msgdomain_list_ty *);
123 for (n = 0; n < nfiles; n++)
124 mdlps[n] = read_catalog_file (files[n], input_syntax);
126 /* Determine the canonical name of each input file's encoding. */
127 canon_charsets = XNMALLOC (nfiles, const char **);
128 for (n = 0; n < nfiles; n++)
130 msgdomain_list_ty *mdlp = mdlps[n];
133 canon_charsets[n] = XNMALLOC (mdlp->nitems, const char *);
134 for (k = 0; k < mdlp->nitems; k++)
136 message_list_ty *mlp = mdlp->item[k]->messages;
137 const char *canon_from_code = NULL;
141 for (j = 0; j < mlp->nitems; j++)
142 if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete)
144 const char *header = mlp->item[j]->msgstr;
148 const char *charsetstr = c_strstr (header, "charset=");
150 if (charsetstr != NULL)
154 const char *canon_charset;
156 charsetstr += strlen ("charset=");
157 len = strcspn (charsetstr, " \t\n");
158 charset = (char *) xmalloca (len + 1);
159 memcpy (charset, charsetstr, len);
162 canon_charset = po_charset_canonicalize (charset);
163 if (canon_charset == NULL)
165 /* Don't give an error for POT files, because
166 POT files usually contain only ASCII
168 const char *filename = files[n];
169 size_t filenamelen = strlen (filename);
172 && memcmp (filename + filenamelen - 4,
174 && strcmp (charset, "CHARSET") == 0)
175 canon_charset = po_charset_ascii;
177 error (EXIT_FAILURE, 0,
179 present charset \"%s\" is not a portable encoding name"),
185 if (canon_from_code == NULL)
186 canon_from_code = canon_charset;
187 else if (canon_from_code != canon_charset)
188 error (EXIT_FAILURE, 0,
190 two different charsets \"%s\" and \"%s\" in input file"),
191 canon_from_code, canon_charset);
195 if (canon_from_code == NULL)
197 if (is_ascii_message_list (mlp))
198 canon_from_code = po_charset_ascii;
199 else if (mdlp->encoding != NULL)
200 canon_from_code = mdlp->encoding;
204 error (EXIT_FAILURE, 0, _("\
205 input file '%s' doesn't contain a header entry with a charset specification"),
208 error (EXIT_FAILURE, 0, _("\
209 domain \"%s\" in input file '%s' doesn't contain a header entry with a charset specification"),
210 mdlp->item[k]->domain, files[n]);
214 canon_charsets[n][k] = canon_from_code;
218 /* Determine textual identifications of each file/domain combination. */
219 identifications = XNMALLOC (nfiles, const char **);
220 for (n = 0; n < nfiles; n++)
222 const char *filename = basename (files[n]);
223 msgdomain_list_ty *mdlp = mdlps[n];
226 identifications[n] = XNMALLOC (mdlp->nitems, const char *);
227 for (k = 0; k < mdlp->nitems; k++)
229 const char *domain = mdlp->item[k]->domain;
230 message_list_ty *mlp = mdlp->item[k]->messages;
231 char *project_id = NULL;
233 for (j = 0; j < mlp->nitems; j++)
234 if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete)
236 const char *header = mlp->item[j]->msgstr;
240 const char *cp = c_strstr (header, "Project-Id-Version:");
246 cp += sizeof ("Project-Id-Version:") - 1;
248 endp = strchr (cp, '\n');
250 endp = cp + strlen (cp);
252 while (cp < endp && *cp == ' ')
257 size_t len = endp - cp;
258 project_id = XNMALLOC (len + 1, char);
259 memcpy (project_id, cp, len);
260 project_id[len] = '\0';
267 identifications[n][k] =
269 ? (k > 0 ? xasprintf ("%s:%s (%s)", filename, domain, project_id)
270 : xasprintf ("%s (%s)", filename, project_id))
271 : (k > 0 ? xasprintf ("%s:%s", filename, domain)
272 : xasprintf ("%s", filename)));
276 /* Create list of resulting messages, but don't fill it. Only count
277 the number of translations for each message.
278 If for a message, there is at least one non-fuzzy, non-empty translation,
279 use only the non-fuzzy, non-empty translations. Otherwise use the
280 fuzzy or empty translations as well. */
281 total_mdlp = msgdomain_list_alloc (true);
282 for (n = 0; n < nfiles; n++)
284 msgdomain_list_ty *mdlp = mdlps[n];
287 for (k = 0; k < mdlp->nitems; k++)
289 const char *domain = mdlp->item[k]->domain;
290 message_list_ty *mlp = mdlp->item[k]->messages;
291 message_list_ty *total_mlp;
293 total_mlp = msgdomain_list_sublist (total_mdlp, domain, true);
295 for (j = 0; j < mlp->nitems; j++)
297 message_ty *mp = mlp->item[j];
301 tmp = message_list_search (total_mlp, mp->msgctxt, mp->msgid);
304 tmp = message_alloc (mp->msgctxt, mp->msgid, mp->msgid_plural,
306 tmp->is_fuzzy = true; /* may be set to false later */
307 for (i = 0; i < NFORMATS; i++)
308 tmp->is_format[i] = undecided; /* may be set to yes/no later */
309 tmp->range.min = - INT_MAX;
310 tmp->range.max = - INT_MAX;
311 tmp->do_wrap = yes; /* may be set to no later */
312 for (i = 0; i < NSYNTAXCHECKS; i++)
313 tmp->do_syntax_check[i] = undecided; /* may be set to yes/no later */
314 tmp->obsolete = true; /* may be set to false later */
315 tmp->alternative_count = 0;
316 tmp->alternative = NULL;
317 message_list_append (total_mlp, tmp);
321 && ((!is_header (mp) && mp->is_fuzzy)
322 || mp->msgstr[0] == '\0'))
323 /* Weak translation. Counted as negative tmp->used. */
329 /* Good translation. Counted as positive tmp->used. */
340 /* Remove messages that are not used and need not be converted. */
341 for (n = 0; n < nfiles; n++)
343 msgdomain_list_ty *mdlp = mdlps[n];
346 for (k = 0; k < mdlp->nitems; k++)
348 message_list_ty *mlp = mdlp->item[k]->messages;
350 message_list_remove_if_not (mlp,
352 ? is_message_first_needed
353 : is_message_needed);
355 /* If no messages are remaining, drop the charset. */
356 if (mlp->nitems == 0)
357 canon_charsets[n][k] = NULL;
363 for (k = 0; k < total_mdlp->nitems; k++)
365 message_list_ty *mlp = total_mdlp->item[k]->messages;
367 message_list_remove_if_not (mlp, is_message_selected);
371 /* Determine the common known a-priori encoding, if any. */
374 bool all_same_encoding = true;
376 for (n = 1; n < nfiles; n++)
377 if (mdlps[n]->encoding != mdlps[0]->encoding)
379 all_same_encoding = false;
383 if (all_same_encoding)
384 total_mdlp->encoding = mdlps[0]->encoding;
387 /* Determine the target encoding for the remaining messages. */
390 /* Canonicalize target encoding. */
391 canon_to_code = po_charset_canonicalize (to_code);
392 if (canon_to_code == NULL)
393 error (EXIT_FAILURE, 0,
394 _("target charset \"%s\" is not a portable encoding name."),
399 /* No target encoding was specified. Test whether the messages are
400 all in a single encoding. If so, conversion is not needed. */
401 const char *first = NULL;
402 const char *second = NULL;
403 bool with_ASCII = false;
404 bool with_UTF8 = false;
405 bool all_ASCII_compatible = true;
407 for (n = 0; n < nfiles; n++)
409 msgdomain_list_ty *mdlp = mdlps[n];
412 for (k = 0; k < mdlp->nitems; k++)
413 if (canon_charsets[n][k] != NULL)
415 if (canon_charsets[n][k] == po_charset_ascii)
420 first = canon_charsets[n][k];
421 else if (canon_charsets[n][k] != first && second == NULL)
422 second = canon_charsets[n][k];
424 if (strcmp (canon_charsets[n][k], "UTF-8") == 0)
427 if (!po_charset_ascii_compatible (canon_charsets[n][k]))
428 all_ASCII_compatible = false;
433 if (with_ASCII && !all_ASCII_compatible)
435 /* assert (first != NULL); */
437 second = po_charset_ascii;
442 /* A conversion is needed. Warn the user since he hasn't asked
443 for it and might be surprised. */
445 multiline_warning (xasprintf (_("warning: ")),
447 Input files contain messages in different encodings, UTF-8 among others.\n\
448 Converting the output to UTF-8.\n\
451 multiline_warning (xasprintf (_("warning: ")),
453 Input files contain messages in different encodings, %s and %s among others.\n\
454 Converting the output to UTF-8.\n\
455 To select a different output encoding, use the --to-code option.\n\
457 canon_to_code = po_charset_utf8;
459 else if (first != NULL && with_ASCII && all_ASCII_compatible)
461 /* The conversion is a no-op conversion. Don't warn the user,
462 but still perform the conversion, in order to check that the
463 input was really ASCII. */
464 canon_to_code = first;
468 /* No conversion needed. */
469 canon_to_code = NULL;
473 /* Now convert the remaining messages to to_code. */
474 if (canon_to_code != NULL)
475 for (n = 0; n < nfiles; n++)
477 msgdomain_list_ty *mdlp = mdlps[n];
480 for (k = 0; k < mdlp->nitems; k++)
481 if (canon_charsets[n][k] != NULL)
482 /* If the user hasn't given a to_code, don't bother doing a noop
483 conversion that would only replace the charset name in the
484 header entry with its canonical equivalent. */
485 if (!(to_code == NULL && canon_charsets[n][k] == canon_to_code))
486 if (iconv_message_list (mdlp->item[k]->messages,
487 canon_charsets[n][k], canon_to_code,
490 multiline_error (xstrdup (""),
492 Conversion of file %s from %s encoding to %s encoding\n\
493 changes some msgids or msgctxts.\n\
494 Either change all msgids and msgctxts to be pure ASCII, or ensure they are\n\
495 UTF-8 encoded from the beginning, i.e. already in your source code files.\n"),
496 files[n], canon_charsets[n][k],
502 /* Fill the resulting messages. */
503 for (n = 0; n < nfiles; n++)
505 msgdomain_list_ty *mdlp = mdlps[n];
508 for (k = 0; k < mdlp->nitems; k++)
510 message_list_ty *mlp = mdlp->item[k]->messages;
512 for (j = 0; j < mlp->nitems; j++)
514 message_ty *mp = mlp->item[j];
515 message_ty *tmp = mp->tmp;
518 /* No need to discard unneeded weak translations here;
519 they have already been filtered out above. */
520 if (use_first || tmp->used == 1 || tmp->used == -1)
522 /* Copy mp, as only message, into tmp. */
523 tmp->msgstr = mp->msgstr;
524 tmp->msgstr_len = mp->msgstr_len;
527 for (i = 0; i < mp->comment->nitems; i++)
528 message_comment_append (tmp, mp->comment->item[i]);
530 for (i = 0; i < mp->comment_dot->nitems; i++)
531 message_comment_dot_append (tmp,
532 mp->comment_dot->item[i]);
533 for (i = 0; i < mp->filepos_count; i++)
534 message_comment_filepos (tmp, mp->filepos[i].file_name,
535 mp->filepos[i].line_number);
536 tmp->is_fuzzy = mp->is_fuzzy;
537 for (i = 0; i < NFORMATS; i++)
538 tmp->is_format[i] = mp->is_format[i];
539 tmp->range = mp->range;
540 tmp->do_wrap = mp->do_wrap;
541 for (i = 0; i < NSYNTAXCHECKS; i++)
542 tmp->do_syntax_check[i] = mp->do_syntax_check[i];
543 tmp->prev_msgctxt = mp->prev_msgctxt;
544 tmp->prev_msgid = mp->prev_msgid;
545 tmp->prev_msgid_plural = mp->prev_msgid_plural;
546 tmp->obsolete = mp->obsolete;
548 else if (msgcomm_mode)
550 /* Copy mp, as only message, into tmp. */
551 if (tmp->msgstr == NULL)
553 tmp->msgstr = mp->msgstr;
554 tmp->msgstr_len = mp->msgstr_len;
556 tmp->is_fuzzy = mp->is_fuzzy;
557 tmp->prev_msgctxt = mp->prev_msgctxt;
558 tmp->prev_msgid = mp->prev_msgid;
559 tmp->prev_msgid_plural = mp->prev_msgid_plural;
561 if (mp->comment && tmp->comment == NULL)
562 for (i = 0; i < mp->comment->nitems; i++)
563 message_comment_append (tmp, mp->comment->item[i]);
564 if (mp->comment_dot && tmp->comment_dot == NULL)
565 for (i = 0; i < mp->comment_dot->nitems; i++)
566 message_comment_dot_append (tmp,
567 mp->comment_dot->item[i]);
568 for (i = 0; i < mp->filepos_count; i++)
569 message_comment_filepos (tmp, mp->filepos[i].file_name,
570 mp->filepos[i].line_number);
571 for (i = 0; i < NFORMATS; i++)
572 if (tmp->is_format[i] == undecided)
573 tmp->is_format[i] = mp->is_format[i];
574 if (tmp->range.min == - INT_MAX
575 && tmp->range.max == - INT_MAX)
576 tmp->range = mp->range;
577 else if (has_range_p (mp->range) && has_range_p (tmp->range))
579 if (mp->range.min < tmp->range.min)
580 tmp->range.min = mp->range.min;
581 if (mp->range.max > tmp->range.max)
582 tmp->range.max = mp->range.max;
589 if (tmp->do_wrap == undecided)
590 tmp->do_wrap = mp->do_wrap;
591 for (i = 0; i < NSYNTAXCHECKS; i++)
592 if (tmp->do_syntax_check[i] == undecided)
593 tmp->do_syntax_check[i] = mp->do_syntax_check[i];
594 tmp->obsolete = false;
598 /* Copy mp, among others, into tmp. */
599 char *id = xasprintf ("#-#-#-#-# %s #-#-#-#-#",
600 identifications[n][k]);
603 if (tmp->alternative_count == 0)
606 i = tmp->alternative_count;
607 nbytes = (i + 1) * sizeof (struct altstr);
608 tmp->alternative = xrealloc (tmp->alternative, nbytes);
609 tmp->alternative[i].msgstr = mp->msgstr;
610 tmp->alternative[i].msgstr_len = mp->msgstr_len;
611 tmp->alternative[i].msgstr_end =
612 tmp->alternative[i].msgstr + tmp->alternative[i].msgstr_len;
613 tmp->alternative[i].comment = mp->comment;
614 tmp->alternative[i].comment_dot = mp->comment_dot;
615 tmp->alternative[i].id = id;
616 tmp->alternative_count = i + 1;
618 for (i = 0; i < mp->filepos_count; i++)
619 message_comment_filepos (tmp, mp->filepos[i].file_name,
620 mp->filepos[i].line_number);
622 tmp->is_fuzzy = false;
623 for (i = 0; i < NFORMATS; i++)
624 if (mp->is_format[i] == yes)
625 tmp->is_format[i] = yes;
626 else if (mp->is_format[i] == no
627 && tmp->is_format[i] == undecided)
628 tmp->is_format[i] = no;
629 if (tmp->range.min == - INT_MAX
630 && tmp->range.max == - INT_MAX)
631 tmp->range = mp->range;
632 else if (has_range_p (mp->range) && has_range_p (tmp->range))
634 if (mp->range.min < tmp->range.min)
635 tmp->range.min = mp->range.min;
636 if (mp->range.max > tmp->range.max)
637 tmp->range.max = mp->range.max;
644 if (mp->do_wrap == no)
646 for (i = 0; i < NSYNTAXCHECKS; i++)
647 if (mp->do_syntax_check[i] == yes)
648 tmp->do_syntax_check[i] = yes;
649 else if (mp->do_syntax_check[i] == no
650 && tmp->do_syntax_check[i] == undecided)
651 tmp->do_syntax_check[i] = no;
652 /* Don't fill tmp->prev_msgid in this case. */
654 tmp->obsolete = false;
662 for (k = 0; k < total_mdlp->nitems; k++)
664 message_list_ty *mlp = total_mdlp->item[k]->messages;
666 for (j = 0; j < mlp->nitems; j++)
668 message_ty *tmp = mlp->item[j];
670 if (tmp->alternative_count > 0)
672 /* Test whether all alternative translations are equal. */
673 struct altstr *first = &tmp->alternative[0];
676 for (i = 0; i < tmp->alternative_count; i++)
677 if (!(tmp->alternative[i].msgstr_len == first->msgstr_len
678 && memcmp (tmp->alternative[i].msgstr, first->msgstr,
679 first->msgstr_len) == 0))
682 if (i == tmp->alternative_count)
684 /* All alternatives are equal. */
685 tmp->msgstr = first->msgstr;
686 tmp->msgstr_len = first->msgstr_len;
690 /* Concatenate the alternative msgstrs into a single one,
691 separated by markers. */
699 for (i = 0; i < tmp->alternative_count; i++)
701 size_t id_len = strlen (tmp->alternative[i].id);
703 len += tmp->alternative[i].msgstr_len;
705 p = tmp->alternative[i].msgstr;
706 p_end = tmp->alternative[i].msgstr_end;
707 for (; p < p_end; p += strlen (p) + 1)
711 new_msgstr = XNMALLOC (len, char);
715 /* Test whether there's one more plural form to
717 for (i = 0; i < tmp->alternative_count; i++)
718 if (tmp->alternative[i].msgstr
719 < tmp->alternative[i].msgstr_end)
721 if (i == tmp->alternative_count)
724 /* Process next plural form. */
725 for (i = 0; i < tmp->alternative_count; i++)
726 if (tmp->alternative[i].msgstr
727 < tmp->alternative[i].msgstr_end)
729 if (np > new_msgstr && np[-1] != '\0'
733 len = strlen (tmp->alternative[i].id);
734 memcpy (np, tmp->alternative[i].id, len);
738 len = strlen (tmp->alternative[i].msgstr);
739 memcpy (np, tmp->alternative[i].msgstr, len);
741 tmp->alternative[i].msgstr += len + 1;
744 /* Plural forms are separated by NUL bytes. */
747 tmp->msgstr = new_msgstr;
748 tmp->msgstr_len = np - new_msgstr;
750 tmp->is_fuzzy = true;
753 /* Test whether all alternative comments are equal. */
754 for (i = 0; i < tmp->alternative_count; i++)
755 if (tmp->alternative[i].comment == NULL
756 || !string_list_equal (tmp->alternative[i].comment,
760 if (i == tmp->alternative_count)
761 /* All alternatives are equal. */
762 tmp->comment = first->comment;
764 /* Concatenate the alternative comments into a single one,
765 separated by markers. */
766 for (i = 0; i < tmp->alternative_count; i++)
768 string_list_ty *slp = tmp->alternative[i].comment;
774 message_comment_append (tmp, tmp->alternative[i].id);
775 for (l = 0; l < slp->nitems; l++)
776 message_comment_append (tmp, slp->item[l]);
780 /* Test whether all alternative dot comments are equal. */
781 for (i = 0; i < tmp->alternative_count; i++)
782 if (tmp->alternative[i].comment_dot == NULL
783 || !string_list_equal (tmp->alternative[i].comment_dot,
787 if (i == tmp->alternative_count)
788 /* All alternatives are equal. */
789 tmp->comment_dot = first->comment_dot;
791 /* Concatenate the alternative dot comments into a single one,
792 separated by markers. */
793 for (i = 0; i < tmp->alternative_count; i++)
795 string_list_ty *slp = tmp->alternative[i].comment_dot;
801 message_comment_dot_append (tmp,
802 tmp->alternative[i].id);
803 for (l = 0; l < slp->nitems; l++)
804 message_comment_dot_append (tmp, slp->item[l]);