2 Copyright (C) 1989-2018 Free Software Foundation, Inc.
3 Written by James Clark (jjc@jclark.com)
5 This file is part of groff.
7 groff is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 groff is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
28 void yyerror(const char *);
31 static const char *format_serial(char c, int n);
38 label_info(const string &);
41 label_info *lookup_label(const string &label);
45 // Does the tentative label depend on the reference?
46 CONTAINS_VARIABLE = 01,
51 virtual ~expression() { }
52 virtual void evaluate(int, const reference &, string &,
53 substring_position &) = 0;
54 virtual unsigned analyze() { return 0; }
57 class at_expr : public expression {
60 void evaluate(int, const reference &, string &, substring_position &);
61 unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
64 class format_expr : public expression {
69 format_expr(char c, int w = 0, int f = 1)
70 : type(c), width(w), first_number(f) { }
71 void evaluate(int, const reference &, string &, substring_position &);
72 unsigned analyze() { return CONTAINS_FORMAT; }
75 class field_expr : public expression {
79 field_expr(char nm, int num) : number(num), name(nm) { }
80 void evaluate(int, const reference &, string &, substring_position &);
81 unsigned analyze() { return CONTAINS_VARIABLE; }
84 class literal_expr : public expression {
87 literal_expr(const char *ptr, int len) : s(ptr, len) { }
88 void evaluate(int, const reference &, string &, substring_position &);
91 class unary_expr : public expression {
95 unary_expr(expression *e) : expr(e) { }
96 ~unary_expr() { delete expr; }
97 void evaluate(int, const reference &, string &, substring_position &) = 0;
98 unsigned analyze() { return expr ? expr->analyze() : 0; }
101 // This caches the analysis of an expression.
103 class analyzed_expr : public unary_expr {
106 analyzed_expr(expression *);
107 void evaluate(int, const reference &, string &, substring_position &);
108 unsigned analyze() { return flags; }
111 class star_expr : public unary_expr {
113 star_expr(expression *e) : unary_expr(e) { }
114 void evaluate(int, const reference &, string &, substring_position &);
116 return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
121 typedef void map_func(const char *, const char *, string &);
123 class map_expr : public unary_expr {
126 map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
127 void evaluate(int, const reference &, string &, substring_position &);
130 typedef const char *extractor_func(const char *, const char *, const char **);
132 class extractor_expr : public unary_expr {
134 extractor_func *func;
136 enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
137 extractor_expr(expression *e, extractor_func *f, int pt)
138 : unary_expr(e), part(pt), func(f) { }
139 void evaluate(int, const reference &, string &, substring_position &);
142 class truncate_expr : public unary_expr {
145 truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
146 void evaluate(int, const reference &, string &, substring_position &);
149 class separator_expr : public unary_expr {
151 separator_expr(expression *e) : unary_expr(e) { }
152 void evaluate(int, const reference &, string &, substring_position &);
155 class binary_expr : public expression {
160 binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
161 ~binary_expr() { delete expr1; delete expr2; }
162 void evaluate(int, const reference &, string &, substring_position &) = 0;
164 return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
168 class alternative_expr : public binary_expr {
170 alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
171 void evaluate(int, const reference &, string &, substring_position &);
174 class list_expr : public binary_expr {
176 list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
177 void evaluate(int, const reference &, string &, substring_position &);
180 class substitute_expr : public binary_expr {
182 substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
183 void evaluate(int, const reference &, string &, substring_position &);
186 class ternary_expr : public expression {
192 ternary_expr(expression *e1, expression *e2, expression *e3)
193 : expr1(e1), expr2(e2), expr3(e3) { }
194 ~ternary_expr() { delete expr1; delete expr2; delete expr3; }
195 void evaluate(int, const reference &, string &, substring_position &) = 0;
197 return ((expr1 ? expr1->analyze() : 0)
198 | (expr2 ? expr2->analyze() : 0)
199 | (expr3 ? expr3->analyze() : 0));
203 class conditional_expr : public ternary_expr {
205 conditional_expr(expression *e1, expression *e2, expression *e3)
206 : ternary_expr(e1, e2, e3) { }
207 void evaluate(int, const reference &, string &, substring_position &);
210 static expression *parsed_label = 0;
211 static expression *parsed_date_label = 0;
212 static expression *parsed_short_label = 0;
214 static expression *parse_result;
223 struct { int ndigits; int val; } dig;
224 struct { int start; int len; } str;
227 /* uppercase or lowercase letter */
228 %token <num> TOKEN_LETTER
229 /* literal characters */
230 %token <str> TOKEN_LITERAL
232 %token <num> TOKEN_DIGIT
234 %type <expr> conditional
235 %type <expr> alternative
238 %type <expr> substitute
239 %type <expr> optional_conditional
242 %type <num> optional_number
249 { parse_result = ($1 ? new analyzed_expr($1) : 0); }
255 | alternative '?' optional_conditional ':' conditional
256 { $$ = new conditional_expr($1, $3, $5); }
259 optional_conditional:
269 | alternative '|' list
270 { $$ = new alternative_expr($1, $3); }
271 | alternative '&' list
272 { $$ = new conditional_expr($1, $3, 0); }
279 { $$ = new list_expr($1, $2); }
285 | substitute '~' string
286 { $$ = new substitute_expr($1, $3); }
291 { $$ = new at_expr; }
294 $$ = new literal_expr(literals.contents() + $1.start,
298 { $$ = new field_expr($1, 0); }
299 | TOKEN_LETTER number
300 { $$ = new field_expr($1, $2 - 1); }
308 $$ = new format_expr($2);
311 command_error("unrecognized format '%1'", char($2));
312 $$ = new format_expr('a');
319 $$ = new format_expr('0', $2.ndigits, $2.val);
321 | string '.' flag TOKEN_LETTER optional_number
325 $$ = new map_expr($1, lowercase);
328 $$ = new map_expr($1, uppercase);
331 $$ = new map_expr($1, capitalize);
334 $$ = new map_expr($1, reverse_name);
337 $$ = new map_expr($1, abbreviate_name);
340 $$ = new extractor_expr($1, find_year, $3);
343 $$ = new extractor_expr($1, find_last_name, $3);
347 command_error("unknown function '%1'", char($4));
353 { $$ = new truncate_expr($1, $3); }
355 { $$ = new truncate_expr($1, -$3); }
357 { $$ = new star_expr($1); }
358 | '(' optional_conditional ')'
360 | '<' optional_conditional '>'
361 { $$ = new separator_expr($2); }
380 { $$.ndigits = 1; $$.val = $1; }
382 { $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
397 /* bison defines const to be empty unless __STDC__ is defined, which it
398 isn't under cfront */
404 const char *spec_ptr;
405 const char *spec_end;
406 const char *spec_cur;
408 static char uppercase_array[] = {
409 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
410 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
411 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
415 static char lowercase_array[] = {
416 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
417 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
418 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
424 while (spec_ptr < spec_end && csspace(*spec_ptr))
427 if (spec_ptr >= spec_end)
429 unsigned char c = *spec_ptr++;
435 yylval.num = c - '0';
439 yylval.str.start = literals.length();
440 for (; spec_ptr < spec_end; spec_ptr++) {
441 if (*spec_ptr == '\'') {
442 if (++spec_ptr < spec_end && *spec_ptr == '\'')
445 yylval.str.len = literals.length() - yylval.str.start;
446 return TOKEN_LITERAL;
450 literals += *spec_ptr;
452 yylval.str.len = literals.length() - yylval.str.start;
453 return TOKEN_LITERAL;
458 int set_label_spec(const char *label_spec)
460 spec_cur = spec_ptr = label_spec;
461 spec_end = strchr(label_spec, '\0');
466 parsed_label = parse_result;
470 int set_date_label_spec(const char *label_spec)
472 spec_cur = spec_ptr = label_spec;
473 spec_end = strchr(label_spec, '\0');
477 delete parsed_date_label;
478 parsed_date_label = parse_result;
482 int set_short_label_spec(const char *label_spec)
484 spec_cur = spec_ptr = label_spec;
485 spec_end = strchr(label_spec, '\0');
489 delete parsed_short_label;
490 parsed_short_label = parse_result;
494 void yyerror(const char *message)
496 if (spec_cur < spec_end)
497 command_error("label specification %1 before '%2'", message, spec_cur);
499 command_error("label specification %1 at end of string",
503 void at_expr::evaluate(int tentative, const reference &ref,
504 string &result, substring_position &)
507 ref.canonicalize_authors(result);
509 const char *end, *start = ref.get_authors(&end);
511 result.append(start, end - start);
515 void format_expr::evaluate(int tentative, const reference &ref,
516 string &result, substring_position &)
520 const label_info *lp = ref.get_label_ptr();
521 int num = lp == 0 ? ref.get_number() : lp->count;
523 result += format_serial(type, num + 1);
525 const char *ptr = i_to_a(num + first_number);
526 int pad = width - strlen(ptr);
533 static const char *format_serial(char c, int n)
536 static char buf[128]; // more than enough.
542 // troff uses z and w to represent 10000 and 5000 in Roman
543 // numerals; I can find no historical basis for this usage
544 const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
551 for (int i = 1000; i > 0; i /= 10, s += 2) {
598 // this is derived from troff/reg.c
605 *p++ = c == 'a' ? lowercase_array[d - 1] :
606 uppercase_array[d - 1];
626 void field_expr::evaluate(int, const reference &ref,
627 string &result, substring_position &)
630 const char *start = ref.get_field(name, &end);
632 start = nth_field(number, start, &end);
634 result.append(start, end - start);
638 void literal_expr::evaluate(int, const reference &,
639 string &result, substring_position &)
644 analyzed_expr::analyzed_expr(expression *e)
645 : unary_expr(e), flags(e ? e->analyze() : 0)
649 void analyzed_expr::evaluate(int tentative, const reference &ref,
650 string &result, substring_position &pos)
653 expr->evaluate(tentative, ref, result, pos);
656 void star_expr::evaluate(int tentative, const reference &ref,
657 string &result, substring_position &pos)
659 const label_info *lp = ref.get_label_ptr();
661 && (lp == 0 || lp->total > 1)
663 expr->evaluate(tentative, ref, result, pos);
666 void separator_expr::evaluate(int tentative, const reference &ref,
667 string &result, substring_position &pos)
669 int start_length = result.length();
670 int is_first = pos.start < 0;
672 expr->evaluate(tentative, ref, result, pos);
674 pos.start = start_length;
675 pos.length = result.length() - start_length;
679 void map_expr::evaluate(int tentative, const reference &ref,
680 string &result, substring_position &)
684 substring_position temp_pos;
685 expr->evaluate(tentative, ref, temp, temp_pos);
686 (*func)(temp.contents(), temp.contents() + temp.length(), result);
690 void extractor_expr::evaluate(int tentative, const reference &ref,
691 string &result, substring_position &)
695 substring_position temp_pos;
696 expr->evaluate(tentative, ref, temp, temp_pos);
697 const char *end, *start = (*func)(temp.contents(),
698 temp.contents() + temp.length(),
703 result.append(temp.contents(), start - temp.contents());
709 result.append(start, end - start);
713 result.append(end, temp.contents() + temp.length() - end);
721 static void first_part(int len, const char *ptr, const char *end,
725 const char *token_start = ptr;
726 if (!get_token(&ptr, end))
728 const token_info *ti = lookup_token(token_start, ptr);
729 int counts = ti->sortify_non_empty(token_start, ptr);
730 if (counts && --len < 0)
732 if (counts || ti->is_accent())
733 result.append(token_start, ptr - token_start);
737 static void last_part(int len, const char *ptr, const char *end,
740 const char *start = ptr;
743 const char *token_start = ptr;
744 if (!get_token(&ptr, end))
746 const token_info *ti = lookup_token(token_start, ptr);
747 if (ti->sortify_non_empty(token_start, ptr))
751 int skip = count - len;
754 const char *token_start = ptr;
755 if (!get_token(&ptr, end))
757 const token_info *ti = lookup_token(token_start, ptr);
758 if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
764 first_part(len, ptr, end, result);
767 void truncate_expr::evaluate(int tentative, const reference &ref,
768 string &result, substring_position &)
772 substring_position temp_pos;
773 expr->evaluate(tentative, ref, temp, temp_pos);
774 const char *start = temp.contents();
775 const char *end = start + temp.length();
777 first_part(n, start, end, result);
779 last_part(-n, start, end, result);
783 void alternative_expr::evaluate(int tentative, const reference &ref,
784 string &result, substring_position &pos)
786 int start_length = result.length();
788 expr1->evaluate(tentative, ref, result, pos);
789 if (result.length() == start_length && expr2)
790 expr2->evaluate(tentative, ref, result, pos);
793 void list_expr::evaluate(int tentative, const reference &ref,
794 string &result, substring_position &pos)
797 expr1->evaluate(tentative, ref, result, pos);
799 expr2->evaluate(tentative, ref, result, pos);
802 void substitute_expr::evaluate(int tentative, const reference &ref,
803 string &result, substring_position &pos)
805 int start_length = result.length();
807 expr1->evaluate(tentative, ref, result, pos);
808 if (result.length() > start_length && result[result.length() - 1] == '-') {
809 // ought to see if pos covers the -
810 result.set_length(result.length() - 1);
812 expr2->evaluate(tentative, ref, result, pos);
816 void conditional_expr::evaluate(int tentative, const reference &ref,
817 string &result, substring_position &pos)
820 substring_position temp_pos;
822 expr1->evaluate(tentative, ref, temp, temp_pos);
823 if (temp.length() > 0) {
825 expr2->evaluate(tentative, ref, result, pos);
829 expr3->evaluate(tentative, ref, result, pos);
833 void reference::pre_compute_label()
835 if (parsed_label != 0
836 && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
838 substring_position temp_pos;
839 parsed_label->evaluate(1, *this, label, temp_pos);
840 label_ptr = lookup_label(label);
844 void reference::compute_label()
848 parsed_label->evaluate(0, *this, label, separator_pos);
849 if (short_label_flag && parsed_short_label)
850 parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
853 if (parsed_date_label) {
854 substring_position temp_pos;
855 parsed_date_label->evaluate(0, *this, new_date, temp_pos);
860 label_ptr->count += 1;
863 void reference::immediate_compute_label()
866 label_ptr->total = 2; // force use of disambiguator
870 int reference::merge_labels(reference **v, int n, label_type type,
873 if (abbreviate_label_ranges)
874 return merge_labels_by_number(v, n, type, result);
876 return merge_labels_by_parts(v, n, type, result);
879 int reference::merge_labels_by_number(reference **v, int n, label_type type,
884 int num = get_number();
885 // Only merge three or more labels.
886 if (v[0]->get_number() != num + 1
887 || v[1]->get_number() != num + 2)
890 for (i = 2; i < n; i++)
891 if (v[i]->get_number() != num + i + 1)
893 result = get_label(type);
894 result += label_range_indicator;
895 result += v[i - 1]->get_label(type);
899 const substring_position &reference::get_separator_pos(label_type type) const
901 if (type == SHORT_LABEL && short_label_flag)
902 return short_separator_pos;
904 return separator_pos;
907 const string &reference::get_label(label_type type) const
909 if (type == SHORT_LABEL && short_label_flag)
915 int reference::merge_labels_by_parts(reference **v, int n, label_type type,
920 const string &lb = get_label(type);
921 const substring_position &sp = get_separator_pos(type);
923 || sp.start != v[0]->get_separator_pos(type).start
924 || memcmp(lb.contents(), v[0]->get_label(type).contents(),
930 result += separate_label_second_parts;
931 const substring_position &s = v[i]->get_separator_pos(type);
932 int sep_end_pos = s.start + s.length;
933 result.append(v[i]->get_label(type).contents() + sep_end_pos,
934 v[i]->get_label(type).length() - sep_end_pos);
936 && sp.start == v[i]->get_separator_pos(type).start
937 && memcmp(lb.contents(), v[i]->get_label(type).contents(),
944 label_info::label_info(const string &s)
945 : start(label_pool.length()), length(s.length()), count(0), total(1)
950 static label_info **label_table = 0;
951 static int label_table_size = 0;
952 static int label_table_used = 0;
954 label_info *lookup_label(const string &label)
956 if (label_table == 0) {
957 label_table = new label_info *[17];
958 label_table_size = 17;
959 for (int i = 0; i < 17; i++)
962 unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
964 for (ptr = label_table + h;
967 ? (ptr = label_table + label_table_size - 1)
969 if ((*ptr)->length == label.length()
970 && memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
971 label.length()) == 0) {
975 label_info *result = *ptr = new label_info(label);
976 if (++label_table_used * 2 > label_table_size) {
978 label_info **old_table = label_table;
979 int old_size = label_table_size;
980 label_table_size = next_size(label_table_size);
981 label_table = new label_info *[label_table_size];
983 for (i = 0; i < label_table_size; i++)
985 for (i = 0; i < old_size; i++)
987 h = hash_string(label_pool.contents() + old_table[i]->start,
988 old_table[i]->length);
990 for (p = label_table + (h % label_table_size);
993 ? (p = label_table + label_table_size - 1)
1005 for (int i = 0; i < label_table_size; i++) {
1006 delete label_table[i];
1009 label_table_used = 0;
1013 static void consider_authors(reference **start, reference **end, int i);
1015 void compute_labels(reference **v, int n)
1018 && (parsed_label->analyze() & expression::CONTAINS_AT)
1019 && sort_fields.length() >= 2
1020 && sort_fields[0] == 'A'
1021 && sort_fields[1] == '+')
1022 consider_authors(v, v + n, 0);
1023 for (int i = 0; i < n; i++)
1024 v[i]->compute_label();
1028 /* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
1029 where 0 <= i <= N if there exists a reference with a list of authors
1030 <B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
1031 and Aj = Bj for 0 <= j < i. In this case if we can't say "A0,
1032 A1,...,A(i-1) et al" because this would match both <A0,A1,...,AN> and
1033 <B0,B1,...,BM>. If a reference needs author i we only have to call
1034 need_author(j) for some j >= i such that the reference also needs
1037 /* This function handles 2 tasks:
1038 determine which authors are needed (cannot be elided with et al.);
1039 determine which authors can have only last names in the labels.
1041 References >= start and < end have the same first i author names.
1042 Also they're sorted by A+. */
1044 static void consider_authors(reference **start, reference **end, int i)
1048 reference **p = start;
1049 if (i >= (*p)->get_nauthors()) {
1050 for (++p; p < end && i >= (*p)->get_nauthors(); p++)
1052 if (p < end && i > 0) {
1053 // If we have an author list <A B C> and an author list <A B C D>,
1054 // then both lists need C.
1055 for (reference **q = start; q < end; q++)
1056 (*q)->need_author(i - 1);
1061 reference **last_name_start = p;
1062 reference **name_start = p;
1064 p < end && i < (*p)->get_nauthors()
1065 && same_author_last_name(**last_name_start, **p, i);
1067 if (!same_author_name(**name_start, **p, i)) {
1068 consider_authors(name_start, p, i + 1);
1072 consider_authors(name_start, p, i + 1);
1073 if (last_name_start == name_start) {
1074 for (reference **q = last_name_start; q < p; q++)
1075 (*q)->set_last_name_unambiguous(i);
1077 // If we have an author list <A B C D> and <A B C E>, then the lists
1078 // need author D and E respectively.
1079 if (name_start > start || p < end) {
1080 for (reference **q = last_name_start; q < p; q++)
1081 (*q)->need_author(i);
1086 int same_author_last_name(const reference &r1, const reference &r2, int n)
1089 const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
1091 const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
1092 if (!as1 && !as2) return 1; // they are the same
1093 if (!as1 || !as2) return 0;
1094 return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1097 int same_author_name(const reference &r1, const reference &r2, int n)
1100 const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
1102 const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
1103 if (!as1 && !as2) return 1; // they are the same
1104 if (!as1 || !as2) return 0;
1105 return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1109 void int_set::set(int i)
1113 if (bytei >= v.length()) {
1114 int old_length = v.length();
1115 v.set_length(bytei + 1);
1116 for (int j = old_length; j <= bytei; j++)
1119 v[bytei] |= 1 << (i & 7);
1122 int int_set::get(int i) const
1126 return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
1129 void reference::set_last_name_unambiguous(int i)
1131 last_name_unambiguous.set(i);
1134 void reference::need_author(int n)
1136 if (n > last_needed_author)
1137 last_needed_author = n;
1140 const char *reference::get_authors(const char **end) const
1142 if (!computed_authors) {
1143 ((reference *)this)->computed_authors = 1;
1144 string &result = ((reference *)this)->authors;
1145 int na = get_nauthors();
1147 for (int i = 0; i < na; i++) {
1148 if (last_name_unambiguous.get(i)) {
1149 const char *e, *start = get_author_last_name(i, &e);
1151 result.append(start, e - start);
1154 const char *e, *start = get_author(i, &e);
1156 result.append(start, e - start);
1158 if (i == last_needed_author
1159 && et_al.length() > 0
1160 && et_al_min_elide > 0
1161 && last_needed_author + et_al_min_elide < na
1162 && na >= et_al_min_total) {
1168 result += join_authors_exactly_two;
1169 else if (i < na - 2)
1170 result += join_authors_default;
1172 result += join_authors_last_two;
1176 const char *start = authors.contents();
1177 *end = start + authors.length();
1181 int reference::get_nauthors() const
1186 for (na = 0; get_author(na, &dummy) != 0; na++)
1188 ((reference *)this)->nauthors = na;