2 /* Copyright (C) 1989-2014 Free Software Foundation, Inc.
3 Written by James Clark (jjc@jclark.com)
5 This file is part of groff.
7 groff is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 groff is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
// Forward declarations of file-local helpers defined further down.
25 static const char *find_day(const char *, const char *, const char **);
26 static int find_month(const char *start, const char *end);
27 static void abbreviate_names(string &);
// Default article list: the words are separated by NUL bytes.
29 #define DEFAULT_ARTICLES "the\000a\000an"
// The length is taken with sizeof, so the literal's terminating NUL is
// included; sortify_title() walks this list one NUL-terminated word at a time.
31 string articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES));
33 // Multiple occurrences of fields are separated by FIELD_SEPARATOR.
34 const char FIELD_SEPARATOR = '\0';
// Field letters that are checked with strchr() before a field is treated
// as holding several FIELD_SEPARATOR-delimited entries.
36 const char MULTI_FIELD_NAMES[] = "AE";
// Field letters searched, in this order, when an author is wanted.
37 const char *AUTHOR_FIELDS = "AQ";
// Reference classifications; the values index reference_types[] in output().
39 enum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM };
41 const char *reference_types[] = {
// Scratch storage with one slot per possible field letter (unsigned char),
// used while a reference is being built or merged.
50 static string temp_fields[256];
// Construct a reference from the len bytes of raw text at start.
// The text is scanned for lines introduced by '%'; the byte after the '%'
// selects the slot of temp_fields that receives the field text.  Once
// the scan is done the non-empty slots are moved into a freshly
// allocated field array.
// NOTE(review): several statements of this constructor are not visible
// here (including every use of ridp); the comments below describe only
// the visible lines.
52 reference::reference(const char *start, int len, reference_id *ridp)
53 : h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0),
54 computed_authors(0), last_needed_author(-1), nauthors(-1)
// Start with every field letter marked as absent.
57 for (i = 0; i < 256; i++)
58 field_index[i] = NULL_FIELD_INDEX;
65 const char *end = start + len;
66 const char *ptr = start;
// Accepted here: a line whose second byte is the annotation field letter
// (and is not '%'), or a "%%X" line whose letter X is not listed in
// discard_fields.
69 if (ptr + 1 < end && ptr[1] != '\0'
70 && ((ptr[1] != '%' && ptr[1] == annotation_field)
71 || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0'
72 && discard_fields.search(ptr[2]) < 0))) {
75 string &f = temp_fields[(unsigned char)ptr[1]];
77 while (ptr < end && csspace(*ptr))
89 if (ptr >= end || *ptr == '%')
// Ordinary field line: the letter must not be NUL, '%' or a discarded field.
93 else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%'
94 && discard_fields.search(ptr[1]) < 0) {
95 string &f = temp_fields[(unsigned char)ptr[1]];
// Letters in MULTI_FIELD_NAMES get special handling (the action itself is
// not visible here; presumably repeated fields are joined with
// FIELD_SEPARATOR -- TODO confirm).
97 if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0)
108 while (ptr < end && *ptr != '\n')
110 // strip trailing white space
112 while (q > p && q[-1] != '\n' && csspace(q[-1]))
// Skip to just past the next newline.
130 while (ptr < end && *ptr++ != '\n')
132 if (ptr >= end || *ptr == '%')
// First pass over the scratch slots: look at which ones are non-empty.
137 for (i = 0; i < 256; i++)
138 if (temp_fields[i].length() > 0)
140 field = new string[nfields];
// Second pass: move each non-empty slot into the field array and
// abbreviate the names in fields listed in abbreviate_fields.
142 for (i = 0; i < 256; i++)
143 if (temp_fields[i].length() > 0) {
144 field[j].move(temp_fields[i]);
145 if (abbreviate_fields.search(i) >= 0)
146 abbreviate_names(field[j]);
// Destroy the reference, releasing the field array allocated with
// new string[].
152 reference::~reference()
155 ad_delete(nfields) field;
// Merge the fields of ref into this reference.
// Fields of this reference are moved into temp_fields first, then the
// fields of ref are moved over them, so a field present in both ends up
// with ref's contents.  Both references give up their field strings.
158 // ref is the inline, this is the database ref
160 void reference::merge(reference &ref)
163 for (i = 0; i < 256; i++)
164 if (field_index[i] != NULL_FIELD_INDEX)
165 temp_fields[i].move(field[field_index[i]]);
166 for (i = 0; i < 256; i++)
167 if (ref.field_index[i] != NULL_FIELD_INDEX)
168 temp_fields[i].move(ref.field[ref.field_index[i]]);
// Forget the old layout; it is rebuilt from temp_fields below.
169 for (i = 0; i < 256; i++)
170 field_index[i] = NULL_FIELD_INDEX;
171 int old_nfields = nfields;
173 for (i = 0; i < 256; i++)
174 if (temp_fields[i].length() > 0)
// Reallocate the field array only when the number of fields changed.
176 if (nfields != old_nfields) {
178 ad_delete(old_nfields) field;
179 field = new string[nfields];
182 for (i = 0; i < 256; i++)
183 if (temp_fields[i].length() > 0) {
184 field[j].move(temp_fields[i]);
// Set field c to the contents of s; s must be non-empty and is moved
// from.  An existing field is overwritten in place.  Otherwise a field
// array one element larger is allocated and the old elements are moved
// across, leaving slot pos free for the new field.
191 void reference::insert_field(unsigned char c, string &s)
193 assert(s.length() > 0);
194 if (field_index[c] != NULL_FIELD_INDEX) {
195 field[field_index[c]].move(s);
198 assert(field_index[c] == NULL_FIELD_INDEX);
199 string *old_field = field;
200 field = new string[nfields + 1];
// Scan the field letters below c that are present (the loop body is not
// visible here; presumably this is how pos is computed -- TODO confirm).
203 for (i = 0; i < int(c); i++)
204 if (field_index[i] != NULL_FIELD_INDEX)
// Elements before pos keep their position ...
206 for (i = 0; i < pos; i++)
207 field[i].move(old_field[i]);
// ... and the rest shift up by one.
209 for (i = pos; i < nfields; i++)
210 field[i + 1].move(old_field[i]);
212 ad_delete(nfields) old_field;
214 field_index[c] = pos;
// Indexes of the present fields above c need adjusting (action not
// visible here).
215 for (i = c + 1; i < 256; i++)
216 if (field_index[i] != NULL_FIELD_INDEX)
// Remove field c, if present.  A field array one element smaller is
// allocated and every field except the removed one is moved into it.
220 void reference::delete_field(unsigned char c)
222 if (field_index[c] == NULL_FIELD_INDEX)
224 string *old_field = field;
225 field = new string[nfields - 1];
// Fields stored before the removed one keep their position ...
227 for (i = 0; i < int(field_index[c]); i++)
228 field[i].move(old_field[i]);
// ... and the ones after it shift down by one.
229 for (i = field_index[c]; i < nfields - 1; i++)
230 field[i].move(old_field[i + 1]);
232 ad_delete(nfields) old_field;
234 field_index[c] = NULL_FIELD_INDEX;
// Indexes of the present fields above c need adjusting (action not
// visible here).
235 for (i = c + 1; i < 256; i++)
236 if (field_index[i] != NULL_FIELD_INDEX)
// Update the hash code h from the field contents: the hash_string()
// value of every non-empty field is XORed into h.
// NOTE(review): other statements of this function are not visible here.
240 void reference::compute_hash_code()
246 for (int i = 0; i < nfields; i++)
247 if (field[i].length() > 0) {
249 h ^= hash_string(field[i].contents(), field[i].length());
// NOTE(review): only the signature is visible here; presumably this
// records n as the reference number (member `no`) -- TODO confirm.
254 void reference::set_number(int n)
// Separator bytes embedded in sort keys, from the outermost level to
// the innermost.  compute_sort_key() appends SORT_SEP, sortify_field()
// appends SORT_SUB_SEP and sortify_name() appends SORT_SUB_SUB_SEP.
259 const char SORT_SEP = '\001';
260 const char SORT_SUB_SEP = '\002';
261 const char SORT_SUB_SUB_SEP = '\003';
// Append the sort form of the text [s, end) to result, one token at a
// time.  Word separators are a single space or newline, a single
// character listed in sep (when sep is non-null), and the two-character
// sequence backslash-space.
263 // sep specifies additional word separators
265 void sortify_words(const char *s, const char *end, const char *sep,
269 int need_separator = 0;
271 const char *token_start = s;
272 if (!get_token(&s, end))
// Is this token a word separator?
274 if ((s - token_start == 1
275 && (*token_start == ' '
276 || *token_start == '\n'
277 || (sep && *token_start != '\0'
278 && strchr(sep, *token_start) != 0)))
279 || (s - token_start == 2
280 && token_start[0] == '\\' && token_start[1] == ' ')) {
// Any other token: sortify_non_empty() is consulted first, and a pending
// separator is handled before the token's sort form is appended.
285 const token_info *ti = lookup_token(token_start, s);
286 if (ti->sortify_non_empty(token_start, s)) {
287 if (need_separator) {
291 ti->sortify(token_start, s, result);
// Append the sort form of the tokens in [s, end) to result.  Unlike
// sortify_words(), the visible code makes no check for word separators.
298 void sortify_word(const char *s, const char *end, string &result)
301 const char *token_start = s;
302 if (!get_token(&s, end))
304 const token_info *ti = lookup_token(token_start, s);
305 ti->sortify(token_start, s, result);
309 void sortify_other(const char *s, int len, string &key)
311 sortify_words(s, s + len, 0, key);
// Append the sort form of a title (len bytes at s) to key.  Leading
// spaces and newlines are skipped.  The first word is compared, after
// lower-casing with cmlower(), against each entry of the articles list;
// on a match, s is advanced past the blanks that follow before the text
// is handed to sortify_words().
314 void sortify_title(const char *s, int len, string &key)
316 const char *end = s + len;
317 for (; s < end && (*s == ' ' || *s == '\n'); s++)
// Scan tokens up to the first single space or newline.
321 const char *token_start = ptr;
322 if (!get_token(&ptr, end))
324 if (ptr - token_start == 1
325 && (*token_start == ' ' || *token_start == '\n'))
// ptr is now one past the separator token, hence the "- 1".
329 unsigned int first_word_len = ptr - s - 1;
// articles holds NUL-terminated words back to back; strchr(a, '\0') + 1
// steps to the start of the next one.
330 const char *ae = articles.contents() + articles.length();
331 for (const char *a = articles.contents();
333 a = strchr(a, '\0') + 1)
334 if (first_word_len == strlen(a)) {
336 for (j = 0; j < first_word_len; j++)
337 if (a[j] != cmlower(s[j]))
// Every character matched: the first word is an article.
339 if (j >= first_word_len) {
341 for (; s < end && (*s == ' ' || *s == '\n'); s++)
347 sortify_words(s, end, 0, key);
// Append the sort form of a personal name (len bytes at s) to key in
// the order last name, SORT_SUB_SUB_SEP, the text before the last name,
// SORT_SUB_SUB_SEP, and finally any text after the last name.
350 void sortify_name(const char *s, int len, string &key)
352 const char *last_name_end;
353 const char *last_name = find_last_name(s, s + len, &last_name_end);
354 sortify_word(last_name, last_name_end, key);
355 key += SORT_SUB_SUB_SEP;
// Text before the last name; '.' also separates words here.
357 sortify_words(s, last_name, ".", key);
358 key += SORT_SUB_SUB_SEP;
// Text after the last name, if any; '.' and ',' also separate words.
359 if (last_name_end < s + len)
360 sortify_words(last_name_end, s + len, ".,", key);
// Append the sort form of a date (len bytes at s) to key.  The year is
// located with find_year() and its digits are copied into key; the month
// and day are then looked up with find_month() and find_day().
// NOTE(review): how month and day are encoded into key is not visible here.
363 void sortify_date(const char *s, int len, string &key)
365 const char *year_end;
366 const char *year_start = find_year(s, s + len, &year_end);
368 // Things without years are often `forthcoming', so it makes sense
369 // that they sort after things with explicit years.
371 sortify_words(s, s + len, 0, key);
// n is the number of digits in the year that was found.
374 int n = year_end - year_start;
379 while (year_start < year_end)
380 key += *year_start++;
381 int m = find_month(s, s + len);
386 const char *day_start = find_day(s, s + len, &day_end);
// A one-digit day is treated specially (action not visible here).
389 if (day_end - day_start == 1)
391 while (day_start < day_end)
// Append the sort form of a label (len bytes at s) to key.  The label is
// scanned for embedded SORT_SUB_SEP / SORT_SUB_SUB_SEP bytes, and the
// piece before such a byte is passed to sortify_words().
395 // SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification.
397 void sortify_label(const char *s, int len, string &key)
399 const char *end = s + len;
403 ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP;
407 sortify_words(s, ptr, 0, key);
// Build sort_key from the sort specification in sort_fields.  The
// specification is read character by character up to its terminating
// NUL; a field letter may be followed by a decimal count parsed with
// strtol().  The label, the author fields and all other fields are
// handled by sortify_label(), sortify_authors() and sortify_field()
// respectively.
415 void reference::compute_sort_key()
417 if (sort_fields.length() == 0)
420 const char *sf = sort_fields.contents();
422 while (*sf != '\0') {
424 sort_key += SORT_SEP;
432 else if (csdigit(*sf)) {
434 long l = strtol(sf, &ptr, 10);
// strtol() consumed nothing.
435 if (l == 0 && ptr == sf)
448 sortify_label(label.contents(), label.length(), sort_key);
449 else if (f == AUTHOR_FIELDS[0])
450 sortify_authors(n, sort_key);
452 sortify_field(f, n, sort_key);
// Shorten sort_fields by one byte again (presumably undoing a terminator
// appended in a line not visible here -- TODO confirm).
454 sort_fields.set_length(sort_fields.length() - 1);
// Append the sort form of up to n authors to result, using a field from
// AUTHOR_FIELDS that this reference contains.  The final call uses
// AUTHOR_FIELDS[0] (presumably reached only when none of the author
// fields is present -- the early return is not visible here).
457 void reference::sortify_authors(int n, string &result) const
459 for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++)
460 if (contains_field(*p)) {
461 sortify_field(*p, n, result);
464 sortify_field(AUTHOR_FIELDS[0], n, result);
467 void reference::canonicalize_authors(string &result) const
469 int len = result.length();
470 sortify_authors(INT_MAX, result);
471 if (result.length() > len)
472 result += SORT_SUB_SEP;
// Append the sort form of field f to result, processing at most n of
// its FIELD_SEPARATOR-delimited entries.  The conversion routine
// defaults to sortify_other; sortify_name, sortify_date or sortify_title
// is selected instead for particular fields (the selecting conditions
// are not visible here).
475 void reference::sortify_field(unsigned char f, int n, string &result) const
477 typedef void (*sortify_t)(const char *, int, string &);
478 sortify_t sortifier = sortify_other;
482 sortifier = sortify_name;
485 sortifier = sortify_date;
490 sortifier = sortify_title;
493 int fi = field_index[(unsigned char)f];
494 if (fi != NULL_FIELD_INDEX) {
495 string &str = field[fi];
496 const char *start = str.contents();
497 const char *end = start + str.length();
498 for (int i = 0; i < n && start < end; i++) {
// p marks the start of this entry; start advances to its end.
499 const char *p = start;
500 while (start < end && *start != FIELD_SEPARATOR)
503 result += SORT_SUB_SEP;
504 (*sortifier)(p, start - p, result);
// Three-way comparison of two references by sort key.  The keys are
// walked byte by byte and the bytes are compared as unsigned char.
// The reference numbers (member `no`) supply the result on the last
// visible return; the handling of keys of different length is not
// visible here.
511 int compare_reference(const reference &r1, const reference &r2)
515 const char *s1 = r1.sort_key.contents();
516 int n1 = r1.sort_key.length();
517 const char *s2 = r2.sort_key.contents();
518 int n2 = r2.sort_key.length();
519 for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2)
521 return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
526 return r1.no - r2.no;
529 int same_reference(const reference &r1, const reference &r2)
531 if (!r1.rid.is_null() && r1.rid == r2.rid)
535 if (r1.nfields != r2.nfields)
538 for (i = 0; i < 256; i++)
539 if (r1.field_index != r2.field_index)
541 for (i = 0; i < r1.nfields; i++)
542 if (r1.field[i] != r2.field[i])
// Locate the last name inside the name [start, end).  The text is
// tokenized with get_token(); single-character tokens that are a comma,
// a space or a newline are the ones inspected.
// NOTE(review): how the result is returned is not visible here; callers
// in this file use the return value as the start of the last name and
// the pointer stored through the third argument as its end.
547 const char *find_last_name(const char *start, const char *end,
550 const char *ptr = start;
551 const char *last_word = start;
553 const char *token_start = ptr;
554 if (!get_token(&ptr, end))
556 if (ptr - token_start == 1) {
557 if (*token_start == ',') {
// A blank followed by a non-blank character.
561 else if (*token_start == ' ' || *token_start == '\n') {
562 if (ptr < end && *ptr != ' ' && *ptr != '\n')
// Append an abbreviated form of the name [ptr, end) to result.  The
// last name is located with find_last_name() and copied unchanged from
// its start to end; only the tokens before it are examined.  The
// strings period_before_initial, period_before_other,
// period_before_hyphen and period_before_last_name are inserted at the
// corresponding points.
// NOTE(review): much of the control flow is not visible here; the
// comments below describe only the visible lines.
571 void abbreviate_name(const char *ptr, const char *end, string &result)
573 const char *last_name_end;
574 const char *last_name_start = find_last_name(ptr, end, &last_name_end);
577 const char *token_start = ptr;
578 if (!get_token(&ptr, last_name_start))
580 const token_info *ti = lookup_token(token_start, ptr);
// A space or backslash-space token.
582 if ((ptr - token_start == 1 && *token_start == ' ')
583 || (ptr - token_start == 2 && token_start[0] == '\\'
584 && token_start[1] == ' '))
587 result += period_before_initial;
589 result += period_before_other;
// Copy the current token as it stands.
592 result.append(token_start, ptr - token_start);
593 if (ti->is_upper()) {
594 const char *lower_ptr = ptr;
598 if (!get_token(&ptr, last_name_start))
600 if ((ptr - token_start == 1 && *token_start == ' ')
601 || (ptr - token_start == 2 && token_start[0] == '\\'
602 && token_start[1] == ' '))
604 ti = lookup_token(token_start, ptr);
// A hyphen followed by an upper-case token: copy the hyphen and that token.
605 if (ti->is_hyphen()) {
606 const char *ptr1 = ptr;
607 if (get_token(&ptr1, last_name_start)) {
608 ti = lookup_token(ptr, ptr1);
609 if (ti->is_upper()) {
610 result += period_before_hyphen;
611 result.append(token_start, ptr1 - token_start);
616 else if (ti->is_upper()) {
617 // MacDougal -> MacD.
618 result.append(lower_ptr, ptr - lower_ptr);
// An accent token is copied when first_token is set.
622 else if (first_token && ti->is_accent()) {
623 result.append(token_start, ptr - token_start);
632 result += period_before_last_name;
633 result.append(last_name_start, end - last_name_start);
// Abbreviate each FIELD_SEPARATOR-delimited name and rebuild result
// from the abbreviated names, separated by FIELD_SEPARATOR.
// NOTE(review): the declaration of str is not visible here; presumably
// it holds the original contents of result -- TODO confirm.
636 static void abbreviate_names(string &result)
640 const char *ptr = str.contents();
641 const char *end = ptr + str.length();
// Find the end of the current name.
643 const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
646 abbreviate_name(ptr, name_end, result);
650 result += FIELD_SEPARATOR;
// Append the name [ptr, name_end) to result with the last name first:
// the last name, then the text that preceded it (with the blanks just
// before the last name skipped), then any text that followed it.
654 void reverse_name(const char *ptr, const char *name_end, string &result)
656 const char *last_name_end;
657 const char *last_name_start = find_last_name(ptr, name_end, &last_name_end);
658 result.append(last_name_start, last_name_end - last_name_start);
// Back up over the spaces and newlines in front of the last name.
659 while (last_name_start > ptr
660 && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n'))
662 if (last_name_start > ptr) {
664 result.append(ptr, last_name_start - ptr);
666 if (last_name_end < name_end)
667 result.append(last_name_end, name_end - last_name_end);
// Reverse names in a FIELD_SEPARATOR-delimited list and rebuild result.
// Each name is reversed with reverse_name(), and the text from ptr to
// end is appended unchanged on the path that does not reverse.
// NOTE(review): the declaration of str and the use of n are not visible
// here; presumably n limits how many names are reversed -- TODO confirm.
670 void reverse_names(string &result, int n)
676 const char *ptr = str.contents();
677 const char *end = ptr + str.length();
680 result.append(ptr, end - ptr);
683 const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
686 reverse_name(ptr, name_end, result);
690 result += FIELD_SEPARATOR;
// Replace each FIELD_SEPARATOR in f by a joining string.
// join_authors_exactly_two is used when there is exactly one separator,
// join_authors_last_two for the final separator of several, and
// join_authors_default for all the others.
694 // Return number of field separators.
696 int join_fields(string &f)
698 const char *ptr = f.contents();
699 int len = f.length();
// First pass: look for separators.
702 for (j = 0; j < len; j++)
703 if (ptr[j] == FIELD_SEPARATOR)
705 if (nfield_seps == 0)
// Second pass: build the joined text in temp.
708 int field_seps_left = nfield_seps;
709 for (j = 0; j < len; j++) {
710 if (ptr[j] == FIELD_SEPARATOR) {
711 if (nfield_seps == 1)
712 temp += join_authors_exactly_two;
713 else if (--field_seps_left == 0)
714 temp += join_authors_last_two;
716 temp += join_authors_default;
// Append the upper-case form of [start, end) to result, converting one
// token at a time through token_info::upper_case().
725 void uppercase(const char *start, const char *end, string &result)
728 const char *token_start = start;
729 if (!get_token(&start, end))
731 const token_info *ti = lookup_token(token_start, start);
732 ti->upper_case(token_start, start, result);
// Append the lower-case form of [start, end) to result, converting one
// token at a time through token_info::lower_case().
736 void lowercase(const char *start, const char *end, string &result)
739 const char *token_start = start;
740 if (!get_token(&start, end))
742 const token_info *ti = lookup_token(token_start, start);
743 ti->lower_case(token_start, start, result);
// Append a capitalized rendering of [ptr, end) to result.  Some tokens
// are appended in upper case via token_info::upper_case() and others
// are copied unchanged.  in_small_point_size tracks whether the output
// is currently inside a small-point-size run.
// NOTE(review): the condition choosing between the two branches and the
// text emitted when the state changes are not visible here.
747 void capitalize(const char *ptr, const char *end, string &result)
749 int in_small_point_size = 0;
751 const char *start = ptr;
752 if (!get_token(&ptr, end))
754 const token_info *ti = lookup_token(start, ptr);
755 const char *char_end = ptr;
756 int is_lower = ti->is_lower();
// After a letter, look at the following token to see if it is an accent.
757 if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) {
758 const token_info *ti2 = lookup_token(char_end, ptr);
759 if (!ti2->is_accent())
763 if (!in_small_point_size) {
765 in_small_point_size = 1;
// Upper-case the letter, then copy whatever followed it in [char_end, ptr).
767 ti->upper_case(start, char_end, result);
768 result.append(char_end, ptr - char_end);
771 if (in_small_point_size) {
773 in_small_point_size = 0;
775 result.append(start, ptr - start);
778 if (in_small_point_size)
// Run capitalize() over the whole of str, collecting the output in temp.
// NOTE(review): the declaration of temp and what happens to it
// afterwards are not visible here; presumably it replaces str -- TODO
// confirm.
782 void capitalize_field(string &str)
785 capitalize(str.contents(), str.contents() + str.length(), temp);
// Return non-zero if last_token is exactly one character long, ends at
// end, and is '.', '!' or '?'.
// NOTE(review): the loop that updates last_token is not visible here;
// presumably it tracks the start of the most recent token -- TODO confirm.
789 int is_terminated(const char *ptr, const char *end)
791 const char *last_token = end;
794 if (!get_token(&ptr, end))
798 return end - last_token == 1
799 && (*last_token == '.' || *last_token == '!' || *last_token == '?');
// Write this reference to fp as troff input.  Every field except the
// annotation field is emitted under the name "[X", where X is the field
// letter: as a ".ds" line when the text has no newline, otherwise as a
// ".de" definition.  Number registers [P and [E, and [T, [A, [O for the
// fields present among "TAO", are emitted next.  A ".][" line then
// carries the reference type, followed by the annotation if one is
// configured and present.
802 void reference::output(FILE *fp)
805 for (int i = 0; i < 256; i++)
806 if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) {
807 string &f = field[field_index[i]];
// reverse_fields holds field letters, each optionally followed by a
// decimal count.
809 int j = reverse_fields.search(i);
812 int len = reverse_fields.length();
813 if (++j < len && csdigit(reverse_fields[j])) {
814 n = reverse_fields[j] - '0';
815 for (++j; j < len && csdigit(reverse_fields[j]); j++)
816 // should check for overflow
817 n = n*10 + reverse_fields[j] - '0';
// Joining replaces the separators, so remember whether there were any.
824 int is_multiple = join_fields(f) > 0;
825 if (capitalize_fields.search(i) >= 0)
// Single-line text is emitted with ".ds", multi-line text with ".de".
827 if (memchr(f.contents(), '\n', f.length()) == 0) {
828 fprintf(fp, ".ds [%c ", i);
// Text starting with a space, backslash or double quote gets extra
// handling (the action is not visible here).
829 if (f[0] == ' ' || f[0] == '\\' || f[0] == '"')
835 fprintf(fp, ".de [%c\n", i);
// Scan the field for a hyphen or range-separator token.
840 int multiple_pages = 0;
841 const char *s = f.contents();
842 const char *end = f.contents() + f.length();
844 const char *token_start = s;
845 if (!get_token(&s, end))
847 const token_info *ti = lookup_token(token_start, s);
848 if (ti->is_hyphen() || ti->is_range_sep()) {
853 fprintf(fp, ".nr [P %d\n", multiple_pages);
856 fprintf(fp, ".nr [E %d\n", is_multiple);
// For each of the T, A and O fields that is present, record whether its
// text ends with terminating punctuation.
858 for (const char *p = "TAO"; *p; p++) {
859 int fi = field_index[(unsigned char)*p];
860 if (fi != NULL_FIELD_INDEX) {
861 string &f = field[fi];
862 fprintf(fp, ".nr [%c %d\n", *p,
863 is_terminated(f.contents(), f.contents() + f.length()));
867 fprintf(fp, ".][ %d %s\n", t, reference_types[t]);
// Finally the annotation, introduced by the annotation macro.
868 if (annotation_macro.length() > 0 && annotation_field >= 0
869 && field_index[annotation_field] != NULL_FIELD_INDEX) {
871 put_string(annotation_macro, fp);
873 put_string(field[field_index[annotation_field]], fp);
// Write sort_key to fp.
// NOTE(review): the surrounding output is not visible here; judging by
// the name the key is wrapped in a comment -- TODO confirm.
877 void reference::print_sort_key_comment(FILE *fp)
880 put_string(sort_key, fp);
// Search [start, end) for a run of digits that looks like a year.  A run
// of four or three digits qualifies.  A further alternative, whose
// length test is not visible here, requires the first two digits to
// read as 32 or more.
// NOTE(review): the return statements are not visible here; callers in
// this file use the return value as the start of the year and the
// pointer stored through endp as its end.
884 const char *find_year(const char *start, const char *end, const char **endp)
// Skip ahead to the next digit.
887 while (start < end && !csdigit(*start))
889 const char *ptr = start;
// Advance ptr over the run of digits.
892 while (ptr < end && csdigit(*ptr))
894 if (ptr - start == 4 || ptr - start == 3
896 && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) {
// Search [start, end) for a run of digits that looks like a day of the
// month.  A single non-zero digit qualifies.  For two-digit runs the
// visible alternatives are '3' followed by a digit no greater than '1',
// and '0' followed by a non-zero digit; other alternatives are not
// visible here.
905 static const char *find_day(const char *start, const char *end,
// Skip ahead to the next digit.
909 while (start < end && !csdigit(*start))
911 const char *ptr = start;
// Advance ptr over the run of digits.
914 while (ptr < end && csdigit(*ptr))
916 if ((ptr - start == 1 && start[0] != '0')
917 || (ptr - start == 2 &&
920 || (start[0] == '3' && start[1] <= '1')
921 || (start[0] == '0' && start[1] != '0')))) {
// Search [start, end) for a month name.  Each alphabetic word of at
// least three letters is compared, lower-cased with cmlower(), against
// the entries of the months table character by character.
// NOTE(review): the table contents and the return values are not
// visible here.
930 static int find_month(const char *start, const char *end)
932 static const char *months[] = {
// Skip ahead to the next alphabetic character.
947 while (start < end && !csalpha(*start))
949 const char *ptr = start;
// Advance ptr over the word.
952 while (ptr < end && csalpha(*ptr))
954 if (ptr - start >= 3) {
955 for (unsigned int i = 0; i < sizeof(months)/sizeof(months[0]); i++) {
956 const char *q = months[i];
957 const char *p = start;
958 for (; p < ptr; p++, q++)
959 if (cmlower(*p) != *q)
970 int reference::contains_field(char c) const
972 return field_index[(unsigned char)c] != NULL_FIELD_INDEX;
// Classify the reference from the fields it contains.  The tests are
// made in the order J, B, G, R, I, M: a J field gives JOURNAL_ARTICLE
// and, failing that, a B field gives ARTICLE_IN_BOOK.
// NOTE(review): the results for the remaining tests are not visible here.
975 int reference::classify()
977 if (contains_field('J'))
978 return JOURNAL_ARTICLE;
979 if (contains_field('B'))
980 return ARTICLE_IN_BOOK;
981 if (contains_field('G'))
983 if (contains_field('R'))
985 if (contains_field('I'))
987 if (contains_field('M'))
// Return the result of find_year() applied to the D field, when that
// field is present.
// NOTE(review): the result when there is no D field is not visible here.
992 const char *reference::get_year(const char **endp) const
994 if (field_index['D'] != NULL_FIELD_INDEX) {
995 string &date = field[field_index['D']];
996 const char *start = date.contents();
997 const char *end = start + date.length();
998 return find_year(start, end, endp);
// Look up field c.  When it is present, *endp is set to one past the
// last byte of its contents.
// NOTE(review): the return statements are not visible here; callers in
// this file treat the result as the start of the contents.
1004 const char *reference::get_field(unsigned char c, const char **endp) const
1006 if (field_index[c] != NULL_FIELD_INDEX) {
1007 string &f = field[field_index[c]];
1008 const char *start = f.contents();
1009 *endp = start + f.length();
1016 const char *reference::get_date(const char **endp) const
1018 return get_field('D', endp);
// Select an entry of a FIELD_SEPARATOR-delimited list that starts at
// start and ends at *endp.  Separators are located with memchr().
// NOTE(review): the loop over i and the updates of *endp are not
// visible here; presumably entry i is returned and *endp is moved to
// its end -- TODO confirm.
1021 const char *nth_field(int i, const char *start, const char **endp)
1024 start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1029 const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
// Find author number i.  The fields in AUTHOR_FIELDS are tried in order;
// a field whose letter is in MULTI_FIELD_NAMES has entry i picked out
// with nth_field().
// NOTE(review): the handling of absent fields and of fields not in
// MULTI_FIELD_NAMES is not visible here.
1035 const char *reference::get_author(int i, const char **endp) const
1037 for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1038 const char *start = get_field(*f, endp);
1040 if (strchr(MULTI_FIELD_NAMES, *f) != 0)
1041 return nth_field(i, start, endp);
// Find the last name of author number i.  The author is located the same
// way as in get_author(), then find_last_name() narrows the range; *endp
// serves both as the end of the author's name on input and as the place
// the end of the last name is stored.
// NOTE(review): the handling of absent fields is not visible here.
1051 const char *reference::get_author_last_name(int i, const char **endp) const
1053 for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1054 const char *start = get_field(*f, endp);
1056 if (strchr(MULTI_FIELD_NAMES, *f) != 0) {
1057 start = nth_field(i, start, endp);
1062 return find_last_name(start, *endp, endp);
// Set the D field from d.  A non-empty d is stored with insert_field(),
// which moves its contents.
// NOTE(review): the action taken for an empty d is not visible here.
1070 void reference::set_date(string &d)
1072 if (d.length() == 0)
1075 insert_field('D', d);
// Compare the years of two references, as located by get_year().  One
// path defers to same_date().  Otherwise years of different length are
// handled separately, and years of the same length are equal when their
// bytes are equal.
// NOTE(review): the conditions selecting these paths are not fully
// visible here.
1078 int same_year(const reference &r1, const reference &r2)
1081 const char *ys1 = r1.get_year(&ye1);
1083 const char *ys2 = r2.get_year(&ye2);
1086 return same_date(r1, r2);
1092 else if (ye1 - ys1 != ye2 - ys2)
1095 return memcmp(ys1, ys2, ye1 - ys1) == 0;
// Compare the dates of two references, as returned by get_date().  Dates
// of different length are handled separately; dates of the same length
// are equal when their bytes are equal.
// NOTE(review): the handling of missing dates is not visible here.
1098 int same_date(const reference &r1, const reference &r2)
1101 const char *s1 = r1.get_date(&e1);
1103 const char *s2 = r2.get_date(&e2);
1108 else if (e1 - s1 != e2 - s2)
1111 return memcmp(s1, s2, e1 - s1) == 0;
1114 const char *reference::get_sort_field(int i, int si, int ssi,
1115 const char **endp) const
1117 const char *start = sort_key.contents();
1118 const char *end = start + sort_key.length();
1124 start = (char *)memchr(start, SORT_SEP, end - start);
1129 const char *e = (char *)memchr(start, SORT_SEP, end - start);
1137 start = (char *)memchr(start, SORT_SUB_SEP, end - start);
1142 e = (char *)memchr(start, SORT_SUB_SEP, end - start);
1149 while (--ssi >= 0) {
1150 start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1155 e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);