2 * array.c - routines for associative arrays.
6 * Copyright (C) 1986, 1988, 1989, 1991-2011 the Free Software Foundation, Inc.
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 3 of the License, or
14 * (at your option) any later version.
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
29 * Tree walks (``for (iggy in foo)'') and array deletions use expensive
30 * linear searching. So what we do is start out with small arrays and
31 * grow them as needed, so that our arrays are hopefully small enough,
32 * most of the time, that they're pretty full and we're not looking at
35 * The decision is made to grow the array if the average chain length is
36 * ``too big''. This is defined as the total number of entries in the table
37 * divided by the size of the array being greater than some constant.
39 * We make the constant a variable, so that it can be tweaked
40 * via environment variable.
43 static size_t AVG_CHAIN_MAX = 2; /* Modern machines are bigger, reduce this from 10. */
45 static size_t SUBSEPlen;
48 static NODE *assoc_find(NODE *symbol, NODE *subs, unsigned long hash1, NODE **last);
49 static void grow_table(NODE *symbol);
51 static unsigned long gst_hash_string(const char *str, size_t len, unsigned long hsize, size_t *code);
52 static unsigned long scramble(unsigned long x);
53 static unsigned long awk_hash(const char *s, size_t len, unsigned long hsize, size_t *code);
55 unsigned long (*hash)(const char *s, size_t len, unsigned long hsize, size_t *code) = awk_hash;
57 /* qsort comparison function */
58 static int sort_up_index_string(const void *, const void *);
59 static int sort_down_index_string(const void *, const void *);
60 static int sort_up_index_number(const void *, const void *);
61 static int sort_down_index_number(const void *, const void *);
62 static int sort_up_value_string(const void *, const void *);
63 static int sort_down_value_string(const void *, const void *);
64 static int sort_up_value_number(const void *, const void *);
65 static int sort_down_value_number(const void *, const void *);
66 static int sort_up_value_type(const void *, const void *);
67 static int sort_down_value_type(const void *, const void *);
69 /* array_init --- check relevant environment variables */
78 if ((val = getenv("AVG_CHAIN_MAX")) != NULL && isdigit((unsigned char) *val)) {
79 newval = strtoul(val, & endptr, 10);
80 if (endptr != val && newval > 0)
81 AVG_CHAIN_MAX = newval;
84 if ((val = getenv("AWK_HASH")) != NULL && strcmp(val, "gst") == 0)
85 hash = gst_hash_string;
88 /* make_aname --- construct a 'vname' for a (sub)array */
91 make_aname(const NODE *symbol)
93 static char *aname = NULL;
95 static size_t max_alen;
98 if (symbol->parent_array != NULL) {
101 (void) make_aname(symbol->parent_array);
102 slen = strlen(symbol->vname); /* subscript in parent array */
103 if (alen + slen + 4 > max_alen) { /* sizeof("[\"\"]") = 4 */
104 max_alen = alen + slen + 4 + SLEN;
105 erealloc(aname, char *, (max_alen + 1) * sizeof(char *), "make_aname");
107 alen += sprintf(aname + alen, "[\"%s\"]", symbol->vname);
109 alen = strlen(symbol->vname);
111 max_alen = alen + SLEN;
112 emalloc(aname, char *, (max_alen + 1) * sizeof(char *), "make_aname");
113 } else if (alen > max_alen) {
114 max_alen = alen + SLEN;
115 erealloc(aname, char *, (max_alen + 1) * sizeof(char *), "make_aname");
117 memcpy(aname, symbol->vname, alen + 1);
124 * array_vname --- print the name of the array
126 * Returns a pointer to a statically maintained dynamically allocated string.
127 * It's appropriate for printing the name once; if the caller wants
128 * to save it, they have to make a copy.
132 array_vname(const NODE *symbol)
134 static char *message = NULL;
135 static size_t msglen = 0;
139 const NODE *save_symbol = symbol;
140 const char *from = _("from %s");
143 if (symbol->type != Node_array_ref
144 || symbol->orig_array->type != Node_var_array
146 if (symbol->type != Node_var_array || symbol->parent_array == NULL)
147 return symbol->vname;
148 return make_aname(symbol);
151 /* First, we have to compute the length of the string: */
155 while (symbol->type == Node_array_ref) {
156 len += strlen(symbol->vname);
158 symbol = symbol->prev_array;
161 /* Get the (sub)array name */
162 if (symbol->parent_array == NULL)
163 aname = symbol->vname;
165 aname = make_aname(symbol);
166 len += strlen(aname);
169 * Each node contributes by strlen(from) minus the length
170 * of "%s" in the translation (which is at least 2)
171 * plus 2 for ", " or ")\0"; this adds up to strlen(from).
173 len += n * strlen(from);
175 /* (Re)allocate memory: */
176 if (message == NULL) {
177 emalloc(message, char *, len, "array_vname");
179 } else if (len > msglen) {
180 erealloc(message, char *, len, "array_vname");
183 current buffer can hold new name */
185 /* We're ready to print: */
186 symbol = save_symbol;
189 * Ancient systems have sprintf() returning char *, not int.
190 * If you have one of those, use sprintf(..); s += strlen(s) instead.
193 s += sprintf(s, "%s (", symbol->vname);
195 symbol = symbol->prev_array;
196 if (symbol->type != Node_array_ref)
198 s += sprintf(s, from, symbol->vname);
199 s += sprintf(s, ", ");
201 s += sprintf(s, from, aname);
209 * get_array --- proceed to the actual Node_var_array,
210 * change Node_var_new to an array.
211 * If canfatal and type isn't good, die fatally,
212 * otherwise return the final actual value.
216 get_array(NODE *symbol, int canfatal)
218 NODE *save_symbol = symbol;
221 if (symbol->type == Node_param_list && (symbol->flags & FUNC) == 0) {
222 save_symbol = symbol = GET_PARAM(symbol->param_cnt);
224 if (symbol->type == Node_array_ref)
225 symbol = symbol->orig_array;
228 switch (symbol->type) {
230 symbol->type = Node_var_array;
231 symbol->var_array = NULL;
232 symbol->parent_array = NULL; /* main array has no parent */
238 case Node_param_list:
239 if ((symbol->flags & FUNC) == 0)
245 /* notably Node_var but catches also e.g. FS[1] = "x" */
247 if (symbol->type == Node_val)
248 fatal(_("attempt to use a scalar value as array"));
250 if ((symbol->flags & FUNC) != 0)
251 fatal(_("attempt to use function `%s' as an array"),
254 fatal(_("attempt to use scalar parameter `%s' as an array"),
257 fatal(_("attempt to use scalar `%s' as an array"),
267 /* set_SUBSEP --- update SUBSEP related variables when SUBSEP assigned to */
272 SUBSEP = force_string(SUBSEP_node->var_value)->stptr;
273 SUBSEPlen = SUBSEP_node->var_value->stlen;
276 /* concat_exp --- concatenate expression list into a single string */
279 concat_exp(int nargs, int do_subsep)
281 /* do_subsep is false for Node-concat */
286 size_t subseplen = 0;
288 extern NODE **args_array;
294 subseplen = SUBSEPlen;
297 for (i = 1; i <= nargs; i++) {
299 if (r->type == Node_var_array) {
301 DEREF(args_array[i]); /* avoid memory leak */
302 fatal(_("attempt to use array `%s' in a scalar context"), array_vname(r));
304 args_array[i] = force_string(r);
307 len += (nargs - 1) * subseplen;
309 emalloc(str, char *, len + 2, "concat_exp");
311 r = args_array[nargs];
312 memcpy(str, r->stptr, r->stlen);
315 for (i = nargs - 1; i > 0; i--) {
318 else if (subseplen > 0) {
319 memcpy(s, SUBSEP, subseplen);
323 memcpy(s, r->stptr, r->stlen);
328 return make_str_node(str, len, ALREADY_MALLOCED);
332 /* assoc_clear --- flush all the values in symbol[] */
335 assoc_clear(NODE *symbol)
340 if (symbol->var_array == NULL)
343 for (i = 0; i < symbol->array_size; i++) {
344 for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) {
345 next = bucket->ahnext;
346 if (bucket->ahvalue->type == Node_var_array) {
347 NODE *r = bucket->ahvalue;
348 assoc_clear(r); /* recursively clear all sub-arrays */
352 unref(bucket->ahvalue);
354 unref(bucket); /* unref() will free the ahname_str */
356 symbol->var_array[i] = NULL;
358 efree(symbol->var_array);
359 symbol->var_array = NULL;
360 symbol->array_size = symbol->table_size = 0;
361 symbol->flags &= ~ARRAYMAXED;
364 /* awk_hash --- calculate the hash function of the string in subs */
367 awk_hash(const char *s, size_t len, unsigned long hsize, size_t *code)
373 * Ozan Yigit's original sdbm hash, copied from Margo Seltzers
376 * This is INCREDIBLY ugly, but fast. We break the string up into
377 * 8 byte units. On the first time through the loop we get the
378 * "leftover bytes" (strlen % 8). On every other iteration, we
379 * perform 8 HASHC's so we handle all 8 bytes. Essentially, this
380 * saves us 7 cmp & branch instructions. If this routine is
381 * heavily used enough, it's worth the ugly coding.
386 * #define HASHC h = *s++ + 65599 * h
387 * Because 65599 = pow(2, 6) + pow(2, 16) - 1 we multiply by shifts
389 * 4/2011: Force the results to 32 bits, to get the same
390 * result on both 32- and 64-bit systems. This may be a
393 #define HASHC htmp = (h << 6); \
394 h = *s++ + htmp + (htmp << 10) - h ; \
395 htmp &= 0xFFFFFFFF; \
400 /* "Duff's Device" */
402 size_t loop = (len + 8 - 1) >> 3;
404 switch (len & (8 - 1)) {
406 do { /* All fall throughs */
427 /* assoc_find --- locate symbol[subs] */
429 static NODE * /* NULL if not found */
430 assoc_find(NODE *symbol, NODE *subs, unsigned long hash1, NODE **last)
437 for (prev = NULL, bucket = symbol->var_array[hash1]; bucket != NULL;
438 prev = bucket, bucket = bucket->ahnext) {
440 * This used to use cmp_nodes() here. That's wrong.
441 * Array indices are strings; compare as such, always!
443 s1_str = bucket->ahname_str;
444 s1_len = bucket->ahname_len;
447 if (s1_len == s2->stlen) {
448 if (s1_len == 0 /* "" is a valid index */
449 || memcmp(s1_str, s2->stptr, s1_len) == 0)
458 /* in_array --- test whether the array element symbol[subs] exists or not,
459 * return pointer to value if it does.
463 in_array(NODE *symbol, NODE *subs)
468 assert(symbol->type == Node_var_array);
470 if (symbol->var_array == NULL)
473 hash1 = hash(subs->stptr, subs->stlen, (unsigned long) symbol->array_size, NULL);
474 ret = assoc_find(symbol, subs, hash1, NULL);
475 return (ret ? ret->ahvalue : NULL);
480 * Find SYMBOL[SUBS] in the assoc array. Install it with value "" if it
481 * isn't there. Returns a pointer ala get_lhs to where its value is stored.
483 * SYMBOL is the address of the node (or other pointer) being dereferenced.
484 * SUBS is a number or string used as the subscript.
488 assoc_lookup(NODE *symbol, NODE *subs, int reference)
494 assert(symbol->type == Node_var_array);
496 (void) force_string(subs);
498 if (symbol->var_array == NULL) {
499 symbol->array_size = symbol->table_size = 0; /* sanity */
500 symbol->flags &= ~ARRAYMAXED;
502 hash1 = hash(subs->stptr, subs->stlen,
503 (unsigned long) symbol->array_size, & code);
505 hash1 = hash(subs->stptr, subs->stlen,
506 (unsigned long) symbol->array_size, & code);
507 bucket = assoc_find(symbol, subs, hash1, NULL);
509 return &(bucket->ahvalue);
512 if (do_lint && reference) {
513 lintwarn(_("reference to uninitialized element `%s[\"%.*s\"]'"),
514 array_vname(symbol), (int)subs->stlen, subs->stptr);
517 /* It's not there, install it. */
518 if (do_lint && subs->stlen == 0)
519 lintwarn(_("subscript of array `%s' is null string"),
520 array_vname(symbol));
522 /* first see if we would need to grow the array, before installing */
523 symbol->table_size++;
524 if ((symbol->flags & ARRAYMAXED) == 0
525 && (symbol->table_size / symbol->array_size) > AVG_CHAIN_MAX) {
527 /* have to recompute hash value for new size */
528 hash1 = code % (unsigned long) symbol->array_size;
532 bucket->type = Node_ahash;
535 * Freeze this string value --- it must never
536 * change, no matter what happens to the value
537 * that created it or to CONVFMT, etc.
539 * One day: Use an atom table to track array indices,
540 * and avoid the extra memory overhead.
542 bucket->flags |= MALLOC;
543 bucket->ahname_ref = 1;
545 emalloc(bucket->ahname_str, char *, subs->stlen + 2, "assoc_lookup");
546 bucket->ahname_len = subs->stlen;
547 memcpy(bucket->ahname_str, subs->stptr, subs->stlen);
548 bucket->ahname_str[bucket->ahname_len] = '\0';
549 bucket->ahvalue = Nnull_string;
551 bucket->ahnext = symbol->var_array[hash1];
552 bucket->ahcode = code;
555 * Set the numeric value for the index if it's available. Useful
556 * for numeric sorting by index. Do this only if the numeric
557 * value is available, instead of all the time, since doing it
558 * all the time is a big performance hit for something that may
561 if ((subs->flags & NUMCUR) != 0) {
562 bucket->ahname_num = subs->numbr;
563 bucket->flags |= NUMIND;
566 /* hook it into the symbol table */
567 symbol->var_array[hash1] = bucket;
568 return &(bucket->ahvalue);
572 /* adjust_fcall_stack: remove subarray(s) of symbol[] from
573 * function call stack.
577 adjust_fcall_stack(NODE *symbol, int nsubs)
584 * Solve the nasty problem of disappearing subarray arguments:
586 * function f(c, d) { delete c; .. use non-existent array d .. }
587 * BEGIN { a[0][0] = 1; f(a, a[0]); .. }
589 * The fix is to convert 'd' to a local empty array; This has
590 * to be done before clearing the parent array to avoid referring to
591 * already free-ed memory.
593 * Similar situations exist for builtins accepting more than
594 * one array argument: split, patsplit, asort and asorti. For example:
596 * BEGIN { a[0][0] = 1; split("abc", a, "", a[0]) }
598 * These cases do not involve the function call stack, and are
599 * handled individually in their respective routines.
602 func = frame_ptr->func_node;
603 if (func == NULL) /* in main */
605 pcount = func->lnode->param_cnt;
606 sp = frame_ptr->stack;
608 for (; pcount > 0; pcount--) {
610 if (r->type != Node_array_ref
611 || r->orig_array->type != Node_var_array)
617 && symbol->parent_array != NULL
620 /* 'symbol' is a subarray, and 'r' is the same subarray:
622 * function f(c, d) { delete c[0]; .. }
623 * BEGIN { a[0][0] = 1; f(a, a[0]); .. }
625 * But excludes cases like (nsubs = 0):
627 * function f(c, d) { delete c; ..}
628 * BEGIN { a[0][0] = 1; f(a[0], a[0]); ...}
633 memset(r, '\0', sizeof(NODE));
635 r->type = Node_var_array;
640 for (n = n->parent_array; n != NULL; n = n->parent_array) {
641 assert(n->type == Node_var_array);
643 /* 'r' is a subarray of 'symbol':
645 * function f(c, d) { delete c; .. use d as array .. }
646 * BEGIN { a[0][0] = 1; f(a, a[0]); .. }
648 * BEGIN { a[0][0][0][0] = 1; f(a[0], a[0][0][0]); .. }
659 /* do_delete --- perform `delete array[s]' */
663 * `nsubs' is number of subscripts
667 do_delete(NODE *symbol, int nsubs)
669 unsigned long hash1 = 0;
670 NODE *subs, *bucket, *last, *r;
673 assert(symbol->type == Node_var_array);
674 subs = bucket = last = r = NULL; /* silence the compiler */
677 * The force_string() call is needed to make sure that
678 * the string subscript is reasonable. For example, with it:
680 * $ ./gawk --posix 'BEGIN { CONVFMT="%ld"; delete a[1.233]}'
681 * gawk: cmd. line:1: fatal: `%l' is not permitted in POSIX awk formats
683 * Without it, the code does not fail.
686 #define free_subs(n) \
688 NODE *s = PEEK(n - 1); \
689 if (s->type == Node_val) { \
690 (void) force_string(s); /* may have side effects ? */ \
695 if (nsubs == 0) { /* delete array */
696 adjust_fcall_stack(symbol, 0); /* fix function call stack; See above. */
701 /* NB: subscripts are in reverse order on stack */
703 for (i = nsubs; i > 0; i--) {
705 if (subs->type != Node_val) {
707 fatal(_("attempt to use array `%s' in a scalar context"), array_vname(subs));
709 (void) force_string(subs);
711 last = NULL; /* shut up gcc -Wall */
712 hash1 = 0; /* ditto */
713 bucket = NULL; /* array may be empty */
715 if (symbol->var_array != NULL) {
716 hash1 = hash(subs->stptr, subs->stlen,
717 (unsigned long) symbol->array_size, NULL);
718 bucket = assoc_find(symbol, subs, hash1, &last);
721 if (bucket == NULL) {
723 lintwarn(_("delete: index `%s' not in array `%s'"),
724 subs->stptr, array_vname(symbol));
725 /* avoid memory leak, free all subs */
731 if (bucket->ahvalue->type != Node_var_array) {
732 /* e.g.: a[1] = 1; delete a[1][1] */
734 fatal(_("attempt to use scalar `%s[\"%.*s\"]' as an array"),
736 (int) bucket->ahname_len,
739 symbol = bucket->ahvalue;
745 if (r->type == Node_var_array) {
746 adjust_fcall_stack(r, nsubs); /* fix function call stack; See above. */
748 /* cleared a sub-array, free Node_var_array */
755 last->ahnext = bucket->ahnext;
757 symbol->var_array[hash1] = bucket->ahnext;
759 unref(bucket); /* unref() will free the ahname_str */
760 symbol->table_size--;
761 if (symbol->table_size <= 0) {
762 symbol->table_size = symbol->array_size = 0;
763 symbol->flags &= ~ARRAYMAXED;
764 if (symbol->var_array != NULL) {
765 efree(symbol->var_array);
766 symbol->var_array = NULL;
774 /* do_delete_loop --- simulate ``for (iggy in foo) delete foo[iggy]'' */
777 * The primary hassle here is that `iggy' needs to have some arbitrary
778 * array index put in it before we can clear the array, we can't
779 * just replace the loop with `delete foo'.
783 do_delete_loop(NODE *symbol, NODE **lhs)
787 assert(symbol->type == Node_var_array);
789 if (symbol->var_array == NULL)
792 /* get first index value */
793 for (i = 0; i < symbol->array_size; i++) {
794 if (symbol->var_array[i] != NULL) {
796 *lhs = make_string(symbol->var_array[i]->ahname_str,
797 symbol->var_array[i]->ahname_len);
802 /* blast the array in one shot */
803 adjust_fcall_stack(symbol, 0);
807 /* grow_table --- grow a hash table */
810 grow_table(NODE *symbol)
812 NODE **old, **new, *chain, *next;
815 unsigned long oldsize, newsize, k;
817 * This is an array of primes. We grow the table by an order of
818 * magnitude each time (not just doubling) so that growing is a
819 * rare operation. We expect, on average, that it won't happen
820 * more than twice. When things are very large (> 8K), we just
821 * double more or less, instead of just jumping from 8K to 64K.
823 static const long sizes[] = {
824 13, 127, 1021, 8191, 16381, 32749, 65497, 131101, 262147,
825 524309, 1048583, 2097169, 4194319, 8388617, 16777259, 33554467,
826 67108879, 134217757, 268435459, 536870923, 1073741827
829 /* find next biggest hash size */
830 newsize = oldsize = symbol->array_size;
831 for (i = 0, j = sizeof(sizes)/sizeof(sizes[0]); i < j; i++) {
832 if (oldsize < sizes[i]) {
838 if (newsize == oldsize) { /* table already at max (!) */
839 symbol->flags |= ARRAYMAXED;
843 /* allocate new table */
844 emalloc(new, NODE **, newsize * sizeof(NODE *), "grow_table");
845 memset(new, '\0', newsize * sizeof(NODE *));
847 /* brand new hash table, set things up and return */
848 if (symbol->var_array == NULL) {
849 symbol->table_size = 0;
853 /* old hash table there, move stuff to new, free old */
854 old = symbol->var_array;
855 for (k = 0; k < oldsize; k++) {
859 for (chain = old[k]; chain != NULL; chain = next) {
860 next = chain->ahnext;
861 hash1 = chain->ahcode % newsize;
863 /* remove from old list, add to new */
864 chain->ahnext = new[hash1];
872 * note that symbol->table_size does not change if an old array,
873 * and is explicitly set to 0 if a new one.
875 symbol->var_array = new;
876 symbol->array_size = newsize;
879 /* pr_node --- print simple node info */
884 if ((n->flags & NUMBER) != 0)
885 printf("%s %g p: %p", flags2str(n->flags), n->numbr, n);
887 printf("%s %.*s p: %p", flags2str(n->flags),
888 (int) n->stlen, n->stptr, n);
893 indent(int indent_level)
896 for (k = 0; k < indent_level; k++)
900 /* assoc_dump --- dump the contents of an array */
903 assoc_dump(NODE *symbol, int indent_level)
908 indent(indent_level);
909 if (symbol->var_array == NULL) {
910 printf(_("%s: empty (null)\n"), symbol->vname);
911 return make_number((AWKNUM) 0);
914 if (symbol->table_size == 0) {
915 printf(_("%s: empty (zero)\n"), symbol->vname);
916 return make_number((AWKNUM) 0);
919 printf(_("%s: table_size = %d, array_size = %d\n"), symbol->vname,
920 (int) symbol->table_size, (int) symbol->array_size);
922 for (i = 0; i < symbol->array_size; i++) {
923 for (bucket = symbol->var_array[i]; bucket != NULL;
924 bucket = bucket->ahnext) {
925 indent(indent_level);
926 printf("%s: I: [len %d <%.*s> p: %p] V: [",
928 (int) bucket->ahname_len,
929 (int) bucket->ahname_len,
932 if (bucket->ahvalue->type == Node_var_array) {
934 assoc_dump(bucket->ahvalue, indent_level + 1);
935 indent(indent_level);
937 pr_node(bucket->ahvalue);
942 return make_number((AWKNUM) 0);
945 /* do_adump --- dump an array: interface to assoc_dump */
953 if (a->type == Node_param_list) {
954 printf(_("%s: is parameter\n"), a->vname);
955 a = GET_PARAM(a->param_cnt);
957 if (a->type == Node_array_ref) {
958 printf(_("%s: array_ref to %s\n"), a->vname,
959 a->orig_array->vname);
962 if (a->type != Node_var_array)
963 fatal(_("adump: argument not an array"));
964 r = assoc_dump(a, 0);
969 * The following functions implement the builtin
970 * asort function. Initial work by Alan J. Broder,
974 /* dup_table --- recursively duplicate input array "symbol" */
977 dup_table(NODE *symbol, NODE *newsymb)
979 NODE **old, **new, *chain, *bucket;
981 unsigned long cursize;
983 /* find the current hash size */
984 cursize = symbol->array_size;
988 /* input is a brand new hash table, so there's nothing to copy */
989 if (symbol->var_array == NULL)
990 newsymb->table_size = 0;
992 /* old hash table there, dupnode stuff into a new table */
994 /* allocate new table */
995 emalloc(new, NODE **, cursize * sizeof(NODE *), "dup_table");
996 memset(new, '\0', cursize * sizeof(NODE *));
998 /* do the copying/dupnode'ing */
999 old = symbol->var_array;
1000 for (i = 0; i < cursize; i++) {
1001 if (old[i] != NULL) {
1002 for (chain = old[i]; chain != NULL;
1003 chain = chain->ahnext) {
1004 /* get a node for the linked list */
1006 bucket->type = Node_ahash;
1007 bucket->flags |= MALLOC;
1008 bucket->ahname_ref = 1;
1009 bucket->ahcode = chain->ahcode;
1010 if ((chain->flags & NUMIND) != 0) {
1011 bucket->ahname_num = chain->ahname_num;
1012 bucket->flags |= NUMIND;
1016 * copy the corresponding name and
1017 * value from the original input list
1019 emalloc(bucket->ahname_str, char *, chain->ahname_len + 2, "dup_table");
1020 bucket->ahname_len = chain->ahname_len;
1022 memcpy(bucket->ahname_str, chain->ahname_str, chain->ahname_len);
1023 bucket->ahname_str[bucket->ahname_len] = '\0';
1025 if (chain->ahvalue->type == Node_var_array) {
1028 r->type = Node_var_array;
1029 r->vname = estrdup(chain->ahname_str, chain->ahname_len);
1030 r->parent_array = newsymb;
1031 bucket->ahvalue = dup_table(chain->ahvalue, r);
1033 bucket->ahvalue = dupnode(chain->ahvalue);
1036 * put the node on the corresponding
1037 * linked list in the new table
1039 bucket->ahnext = new[i];
1044 newsymb->table_size = symbol->table_size;
1047 newsymb->var_array = new;
1048 newsymb->array_size = cursize;
1049 newsymb->flags = symbol->flags; /* ARRAYMAXED */
1054 /* asort_actual --- do the actual work to sort the input array */
1057 asort_actual(int nargs, SORT_CTXT ctxt)
1059 NODE *array, *dest = NULL, *result;
1062 #define TSIZE 100 /* an arbitrary amount */
1063 static char buf[TSIZE+2];
1064 unsigned long num_elems, i;
1065 const char *sort_str;
1067 if (nargs == 3) /* 3rd optional arg */
1070 s = Nnull_string; /* "" => default sorting */
1072 s = force_string(s);
1073 sort_str = s->stptr;
1074 if (s->stlen == 0) { /* default sorting */
1076 sort_str = "@val_type_asc";
1078 sort_str = "@ind_str_asc";
1082 if (nargs >= 2) { /* 2nd optional arg */
1084 if (dest->type != Node_var_array) {
1085 fatal(ctxt == ASORT ?
1086 _("asort: second argument not an array") :
1087 _("asorti: second argument not an array"));
1091 array = POP_PARAM();
1092 if (array->type != Node_var_array) {
1093 fatal(ctxt == ASORT ?
1094 _("asort: first argument not an array") :
1095 _("asorti: first argument not an array"));
1099 for (r = dest->parent_array; r != NULL; r = r->parent_array) {
1101 fatal(ctxt == ASORT ?
1102 _("asort: cannot use a subarray of first arg for second arg") :
1103 _("asorti: cannot use a subarray of first arg for second arg"));
1105 for (r = array->parent_array; r != NULL; r = r->parent_array) {
1107 fatal(ctxt == ASORT ?
1108 _("asort: cannot use a subarray of second arg for first arg") :
1109 _("asorti: cannot use a subarray of second arg for first arg"));
1113 num_elems = array->table_size;
1114 if (num_elems == 0 || array->var_array == NULL) { /* source array is empty */
1115 if (dest != NULL && dest != array)
1117 return make_number((AWKNUM) 0);
1120 /* sorting happens inside assoc_list */
1121 list = assoc_list(array, sort_str, ctxt);
1125 * Must not assoc_clear() the source array before constructing
1126 * the output array. assoc_list() does not duplicate array values
1127 * which are needed for asort().
1130 if (dest != NULL && dest != array) {
1134 /* use 'result' as a temporary destination array */
1136 memset(result, '\0', sizeof(NODE));
1137 result->type = Node_var_array;
1138 result->vname = array->vname;
1139 result->parent_array = array->parent_array;
1142 subs = make_str_node(buf, TSIZE, ALREADY_MALLOCED); /* fake it */
1143 subs->flags &= ~MALLOC; /* safety */
1144 for (i = 1, ptr = list; i <= num_elems; i++) {
1145 sprintf(buf, "%lu", i);
1146 subs->stlen = strlen(buf);
1147 /* make number valid in case this array gets sorted later */
1149 subs->flags |= NUMCUR;
1151 if (ctxt == ASORTI) {
1153 * We want the indices of the source array as values
1154 * of the 'result' array.
1156 *assoc_lookup(result, subs, FALSE) =
1157 make_string(r->ahname_str, r->ahname_len);
1161 /* We want the values of the source array. */
1164 if (result != dest) {
1165 /* optimization for dest = NULL or dest = array */
1167 if (val->type == Node_var_array) {
1168 /* update subarray index in parent array */
1170 val->vname = estrdup(subs->stptr, subs->stlen);
1172 *assoc_lookup(result, subs, FALSE) = val;
1173 r->ahvalue = Nnull_string;
1175 if (val->type == Node_val)
1176 *assoc_lookup(result, subs, FALSE) = dupnode(val);
1181 * There isn't any reference counting for
1182 * subarrays, so recursively copy subarrays
1183 * using dup_table().
1186 arr->type = Node_var_array;
1187 arr->var_array = NULL;
1188 arr->vname = estrdup(subs->stptr, subs->stlen);
1189 arr->parent_array = array; /* actual parent, not the temporary one. */
1190 *assoc_lookup(result, subs, FALSE) = dup_table(val, arr);
1198 freenode(subs); /* stptr(buf) not malloc-ed */
1201 if (result != dest) {
1202 /* dest == NULL or dest == array */
1204 *array = *result; /* copy result into array */
1208 dest != NULL and dest != array */
1210 return make_number((AWKNUM) num_elems);
1214 /* do_asort --- sort array by value */
1219 return asort_actual(nargs, ASORT);
1222 /* do_asorti --- sort array by index */
1225 do_asorti(int nargs)
1227 return asort_actual(nargs, ASORTI);
1231 * cmp_string --- compare two strings; logic similar to cmp_nodes() in eval.c
1232 * except the extra case-sensitive comparison when the case-insensitive
1233 * result is a match.
1237 cmp_string(const NODE *n1, const NODE *n2)
1244 assert(n1->type == n2->type);
1245 if (n1->type == Node_ahash) {
1246 s1 = n1->ahname_str;
1247 len1 = n1->ahname_len;
1248 s2 = n2->ahname_str;
1249 len2 = n2->ahname_len;
1258 return len2 == 0 ? 0 : -1;
1262 /* len1 > 0 && len2 > 0 */
1263 lmin = len1 < len2 ? len1 : len2;
1266 const unsigned char *cp1 = (const unsigned char *) s1;
1267 const unsigned char *cp2 = (const unsigned char *) s2;
1270 if (gawk_mb_cur_max > 1) {
1271 ret = strncasecmpmbs((const unsigned char *) cp1,
1272 (const unsigned char *) cp2, lmin);
1275 for (ret = 0; lmin-- > 0 && ret == 0; cp1++, cp2++)
1276 ret = casetable[*cp1] - casetable[*cp2];
1280 * If case insensitive result is "they're the same",
1281 * use case sensitive comparison to force distinct order.
1285 ret = memcmp(s1, s2, lmin);
1286 if (ret != 0 || len1 == len2)
1288 return (len1 < len2) ? -1 : 1;
1292 /* sort_up_index_string --- qsort comparison function; ascending index strings. */
1295 sort_up_index_string(const void *p1, const void *p2)
1297 const NODE *t1, *t2;
1299 /* Array indices are strings */
1300 t1 = *((const NODE *const *) p1);
1301 t2 = *((const NODE *const *) p2);
1302 return cmp_string(t1, t2);
1306 /* sort_down_index_string --- descending index strings */
1309 sort_down_index_string(const void *p1, const void *p2)
1312 * Negation versus transposed arguments: when all keys are
1313 * distinct, as with array indices here, either method will
1314 * transform an ascending sort into a descending one. But if
1315 * there are equal keys--such as when IGNORECASE is honored--
1316 * that get disambiguated into a determisitc order, negation
1317 * will reverse those but transposed arguments would retain
1318 * their relative order within the rest of the reversed sort.
1320 return -sort_up_index_string(p1, p2);
1324 /* sort_up_index_number --- qsort comparison function; ascending index numbers. */
1327 sort_up_index_number(const void *p1, const void *p2)
1329 const NODE *n1, *n2;
1332 n1 = *((const NODE *const *) p1);
1333 n2 = *((const NODE *const *) p2);
1335 if (n1->ahname_num < n2->ahname_num)
1338 ret = (n1->ahname_num > n2->ahname_num);
1340 /* break a tie with the index string itself */
1342 return cmp_string(n1, n2);
1347 /* sort_down_index_number --- qsort comparison function; descending index numbers */
1350 sort_down_index_number(const void *p1, const void *p2)
1352 return -sort_up_index_number(p1, p2);
1356 /* sort_up_value_string --- qsort comparison function; ascending value string */
1359 sort_up_value_string(const void *p1, const void *p2)
1361 const NODE *t1, *t2;
1364 /* we're passed a pair of index (array subscript) nodes */
1365 t1 = *(const NODE *const *) p1;
1366 t2 = *(const NODE *const *) p2;
1368 /* and we want to compare the element values they refer to */
1372 if (n1->type == Node_var_array) {
1373 /* return 0 if n2 is a sub-array too, else return 1 */
1374 return (n2->type != Node_var_array);
1376 if (n2->type == Node_var_array)
1377 return -1; /* n1 (scalar) < n2 (sub-array) */
1379 /* n1 and n2 both have string values; See sort_force_value_string(). */
1380 return cmp_string(n1, n2);
1384 /* sort_down_value_string --- descending value string */
1387 sort_down_value_string(const void *p1, const void *p2)
1389 return -sort_up_value_string(p1, p2);
1392 /* sort_up_value_number --- qsort comparison function; ascending value number */
1395 sort_up_value_number(const void *p1, const void *p2)
1397 const NODE *t1, *t2;
1401 /* we're passed a pair of index (array subscript) nodes */
1402 t1 = *(const NODE *const *) p1;
1403 t2 = *(const NODE *const *) p2;
1405 /* and we want to compare the element values they refer to */
1409 if (n1->type == Node_var_array) {
1410 /* return 0 if n2 is a sub-array too, else return 1 */
1411 return (n2->type != Node_var_array);
1413 if (n2->type == Node_var_array)
1414 return -1; /* n1 (scalar) < n2 (sub-array) */
1416 /* n1 and n2 both Node_val, and force_number'ed */
1417 if (n1->numbr < n2->numbr)
1420 ret = (n1->numbr > n2->numbr);
1424 * Use string value to guarantee same sort order on all
1425 * versions of qsort().
1427 n1 = force_string(n1);
1428 n2 = force_string(n2);
1429 ret = cmp_string(n1, n2);
1435 /* sort_down_value_number --- descending value number */
1438 sort_down_value_number(const void *p1, const void *p2)
1440 return -sort_up_value_number(p1, p2);
1443 /* sort_up_value_type --- qsort comparison function; ascending value type */
1446 sort_up_value_type(const void *p1, const void *p2)
1448 const NODE *t1, *t2;
1451 /* we're passed a pair of index (array subscript) nodes */
1452 t1 = *(const NODE *const *) p1;
1453 t2 = *(const NODE *const *) p2;
1455 /* and we want to compare the element values they refer to */
1459 /* 1. Arrays vs. scalar, scalar is less than array */
1460 if (n1->type == Node_var_array) {
1461 /* return 0 if n2 is a sub-array too, else return 1 */
1462 return (n2->type != Node_var_array);
1464 if (n2->type == Node_var_array) {
1465 return -1; /* n1 (scalar) < n2 (sub-array) */
1469 /* 2. Resolve MAYBE_NUM, so that have only NUMBER or STRING */
1470 if ((n1->flags & MAYBE_NUM) != 0)
1471 (void) force_number(n1);
1472 if ((n2->flags & MAYBE_NUM) != 0)
1473 (void) force_number(n2);
1475 if ((n1->flags & NUMBER) != 0 && (n2->flags & NUMBER) != 0) {
1476 if (n1->numbr < n2->numbr)
1478 else if (n1->numbr > n2->numbr)
1484 /* 3. All numbers are less than all strings. This is aribitrary. */
1485 if ((n1->flags & NUMBER) != 0 && (n2->flags & STRING) != 0) {
1487 } else if ((n1->flags & STRING) != 0 && (n2->flags & NUMBER) != 0) {
1491 /* 4. Two strings */
1492 return cmp_string(n1, n2);
1495 /* sort_down_value_type --- descending value type */
1498 sort_down_value_type(const void *p1, const void *p2)
1500 return -sort_up_value_type(p1, p2);
1503 /* sort_user_func --- user defined qsort comparison function */
1506 sort_user_func(const void *p1, const void *p2)
1508 const NODE *t1, *t2;
1509 NODE *idx1, *idx2, *val1, *val2;
1513 t1 = *((const NODE *const *) p1);
1514 t2 = *((const NODE *const *) p2);
1516 idx1 = make_string(t1->ahname_str, t1->ahname_len);
1517 idx2 = make_string(t2->ahname_str, t2->ahname_len);
1521 code = TOP()->code_ptr; /* comparison function call instructions */
1523 /* setup 4 arguments to comp_func() */
1525 if (val1->type == Node_val)
1529 if (val2->type == Node_val)
1533 /* execute the comparison function */
1534 (void) interpret(code);
1536 /* return value of the comparison function */
1539 return (ret < 0.0) ? -1 : (ret > 0.0);
1542 /* sort_force_index_number -- pre-process list items for sorting indices as numbers */
1545 sort_force_index_number(NODE **list, size_t num_elems)
1549 static NODE temp_node;
1551 for (i = 0; i < num_elems; i++) {
1554 if ((r->flags & NUMIND) != 0) /* once in a lifetime is plenty */
1556 temp_node.type = Node_val;
1557 temp_node.stptr = r->ahname_str;
1558 temp_node.stlen = r->ahname_len;
1559 temp_node.flags = 0; /* only interested in the return value of r_force_number */
1560 r->ahname_num = r_force_number(& temp_node);
1565 /* sort_force_value_number -- pre-process list items for sorting values as numbers */
1568 sort_force_value_number(NODE **list, size_t num_elems)
1573 for (i = 0; i < num_elems; i++) {
1576 if (val->type == Node_val)
1577 (void) force_number(val);
1581 /* sort_force_value_string -- pre-process list items for sorting values as strings */
1584 sort_force_value_string(NODE **list, size_t num_elems)
1589 for (i = 0; i < num_elems; i++) {
1592 if (val->type == Node_val)
1593 r->ahvalue = force_string(val);
1597 /* assoc_list -- construct, and optionally sort, a list of array elements */
1600 assoc_list(NODE *array, const char *sort_str, SORT_CTXT sort_ctxt)
1602 typedef void (*qsort_prefunc)(NODE **, size_t);
1603 typedef int (*qsort_compfunc)(const void *, const void *);
1605 static const struct qsort_funcs {
1607 qsort_compfunc comp_func;
1608 qsort_prefunc pre_func; /* pre-processing of list items */
1610 { "@ind_str_asc", sort_up_index_string, 0 },
1611 { "@ind_num_asc", sort_up_index_number, sort_force_index_number },
1612 { "@val_str_asc", sort_up_value_string, sort_force_value_string },
1613 { "@val_num_asc", sort_up_value_number, sort_force_value_number },
1614 { "@ind_str_desc", sort_down_index_string, 0 },
1615 { "@ind_num_desc", sort_down_index_number, sort_force_index_number },
1616 { "@val_str_desc", sort_down_value_string, sort_force_value_string },
1617 { "@val_num_desc", sort_down_value_number, sort_force_value_number },
1618 { "@val_type_asc", sort_up_value_type, 0 },
1619 { "@val_type_desc", sort_down_value_type, 0 },
1620 { "@unsorted", 0, 0 },
1624 size_t num_elems, i, j;
1625 qsort_compfunc cmp_func = 0;
1626 qsort_prefunc pre_func = 0;
1627 INSTRUCTION *code = NULL;
1631 num_elems = array->table_size;
1632 assert(num_elems > 0);
1634 for (qi = 0, j = sizeof(sort_funcs)/sizeof(sort_funcs[0]); qi < j; qi++) {
1635 if (strcmp(sort_funcs[qi].name, sort_str) == 0)
1639 if (qi >= 0 && qi < j) {
1640 cmp_func = sort_funcs[qi].comp_func;
1641 pre_func = sort_funcs[qi].pre_func;
1643 } else { /* unrecognized */
1647 assert(sort_str != NULL);
1649 for (sp = sort_str; *sp != '\0'
1650 && ! isspace((unsigned char) *sp); sp++)
1653 /* empty string or string with space(s) not valid as function name */
1654 if (sp == sort_str || *sp != '\0')
1655 fatal(_("`%s' is invalid as a function name"), sort_str);
1657 f = lookup(sort_str);
1658 if (f == NULL || f->type != Node_func)
1659 fatal(_("sort comparison function `%s' is not defined"), sort_str);
1661 cmp_func = sort_user_func;
1662 /* pre_func is still NULL */
1664 /* make function call instructions */
1665 code = bcalloc(Op_func_call, 2, 0);
1666 code->func_body = f;
1667 code->func_name = NULL; /* not needed, func_body already assigned */
1668 (code + 1)->expr_count = 4; /* function takes 4 arguments */
1669 code->nexti = bcalloc(Op_stop, 1, 0);
1671 /* make non-redirected getline, exit, `next' and `nextfile' fatal in
1672 * callback function by setting currule in interpret()
1676 (code + 1)->inrule = currule; /* save current rule */
1682 /* allocate space for array; the extra space is used in for(i in a) opcode (eval.c) */
1683 emalloc(list, NODE **, (num_elems + 1) * sizeof(NODE *), "assoc_list");
1686 for (i = j = 0; i < array->array_size; i++)
1687 for (r = array->var_array[i]; r != NULL; r = r->ahnext)
1688 list[j++] = dupnode(r);
1689 list[num_elems] = NULL;
1691 if (! cmp_func) /* unsorted */
1694 /* special pre-processing of list items */
1696 pre_func(list, num_elems);
1698 qsort(list, num_elems, sizeof(NODE *), cmp_func); /* shazzam! */
1700 if (cmp_func == sort_user_func) {
1702 currule = (code + 1)->inrule; /* restore current rule */
1703 bcfree(code->nexti); /* Op_stop */
1704 bcfree(code); /* Op_func_call */
1712 From bonzini@gnu.org Mon Oct 28 16:05:26 2002
1713 Date: Mon, 28 Oct 2002 13:33:03 +0100
1714 From: Paolo Bonzini <bonzini@gnu.org>
1715 To: arnold@skeeve.com
1716 Subject: Hash function
1717 Message-ID: <20021028123303.GA6832@biancaneve>
1719 Here is the hash function I'm using in GNU Smalltalk. The scrambling is
1720 needed if you use powers of two as the table sizes. If you use primes it
1723 To use double-hashing with power-of-two size, you should use the
1724 _gst_hash_string(str, len) as the primary hash and
1725 scramble(_gst_hash_string (str, len)) | 1 as the secondary hash.
1731 * ADR: Slightly modified to work w/in the context of gawk.
1734 static unsigned long
1735 gst_hash_string(const char *str, size_t len, unsigned long hsize, size_t *code)
1737 unsigned long hashVal = 1497032417; /* arbitrary value */
1742 hashVal += (hashVal << 10);
1743 hashVal ^= (hashVal >> 6);
1746 ret = scramble(hashVal);
1757 static unsigned long
1758 scramble(unsigned long x)
1760 if (sizeof(long) == 4) {
1763 x += (y << 10) | (y >> 22);
1764 x += (x << 6) | (x >> 26);
1765 x -= (x << 16) | (x >> 16);
1768 x += (x << 21) | (x >> 11);
1769 x += (x << 5) | (x >> 27);
1770 x += (x << 27) | (x >> 5);