3 % Revision 1.4 1996/09/18 00:37:25 mike
4 % 1) Fix stupid bozo in A[expr], expr is numeric and not integer.
5 % 2) Add static for non-ansi compilers.
6 % 3) Minor tweaks to documentation.
8 % Revision 1.3 1996/07/28 21:55:32 mike
9 % trivial change -- add extra {}
11 % Revision 1.2 1996/02/25 23:42:25 mike
12 % Fix zfree bug in array_clear.
13 % Clean up documentation.
17 \def\hi{\smallskip\hangindent\parindent\indent\ignorespaces}
21 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
27 <<array typedefs and [[#defines]]>>
28 <<interface prototypes>>
38 <<local constants, defs and prototypes>>
39 <<interface functions>>
43 The type [[ARRAY]] is a pointer to a [[struct array]].
44 The [[size]] field is the number of elements in the table.
45 The meaning of the other fields depends on the [[type]] field.
47 <<array typedefs and [[#defines]]>>=
48 typedef struct array {
49 PTR ptr ; /* What this points to depends on the type */
50 unsigned size ; /* number of elts in the table */
51 unsigned limit ; /* Meaning depends on type */
52 unsigned hmask ; /* bitwise and with hash value to get table index */
53 short type ; /* values in AY_NULL .. AY_SPLIT */
57 There are three types of arrays and these are distinguished by the
58 [[type]] field in the structure. The types are:
60 \hi [[AY_NULL]]\quad The array is empty and the [[size]] field is always
61 zero. The other fields have no meaning.
63 \hi [[AY_SPLIT]]\quad The array was created by the [[AWK]] built-in
64 [[split]]. The return value from [[split]] is stored in the [[size]]
65 field. The [[ptr]] field points at a vector of [[CELLs]]. The number
66 of [[CELLs]] is the [[limit]] field. It is always true that
67 ${\it size}\leq{\it limit}$. The address of [[A[i]]] is [[(CELL*)A->ptr+i-1]]
68 for $1\leq i\leq{\it size}$. The [[hmask]] field has no meaning.
70 \hi {\bf Hash Table}\quad The array is a hash table. If the [[AY_STR]] bit
71 in the [[type]] field is set, then the table is keyed on strings.
72 If the [[AY_INT]] bit in the [[type]] field is set, then the table is
73 keyed on integers. Both bits can be set, and then the two keys are
74 consistent, i.e., look up of [[A[-14]]] and [[A["-14"]]] will
75 return identical [[CELL]] pointers although the look up methods will
76 be different. In this case, the [[size]] field is the number of hash nodes
77 in the table. When insertion of a new element would cause [[size]] to
78 exceed [[limit]], the table grows by doubling the number of hash chains.
80 $({\it hmask}+1){\it max\_ave\_list\_length}={\it limit}$, is always true.
81 {\it Max\_ave\_list\_length} is a tunable constant.
84 <<array typedefs and [[#defines]]>>=
91 The hash tables are linked lists of nodes, called [[ANODEs]].
92 The number of lists is [[hmask+1]] which is always a power of two.
93 The [[ptr]] field points at a vector of list heads. Since there are
94 potentially two types of lists, integer lists and strings lists,
95 each list head is a structure, [[DUAL_LINK]].
97 <<local constants, defs and prototypes>>=
99 typedef struct {struct anode *slink, *ilink ;} DUAL_LINK ;
102 The string lists are chains connected by [[slinks]] and the integer
103 lists are chains connected by [[ilinks]]. We sometimes refer to these
104 lists as slists and ilists, respectively.
105 The elements on the lists are [[ANODEs]].
106 The fields of an [[ANODE]] are:
108 \hi [[slink]]\quad The link field for slists.
109 \hi [[ilink]]\quad The link field for ilists.
110 \hi [[sval]]\quad If non-null, then [[sval]] is a pointer to a string
111 key. For a given table, if the [[AY_STR]] bit is set then every
112 [[ANODE]] has a non-null [[sval]] field and conversely, if [[AY_STR]]
113 is not set, then every [[sval]] field is null.
115 \hi [[hval]]\quad The hash value of [[sval]]. This field has no
116 meaning if [[sval]] is null.
117 \hi [[ival]]\quad The integer key. The field has no meaning if set
118 to the constant, [[NOT_AN_IVALUE]]. If the [[AY_STR]] bit is off,
119 then every [[ANODE]] will have a valid [[ival]] field. If the
120 [[AY_STR]] bit is on, then the [[ival]] field may or may not be
123 \hi [[cell]]\quad The data field in the hash table.
126 So the value of $A[\expr]$ is stored in the [[cell]] field, and if
127 \expr{} is an integer, then \expr{} is stored in [[ival]], else it
128 is stored in [[sval]].
131 <<local constants, defs and prototypes>>=
132 typedef struct anode {
133 struct anode *slink ;
134 struct anode *ilink ;
142 @ Interface Functions
143 The interface functions are:
146 \hi [[CELL* array_find(ARRAY A, CELL *cp, int create_flag)]] returns a
147 pointer to $A[\expr]$ where [[cp]] is a pointer to the [[CELL]]
148 holding \expr\/. If the [[create_flag]] is on and \expr\/ is not
149 an element of [[A]], then the element is created with value \Null\/.
151 \hi [[void array_delete(ARRAY A, CELL *cp)]] removes an element
152 $A[\expr]$ from the array $A$. [[cp]] points at the [[CELL]] holding
155 \hi [[void array_load(ARRAY A, int cnt)]] builds a split array. The
156 values $A[1..{\it cnt}]$ are copied from the array
157 ${\it split\_buff}[0..{\it cnt}-1]$.
159 \hi [[void array_clear(ARRAY A)]] removes all elements of $A$. The
160 type of $A$ is then [[AY_NULL]].
162 \hi [[STRING** array_loop_vector(ARRAY A, unsigned *sizep)]]
164 to a linear vector that holds all the strings that are indices of $A$.
165 The size of the the vector is returned indirectly in [[*sizep]].
166 If [[A->size==0]], a \Null{} pointer is returned.
168 \hi [[CELL* array_cat(CELL *sp, int cnt)]] concatenates the elements
169 of ${\it sp}[1-cnt..0]$, with each element separated by [[SUBSEP]], to
170 compute an array index. For example, on a reference to $A[i,j]$,
171 [[array_cat]] computes $i\circ{\it SUBSEP}\circ j$ where
172 $\circ$ denotes concatenation.
175 <<interface prototypes>>=
176 CELL* PROTO(array_find, (ARRAY,CELL*,int)) ;
177 void PROTO(array_delete, (ARRAY,CELL*)) ;
178 void PROTO(array_load, (ARRAY,int)) ;
179 void PROTO(array_clear, (ARRAY)) ;
180 STRING** PROTO(array_loop_vector, (ARRAY,unsigned*)) ;
181 CELL* PROTO(array_cat, (CELL*,int)) ;
184 Any reference to $A[\expr]$ creates a call to
185 [[array_find(A,cp,CREATE)]] where [[cp]] points at the cell holding
186 \expr\/. The test, $\expr \hbox{ in } A$, creates a call to
187 [[array_find(A,cp,NO_CREATE)]].
189 <<array typedefs and [[#defines]]>>=
194 [[Array_find]] is hash-table lookup that breaks into two cases:
196 \hi 1)\quad If [[*cp]] is numeric and integer valued, then lookup by
197 integer value using [[find_by_ival]]. If [[*cp]] is numeric, but not
198 integer valued, then convert to string with [[sprintf(CONVFMT,...)]] and
201 \hi 2)\quad if [[*cp]] is string valued, then lookup by string value
202 using [[find_by_sval]].
204 <<interface functions>>=
205 CELL* array_find(A, cp, create_flag)
211 if (A->size == 0 && !create_flag)
212 /* eliminating this trivial case early avoids unnecessary conversions later */
216 <<if the [[*cp]] is an integer, find by integer value else find by string value>>
219 ap = find_by_sval(A, &null_str, create_flag) ;
222 ap = find_by_sval(A, string(cp), create_flag) ;
225 return ap ? &ap->cell : (CELL *) 0 ;
229 To test whether [[cp->dval]] is integer, we convert to the nearest
230 integer by rounding towards zero (done by [[do_to_I]]) and then cast
231 back to double. If we get the same number we started with, then
232 [[cp->dval]] is integer valued.
234 <<if the [[*cp]] is an integer, find by integer value else find by string value>>=
236 double d = cp->dval ;
237 Int ival = d_to_I(d) ;
238 if ((double)ival == d) {
239 if (A->type == AY_SPLIT) {
240 if (ival >= 1 && ival <= A->size)
241 return (CELL*)A->ptr+(ival-1) ;
242 if (!create_flag) return (CELL*) 0 ;
243 convert_split_array_to_table(A) ;
245 else if (A->type == AY_NULL) make_empty_table(A, AY_INT) ;
246 ap = find_by_ival(A, ival, create_flag) ;
249 /* convert to string */
252 sprintf(buff, string(CONVFMT)->str, d) ;
253 sval = new_STRING(buff) ;
254 ap = find_by_sval(A,sval,create_flag) ;
260 When we get to the function [[find_by_ival]], the search has been reduced
261 to lookup in a hash table by integer value.
264 static ANODE* find_by_ival(A, ival, create_flag)
269 DUAL_LINK *table = (DUAL_LINK*) A->ptr ;
270 unsigned index = ival & A->hmask ;
271 ANODE *p = table[index].ilink ; /* walks ilist */
272 ANODE *q = (ANODE*) 0 ; /* trails p */
276 <<search by string value if needed and create if needed>>
279 else if (p->ival == ival) {
280 /* found it, now move to the front */
281 if (!q) /* already at the front */
283 /* delete for insertion at the front */
284 q->ilink = p->ilink ;
287 q = p ; p = q->ilink ;
289 /* insert at the front */
290 p->ilink = table[index].ilink ;
291 table[index].ilink = p ;
296 When a search by integer value fails, we have to check by string
298 handle the case insertion by [[A["123"]]] and later search as
299 [[A[123]]]. This string search is necessary if and only if the
300 [[AY_STR]] bit is set. An important point is that all [[ANODEs]] get
301 created with a valid [[sval]] if [[AY_STR]] is set, because then creation
302 of new nodes always occurs in a call to [[find_by_sval]].
304 <<search by string value if needed and create if needed>>=
305 if (A->type & AY_STR) {
306 /* need to search by string */
309 sprintf(buff, INT_FMT, ival) ;
310 sval = new_STRING(buff) ;
311 p = find_by_sval(A, sval, create_flag) ;
313 if (!p) return (ANODE*) 0 ;
315 else if (create_flag) {
317 p->sval = (STRING*) 0 ;
318 p->cell.type = C_NOINIT ;
319 if (++A->size > A->limit) {
320 double_the_hash_table(A) ; /* changes table, may change index */
321 table = (DUAL_LINK*) A->ptr ;
322 index = A->hmask & ival ;
325 else return (ANODE*) 0 ;
330 Searching by string value is easier because [[AWK]] arrays are really
331 string associations. If the array does not have the [[AY_STR]] bit set,
332 then we have to convert the array to a dual hash table with strings
333 which is done by the function [[add_string_associations]].
336 static ANODE* find_by_sval(A, sval, create_flag)
341 unsigned hval = ahash(sval) ;
342 char *str = sval->str ;
345 ANODE *p ; /* walks list */
346 ANODE *q = (ANODE*) 0 ; /* trails p */
347 if (! (A->type & AY_STR)) add_string_associations(A) ;
348 table = (DUAL_LINK*) A->ptr ;
349 index = hval & A->hmask ;
350 p = table[index].slink ;
354 <<create a new anode for [[sval]]>>
357 else return (ANODE*) 0 ;
359 else if (p->hval == hval && strcmp(p->sval->str,str) == 0 ) {
361 if (!q) /* already at the front */
363 else { /* delete for move to the front */
364 q->slink = p->slink ;
368 q = p ; p = q->slink ;
370 p->slink = table[index].slink ;
371 table[index].slink = p ;
376 One [[Int]] value is reserved to show that the [[ival]] field is invalid.
377 This works because [[d_to_I]] returns a value in [[[-Max_Int, Max_Int]]].
379 <<local constants, defs and prototypes>>=
380 #define NOT_AN_IVALUE (-Max_Int-1) /* usually 0x80000000 */
382 <<create a new anode for [[sval]]>>=
387 p->ival = NOT_AN_IVALUE ;
389 p->cell.type = C_NOINIT ;
390 if (++A->size > A->limit) {
391 double_the_hash_table(A) ; /* changes table, may change index */
392 table = (DUAL_LINK*) A->ptr ;
393 index = hval & A->hmask ;
398 On entry to [[add_string_associations]], we know that the [[AY_STR]] bit
399 is not set. We convert to a dual hash table, then walk all the integer
400 lists and put each [[ANODE]] on a string list.
403 static void add_string_associations(A)
406 if (A->type == AY_NULL) make_empty_table(A, AY_STR) ;
409 int i ; /* walks table */
410 ANODE *p ; /* walks ilist */
412 if (A->type == AY_SPLIT) convert_split_array_to_table(A) ;
413 table = (DUAL_LINK*) A->ptr ;
414 for(i=0;i <= A->hmask; i++) {
417 sprintf(buff, INT_FMT, p->ival) ;
418 p->sval = new_STRING(buff) ;
419 p->hval = ahash(p->sval) ;
420 p->slink = table[A->hmask&p->hval].slink ;
421 table[A->hmask&p->hval].slink = p ;
430 The execution of the statement, $\hbox{\it delete }A[\expr]$, creates a
431 call to [[array_delete(ARRAY A, CELL *cp)]]. Depending on the
432 type of [[*cp]], the call is routed to [[find_by_sval]] or [[find_by_ival]].
433 Each of these functions leaves its return value on the front of an
434 slist or ilist, respectively, and then it is deleted from the front of
435 the list. The case where $A[\expr]$ is on two lists, e.g.,
436 [[A[12]]] and [[A["12"]]] is checked by examining the [[sval]] and
437 [[ival]] fields of the returned [[ANODE*]].
439 <<interface functions>>=
440 void array_delete(A, cp)
445 if (A->size == 0) return ;
449 double d = cp->dval ;
450 Int ival = d_to_I(d) ;
451 if ((double)ival == d) <<delete by integer value and return>>
452 else { /* get the string value */
455 sprintf(buff, string(CONVFMT)->str, d) ;
456 sval = new_STRING(buff) ;
457 ap = find_by_sval(A, sval, NO_CREATE) ;
463 ap = find_by_sval(A, &null_str, NO_CREATE) ;
466 ap = find_by_sval(A, string(cp), NO_CREATE) ;
469 if (ap) { /* remove from the front of the slist */
470 DUAL_LINK *table = (DUAL_LINK*) A->ptr ;
471 table[ap->hval&A->hmask].slink = ap->slink ;
472 <<if [[ival]] is valid, remove [[ap]] from its ilist>>
473 free_STRING(ap->sval) ;
474 cell_destroy(&ap->cell) ;
476 <<decrement [[A->size]]>>
480 <<delete by integer value and return>>=
482 if (A->type == AY_SPLIT)
483 if (ival >=1 && ival <= A->size) convert_split_array_to_table(A) ;
484 else return ; /* ival not in range */
485 ap = find_by_ival(A, ival, NO_CREATE) ;
486 if (ap) { /* remove from the front of the ilist */
487 DUAL_LINK *table = (DUAL_LINK*) A->ptr ;
488 table[ap->ival & A->hmask].ilink = ap->ilink ;
489 <<if [[sval]] is valid, remove [[ap]] from its slist>>
490 cell_destroy(&ap->cell) ;
492 <<decrement [[A->size]]>>
498 Even though we found a node by searching an ilist it might also
499 be on an slist and vice-versa.
501 <<if [[sval]] is valid, remove [[ap]] from its slist>>=
504 int index = ap->hval & A->hmask ;
505 p = table[index].slink ;
506 while(p != ap) { q = p ; p = q->slink ; }
507 if (q) q->slink = p->slink ;
508 else table[index].slink = p->slink ;
509 free_STRING(ap->sval) ;
512 <<if [[ival]] is valid, remove [[ap]] from its ilist>>=
513 if (ap->ival != NOT_AN_IVALUE) {
515 int index = ap->ival & A->hmask ;
516 p = table[index].ilink ;
517 while(p != ap) { q = p ; p = q->ilink ; }
518 if (q) q->ilink = p->ilink ;
519 else table[index].ilink = p->ilink ;
523 When the size of a hash table drops below a certain value, it might
524 be profitable to shrink the hash table. Currently we don't do this,
525 because our guess is that it would be a waste of time for most
526 [[AWK]] applications. However, we do convert an array to [[AY_NULL]]
527 when the size goes to zero which would resize a large hash table
528 that had been completely cleared by successive deletions.
530 <<decrement [[A->size]]>>=
531 if (--A->size == 0) array_clear(A) ;
534 @ Building an Array with Split
535 A simple operation is to create an array with the [[AWK]]
536 primitive [[split]]. The code that performs [[split]] puts the
537 pieces in the global buffer [[split_buff]]. The call
538 [[array_load(A, cnt)]] moves the [[cnt]] elements from [[split_buff]] to
539 [[A]]. This is the only way an array of type [[AY_SPLIT]] is
542 <<interface functions>>=
543 void array_load(A, cnt)
547 CELL *cells ; /* storage for A[1..cnt] */
548 int i ; /* index into cells[] */
549 <<clean up the existing array and prepare an empty split array>>
550 cells = (CELL*) A->ptr ;
552 <<if [[cnt]] exceeds [[MAX_SPLIT]], load from overflow list and adjust [[cnt]]>>
553 for(i=0;i < cnt; i++) {
554 cells[i].type = C_MBSTRN ;
555 cells[i].ptr = split_buff[i] ;
560 When [[cnt > MAX_SPLIT]], [[split_buff]] was not big enough to hold
561 everything so the overflow went on the [[split_ov_list]].
562 The elements from [[MAX_SPLIT+1]] to [[cnt]] get loaded into
563 [[cells[MAX_SPLIT..cnt-1]]] from this list.
565 <<if [[cnt]] exceeds [[MAX_SPLIT]], load from overflow list and adjust [[cnt]]>>=
566 if (cnt > MAX_SPLIT) {
567 SPLIT_OV *p = split_ov_list ;
569 split_ov_list = (SPLIT_OV*) 0 ;
572 cells[i].type = C_MBSTRN ;
573 cells[i].ptr = (PTR) p->sval ;
574 q = p ; p = q->link ; ZFREE(q) ;
581 If the array [[A]] is a split array and big enough then we reuse it,
582 otherwise we need to allocate a new split array.
583 When we allocate a block of [[CELLs]] for a split array, we round up
586 <<clean up the existing array and prepare an empty split array>>=
587 if (A->type != AY_SPLIT || A->limit < cnt) {
589 A->limit = (cnt&~3)+4 ;
590 A->ptr = zmalloc(A->limit*sizeof(CELL)) ;
594 for(i=0;i < A->size; i++) cell_destroy((CELL*)A->ptr+i) ;
597 The function [[array_clear(ARRAY A)]] converts [[A]] to type [[AY_NULL]]
598 and frees all storage used by [[A]] except for the [[struct array]]
599 itself. This function gets called in two contexts:
600 (1)~when an array local to a user function goes out of scope and
601 (2)~execution of the [[AWK]] statement, [[delete A]].
603 <<interface functions>>=
609 if (A->type == AY_SPLIT) {
610 for(i=0;i < A->size; i++) cell_destroy((CELL*)A->ptr+i) ;
611 zfree(A->ptr, A->limit * sizeof(CELL)) ;
613 else if (A->type & AY_STR) {
614 DUAL_LINK *table = (DUAL_LINK*) A->ptr ;
615 for(i=0;i <= A->hmask; i++) {
618 q = p ; p = q->slink ;
619 free_STRING(q->sval) ;
620 cell_destroy(&q->cell) ;
624 zfree(A->ptr, (A->hmask+1)*sizeof(DUAL_LINK)) ;
626 else if (A->type & AY_INT) {
627 DUAL_LINK *table = (DUAL_LINK*) A->ptr ;
628 for(i=0;i <= A->hmask; i++) {
631 q = p ; p = q->ilink ;
632 cell_destroy(&q->cell) ;
636 zfree(A->ptr, (A->hmask+1)*sizeof(DUAL_LINK)) ;
638 memset(A, 0, sizeof(*A)) ;
643 @ Constructor and Conversions
644 Arrays are always created as empty arrays of type [[AY_NULL]].
645 Global arrays are never destroyed although they can go empty or have
646 their type change by conversion. The only constructor function is
649 <<array typedefs and [[#defines]]>>=
650 #define new_ARRAY() ((ARRAY)memset(ZMALLOC(struct array),0,sizeof(struct array)))
653 Hash tables only get constructed by conversion. This happens in two
655 The function [[make_empty_table]] converts an empty array of type
656 [[AY_NULL]] to an empty hash table. The number of lists in the table
657 is a power of 2 determined by the constant [[STARTING_HMASK]].
658 The limit size of the table is determined by the constant
659 [[MAX_AVE_LIST_LENGTH]] which is the largest average size of the hash
660 lists that we are willing to tolerate before enlarging the table.
661 When [[A->size]] exceeds [[A->limit]],
662 the hash table grows in size by doubling the number of lists.
663 [[A->limit]] is then reset to [[MAX_AVE_LIST_LENGTH]] times
666 <<local constants, defs and prototypes>>=
667 #define STARTING_HMASK 63 /* 2^6-1, must have form 2^n-1 */
668 #define MAX_AVE_LIST_LENGTH 12
669 #define hmask_to_limit(x) (((x)+1)*MAX_AVE_LIST_LENGTH)
672 static void make_empty_table(A, type)
674 int type ; /* AY_INT or AY_STR */
676 size_t sz = (STARTING_HMASK+1)*sizeof(DUAL_LINK) ;
678 A->hmask = STARTING_HMASK ;
679 A->limit = hmask_to_limit(STARTING_HMASK) ;
680 A->ptr = memset(zmalloc(sz), 0, sz) ;
684 The other way a hash table gets constructed is when a split array is
685 converted to a hash table of type [[AY_INT]].
688 static void convert_split_array_to_table(A)
691 CELL *cells = (CELL*) A->ptr ;
692 int i ; /* walks cells */
694 int j ; /* walks table */
695 unsigned entry_limit = A->limit ;
696 <<determine the size of the hash table and allocate>>
697 /* insert each cells[i] in the new hash table on an ilist */
698 for(i=0, j=1 ;i < A->size; i++) {
699 ANODE *p = ZMALLOC(ANODE) ;
700 p->sval = (STRING*) 0 ;
703 p->ilink = table[j].ilink ;
705 j++ ; j &= A->hmask ;
708 zfree(cells, entry_limit*sizeof(CELL)) ;
712 To determine the size of the table, we set the initial size to
713 [[STARTING_HMASK+1]] and then double the size until
714 [[A->size <= A->limit]].
716 <<determine the size of the hash table and allocate>>=
717 A->hmask = STARTING_HMASK ;
718 A->limit = hmask_to_limit(STARTING_HMASK) ;
719 while(A->size > A->limit) {
720 A->hmask = (A->hmask<<1) + 1 ; /* double the size */
721 A->limit = hmask_to_limit(A->hmask) ;
724 size_t sz = (A->hmask+1)*sizeof(DUAL_LINK) ;
725 A->ptr = memset(zmalloc(sz), 0, sz) ;
726 table = (DUAL_LINK*) A->ptr ;
730 @ Doubling the Size of a Hash Table
731 The whole point of making the table size a power of two is to
732 facilitate resizing the table. If the table size is $2^n$ and
733 $h$ is the hash key, then $h\bmod 2^n$ is the hash chain index
734 which can be calculated with bit-wise and,
735 {\mathchardef~="2026 $h ~ (2^n-1)$}.
736 When the table size doubles, the new bit-mask has one more bit
737 turned on. Elements of an old hash chain whose hash value have this bit
738 turned on get moved to a new chain. Elements with this bit turned off
739 stay on the same chain. On average only half the old chain moves to the
740 new chain. If the old chain is at ${\it table}[i],\ 0\le i < 2^n$,
741 then the elements that move, all move to the new chain at
742 ${\it table}[i+2^n]$.
745 static void double_the_hash_table(A)
748 unsigned old_hmask = A->hmask ;
749 unsigned new_hmask = (old_hmask<<1)+1 ;
751 <<allocate the new hash table>>
752 <<if the old table has string lists, move about half the string nodes>>
753 <<if the old table has integer lists, move about half the integer nodes>>
754 A->hmask = new_hmask ;
755 A->limit = hmask_to_limit(new_hmask) ;
759 <<allocate the new hash table>>=
760 A->ptr = zrealloc(A->ptr, (old_hmask+1)*sizeof(DUAL_LINK),
761 (new_hmask+1)*sizeof(DUAL_LINK)) ;
762 table = (DUAL_LINK*) A->ptr ;
763 /* zero out the new part which is the back half */
764 memset(&table[old_hmask+1], 0, (old_hmask+1)*sizeof(DUAL_LINK)) ;
766 <<if the old table has string lists, move about half the string nodes>>=
767 if (A->type & AY_STR) {
768 int i ; /* index to old lists */
769 int j ; /* index to new lists */
770 ANODE *p ; /* walks an old list */
771 ANODE *q ; /* trails p for deletion */
772 ANODE *tail ; /* builds new list from the back */
773 ANODE dummy0, dummy1 ;
774 for(i=0, j=old_hmask+1;i <= old_hmask; i++, j++)
775 <<walk one old string list, creating one new string list>>
779 As we walk an old string list with pointer [[p]], the expression
780 [[p->hval & new_hmask]] takes one of two values. If it is equal
781 to [[p->hval & old_hmask]] (which equals [[i]]),
782 then the node stays otherwise it gets moved
783 to a new string list at [[j]]. The new string list preserves order so that
784 the positions of the move-to-the-front heuristic are preserved.
785 Nodes moving to the new list are appended at pointer [[tail]].
786 The [[ANODEs]], [[dummy0]]~and [[dummy1]], are sentinels that remove
787 special handling of boundary conditions.
789 <<walk one old string list, creating one new string list>>=
792 q->slink = p = table[i].slink ;
795 if ((p->hval&new_hmask) != i) { /* move it */
796 q->slink = p->slink ;
797 tail = tail->slink = p ;
802 table[i].slink = dummy0.slink ;
803 tail->slink = (ANODE*) 0 ;
804 table[j].slink = dummy1.slink ;
808 The doubling of the integer lists is exactly the same except that
809 [[slink]] is replaced by [[ilink]] and [[hval]] is replaced by [[ival]].
811 <<if the old table has integer lists, move about half the integer nodes>>=
812 if (A->type & AY_INT) {
813 int i ; /* index to old lists */
814 int j ; /* index to new lists */
815 ANODE *p ; /* walks an old list */
816 ANODE *q ; /* trails p for deletion */
817 ANODE *tail ; /* builds new list from the back */
818 ANODE dummy0, dummy1 ;
819 for(i=0, j=old_hmask+1;i <= old_hmask; i++, j++)
820 <<walk one old integer list, creating one new integer list>>
823 <<walk one old integer list, creating one new integer list>>=
826 q->ilink = p = table[i].ilink ;
829 if ((p->ival&new_hmask) != i) { /* move it */
830 q->ilink = p->ilink ;
831 tail = tail->ilink = p ;
836 table[i].ilink = dummy0.ilink ;
837 tail->ilink = (ANODE*) 0 ;
838 table[j].ilink = dummy1.ilink ;
841 @ Initializing Array Loops
842 Our mechanism for dealing with execution of the statement,
844 \centerline{[[for(i in A) {]] {\it statements} [[}]]}
847 is simple. We allocate a vector of [[STRING*]] of size,
848 [[A->size]]. Each element of the vector is a string key for~[[A]].
849 Note that if the [[AY_STR]] bit of [[A]] is not set, then [[A]]
850 has to be converted to a string hash table, because the index
851 [[i]] walks string indices.
853 To execute the loop, the only state that needs to be saved is the
854 address of [[i]] and an index into the vector of string keys. Since
855 nothing about [[A]] is saved as state, the user
856 program can do anything to [[A]] inside the body of
857 the loop, even [[delete A]], and the loop
858 still works. Essentially, we have traded data space (the string vector)
859 in exchange for implementation simplicity. On a 32-bit system, each
860 [[ANODE]] is 36 bytes, so the extra memory needed for the array loop is
861 11\% more than the memory consumed by the [[ANODEs]] of the array.
862 Note that the large size of the [[ANODEs]] is indicative of our whole
863 design which pays data space for integer lookup speed and algorithm
866 The only aspect of array loops that occurs in [[array.c]] is construction
867 of the string vector. The rest of the implementation
868 is in the file [[execute.c]].
870 <<interface functions>>=
871 STRING** array_loop_vector(A, sizep)
878 if (!(A->type & AY_STR)) add_string_associations(A) ;
879 ret = (STRING**) zmalloc(A->size*sizeof(STRING*)) ;
880 <<for each [[ANODE]] in [[A]], put one string in [[ret]]>>
883 else return (STRING**) 0 ;
887 As we walk over the hash table [[ANODEs]], putting each [[sval]] in
888 [[ret]], we need to increment each reference count. The user of the
889 return value is responsible for these new reference counts.
891 <<for each [[ANODE]] in [[A]], put one string in [[ret]]>>=
893 int r = 0 ; /* indexes ret */
894 DUAL_LINK* table = (DUAL_LINK*) A->ptr ;
895 int i ; /* indexes table */
896 ANODE *p ; /* walks slists */
897 for(i=0;i <= A->hmask; i++) {
898 for(p = table[i].slink; p ; p = p->slink) {
906 Since a hash value is turned into a table index via bit-wise and with
907 \hbox{[[A->hmask]]}, it is important that the hash function does a good job
908 of scrambling the low-order bits of the returned hash value.
909 Empirical tests indicate the following function does an adequate job.
910 Note that for strings with length greater than 10, we only hash on
911 the first five characters, the last five character and the length.
914 static unsigned ahash(sval)
917 unsigned sum1 = sval->len ;
918 unsigned sum2 = sum1 ;
919 unsigned char *p , *q ;
921 for(p=(unsigned char*)sval->str; *p ; p++) {
928 p = (unsigned char*)sval->str ; /* p starts at the front */
929 q = (unsigned char*)sval->str + (sum1-1) ; /* q starts at the back */
943 @ Concatenating Array Indices
944 In [[AWK]], an array expression [[A[i,j]]] is equivalent to the
945 expression [[A[i SUBSEP j]]], i.e., the index is the
946 concatenation of the three
947 elements [[i]], [[SUBSEP]] and [[j]]. This is performed by the
948 function [[array_cat]]. On entry, [[sp]] points at the top of a
950 [[Cnt]] cells are popped off the stack and concatenated together
951 separated by [[SUBSEP]] and the result is pushed back on the stack.
952 On entry, the first multi-index is in [[sp[1-cnt]]] and the last is
953 in [[sp[0]]]. The return value is the new stack top.
954 (The stack is the run-time evaluation stack.
955 This operation really has nothing to do with array structure, so
956 logically this code belongs in [[execute.c]], but remains here for
960 <<interface functions>>=
961 CELL *array_cat(sp, cnt)
965 CELL *p ; /* walks the eval stack */
966 CELL subsep ; /* local copy of SUBSEP */
968 unsigned total_len ; /* length of cat'ed expression */
969 CELL *top ; /* value of sp at entry */
970 char *target ; /* build cat'ed char* here */
971 STRING *sval ; /* build cat'ed STRING here */
972 <<get subsep and compute parts>>
973 <<set [[top]] and return value of [[sp]]>>
974 <<cast cells to string and compute [[total_len]]>>
975 <<build the cat'ed [[STRING]] in [[sval]]>>
976 <<cleanup, set [[sp]] and return>>
980 We make a copy of [[SUBSEP]] which we can cast to string in the
981 unlikely event the user has assigned a number to [[SUBSEP]].
984 unsigned subsep_len ; /* string length of subsep_str */
987 <<get subsep and compute parts>>=
988 cellcpy(&subsep, SUBSEP) ;
989 if ( subsep.type < C_STRING ) cast1_to_s(&subsep) ;
990 subsep_len = string(&subsep)->len ;
991 subsep_str = string(&subsep)->str ;
994 Set [[sp]] and [[top]] so the cells to concatenate are inclusively
995 between [[sp]] and [[top]].
997 <<set [[top]] and return value of [[sp]]>>=
998 top = sp ; sp -= (cnt-1) ;
1001 The [[total_len]] is the sum of the lengths of the [[cnt]]
1002 strings and the [[cnt-1]] copies of [[subsep]].
1004 <<cast cells to string and compute [[total_len]]>>=
1005 total_len = (cnt-1)*subsep_len ;
1006 for(p = sp ; p <= top ; p++) {
1007 if ( p->type < C_STRING ) cast1_to_s(p) ;
1008 total_len += string(p)->len ;
1011 <<build the cat'ed [[STRING]] in [[sval]]>>=
1012 sval = new_STRING0(total_len) ;
1013 target = sval->str ;
1014 for(p = sp ; p < top ; p++) {
1015 memcpy(target, string(p)->str, string(p)->len) ;
1016 target += string(p)->len ;
1017 memcpy(target, subsep_str, subsep_len) ;
1018 target += subsep_len ;
1021 memcpy(target, string(p)->str, string(p)->len) ;
1024 The return value is [[sp]] and it is already set correctly. We
1025 just need to free the strings and set the contents of [[sp]].
1027 <<cleanup, set [[sp]] and return>>=
1028 for(p = sp; p <= top ; p++) free_STRING(string(p)) ;
1029 free_STRING(string(&subsep)) ;
1030 /* set contents of sp , sp->type > C_STRING is possible so reset */
1031 sp->type = C_STRING ;
1032 sp->ptr = (PTR) sval ;
1036 Here are some things we want to make sure end up in the [[.c]] and
1038 The compiler needs prototypes for the local functions, and we will
1039 put a copyright and links to the source file, [[array.w]], in each
1042 <<local constants, defs and prototypes>>=
1043 static ANODE* PROTO(find_by_ival,(ARRAY, Int, int)) ;
1044 static ANODE* PROTO(find_by_sval,(ARRAY, STRING*, int)) ;
1045 static void PROTO(add_string_associations,(ARRAY)) ;
1046 static void PROTO(make_empty_table,(ARRAY, int)) ;
1047 static void PROTO(convert_split_array_to_table,(ARRAY)) ;
1048 static void PROTO(double_the_hash_table,(ARRAY)) ;
1049 static unsigned PROTO(ahash, (STRING*)) ;
1059 This file was generated with the command
1061 notangle -R'"array.c"' array.w > array.c
1067 Notangle is part of Norman Ramsey's noweb literate programming package
1068 available from CTAN(ftp.shsu.edu).
1070 It's easiest to read or modify this file by working with array.w.
1078 This file was generated with the command
1080 notangle -R'"array.h"' array.w > array.h
1086 copyright 1991-96, Michael D. Brennan
1088 This is a source file for mawk, an implementation of
1089 the AWK programming language.
1091 Mawk is distributed without warranty under the terms of
1092 the GNU General Public License, version 2, 1991.