2 # this program finds the twenty most freq
3 # words in document using a heap sort at the end
7 function down_heap(i, k,hold)
11 if ( compare(heap[2*i], heap[2*i+1]) <= 0 ) k = 2*i
14 if ( compare(heap[i],heap[k]) <= 0 ) return
16 hold = heap[k] ; heap[k] = heap[i] ; heap[i] = hold
21 # compares two values of form "number word"
22 # by number and breaks ties by word (reversed)
24 function compare(s1, s2, t, X)
26 t = (s1+0) - (s2+0) # forces types to number
30 split(s1, X); s1 = X[2]
31 split(s2, X); s2 = X[2]
32 if ( s2 < s1 ) return -1
40 BEGIN { RS = "[^a-zA-Z]+" ; BIG = "999999:" }
44 END { delete cnt[ "" ]
50 heap[j] = num_word( cnt[i] , i )
52 if ( ++j == 21 ) break ;
56 for( i = j ; i < 43 ; i++ ) heap[i] = BIG
58 h_empty = j # save the first empty slot
59 # make a heap with the smallest in slot 1
60 for( i = h_empty - 1 ; i > 0 ; i-- ) down_heap(i)
62 # examine the rest of the values
65 j = num_word(cnt[i], i)
66 if ( compare(j, heap[1]) > 0 )
68 # take the smallest out of the heap and readjust
76 # what's left are the twenty largest
84 heap[1] = heap[h_empty]
91 for(j = 1 ; j <= 20 ; j++ ) print buffer[j]
95 function num_word(num, word)
97 return sprintf("%3d %s", num, word)