test/wfrq0.awk

   1
   2 #   this program finds the twenty most freq
   3 #   words in document using a heap sort at the end
   4 #
   5 #
   6
   7 function down_heap(i,           k,hold)
   8 {
   9   while ( 1 )
  10   {
  11       if ( compare(heap[2*i], heap[2*i+1]) <= 0 ) k = 2*i
  12       else  k = 2*i + 1
  13
  14       if ( compare(heap[i],heap[k]) <= 0 )  return
  15
  16       hold = heap[k] ; heap[k] = heap[i] ; heap[i] = hold
  17       i = k
  18    }
  19 }
  20
  21 # compares two values of form  "number word"
  22 #    by number and breaks ties by word (reversed)
  23
  24 function  compare(s1, s2,       t, X)
  25 {
  26   t = (s1+0) - (s2+0)  # forces types to number
  27
  28   if ( t == 0 )
  29   {
  30     split(s1, X);  s1 = X[2]
  31     split(s2, X); s2 = X[2]
  32     if ( s2 < s1 )  return -1
  33     return s1 < s2
  34   }
  35
  36   return t
  37 }
  38
  39
  40 BEGIN { RS = "[^a-zA-Z]+" ;  BIG = "999999:" }
  41
  42 { cnt[$0]++ }
  43
  44 END { delete  cnt[ "" ]
  45
  46 # load twenty values
  47 j = 1
  48 for( i in cnt )
  49 {
  50   heap[j] = num_word( cnt[i] , i )
  51   delete cnt[i] ;
  52   if ( ++j == 21 )  break ;
  53 }
  54
  55 # make some sentinals
  56 for( i = j ; i < 43 ; i++ )  heap[i] = BIG
  57
  58 h_empty = j  # save the first empty slot
  59 # make a heap with the smallest in slot 1
  60 for( i = h_empty - 1 ; i > 0 ; i-- )  down_heap(i)
  61
  62 # examine the rest of the values
  63 for ( i in cnt )
  64 {
  65   j = num_word(cnt[i], i)
  66   if ( compare(j, heap[1]) > 0 )
  67   { # its bigger
  68     # take the smallest out of the heap and readjust
  69     heap[1] = j
  70     down_heap(1)
  71   }
  72 }
  73
  74 h_empty-- ;
  75
  76 # what's left are the twenty largest
  77 # smallest at the top
  78 #
  79
  80 i = 20
  81 while ( h_empty > 1 )
  82 {
  83   buffer[i--] = heap[1]
  84   heap[1] = heap[h_empty]
  85   heap[h_empty] = BIG
  86   down_heap(1)
  87   h_empty--
  88 }
  89   buffer[i--] = heap[1]
  90
  91   for(j = 1 ; j <= 20 ; j++ )  print buffer[j]
  92 }
  93
  94
  95 function num_word(num, word)
  96 {
  97   return sprintf("%3d %s", num, word)
  98 }