1 /* code -- bigram- and front-encode filenames for locate
2 Copyright (C) 1994, 2005, 2007, 2008, 2010, 2011 Free Software
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
19 /* Compress a sorted list.
20 Works with `find' to encode a filename database to save space
25 bigram < file_list > bigrams
26 process-bigrams > most_common_bigrams
27 code most_common_bigrams < file_list > squeezed_list
29 Uses `front compression' (see ";login:", March 1983, p. 8).
30 The output begins with the 128 most common bigrams.
31 After that, the output format is, for each line,
32 an offset (from the previous line) differential count byte
33 followed by a (partially bigram-encoded) ASCII remainder.
34 The output lines have no terminating byte; the start of the next line
35 is indicated by its first byte having a value <= 30.
37 The encoding of the output bytes is:
39 0-28 likeliest differential counts + offset (14) to make nonnegative
40 30 escape code for out-of-range count to follow in next halfword
41 128-255 bigram codes (the 128 most common, as determined by `updatedb')
42 32-127 single character (printable) ASCII remainder
44 Written by James A. Woods <jwoods@adobe.com>.
45 Modified by David MacKenzie <djm@gnu.org>. */
47 /* config.h should always be included first. */
56 #include <sys/types.h>
66 #include "findutils-version.h"
71 # define _(Text) gettext (Text)
74 #define textdomain(Domain)
75 #define bindtextdomain(Package, Directory)
78 # define N_(String) gettext_noop (String)
80 /* See locate.c for explanation as to why not use (String) */
81 # define N_(String) String
85 #ifndef ATTRIBUTE_NORETURN
86 # define ATTRIBUTE_NORETURN __attribute__ ((__noreturn__))
90 /* The 128 most common bigrams in the file list, padded with NULs
91 if there are fewer. */
92 static char bigrams[257] = {0};
94 /* Return the offset of PATTERN in STRING, or -1 if not found. */
97 strindex (char *string, char *pattern)
101 for (s = string; *s != '\0'; s++)
102 /* Fast first char check. */
105 register char *p2 = pattern + 1, *s2 = s + 1;
106 while (*p2 != '\0' && *p2 == *s2)
109 return s2 - strlen (pattern) - string;
114 /* Return the length of the longest common prefix of strings S1 and S2. */
117 prefix_length (char *s1, char *s2)
119 register char *start;
121 for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
126 extern char *version_string;
131 fprintf (stream, _("\
132 Usage: %s [--version | --help]\n\
133 or %s most_common_bigrams < file-list > locate-database\n"),
134 program_name, program_name);
135 fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream);
139 static void inerr (const char *filename) ATTRIBUTE_NORETURN;
140 static void outerr (void) ATTRIBUTE_NORETURN;
143 inerr (const char *filename)
145 error (EXIT_FAILURE, errno, "%s", filename);
153 error (EXIT_FAILURE, errno, _("write error"));
160 main (int argc, char **argv)
162 char *path; /* The current input entry. */
163 char *oldpath; /* The previous input entry. */
164 size_t pathsize, oldpathsize; /* Amounts allocated for them. */
165 int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
166 char bigram[3]; /* Bigram to search for in table. */
167 int code; /* Index of `bigram' in bigrams table. */
168 FILE *fp; /* Most common bigrams file. */
169 int line_len; /* Length of input line. */
171 set_program_name (argv[0]);
172 if (atexit (close_stdout))
174 error (EXIT_FAILURE, errno, _("The atexit library function failed"));
185 if (0 == strcmp (argv[1], "--help"))
190 else if (0 == strcmp (argv[1], "--version"))
192 display_findutils_version ("code");
196 fp = fopen (argv[1], "r");
199 fprintf (stderr, "%s: ", argv[0]);
204 pathsize = oldpathsize = 1026; /* Increased as necessary by getline. */
205 path = xmalloc (pathsize);
206 oldpath = xmalloc (oldpathsize);
208 /* Set to empty string, to force the first prefix count to 0. */
212 /* Copy the list of most common bigrams to the output,
213 padding with NULs if there are <128 of them. */
214 if (NULL == fgets (bigrams, 257, fp))
217 if (256 != fwrite (bigrams, 1, 256, stdout))
220 if (EOF == fclose (fp))
223 while ((line_len = getline (&path, &pathsize, stdin)) > 0)
227 path[line_len - 1] = '\0'; /* Remove newline. */
229 /* Squelch unprintable chars in path so as not to botch decoding. */
230 for (pp = path; *pp != '\0'; pp++)
232 if (!(*pp >= 040 && *pp < 0177))
236 count = prefix_length (oldpath, path);
237 diffcount = count - oldcount;
239 /* If the difference is small, it fits in one byte;
240 otherwise, two bytes plus a marker noting that fact. */
241 if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
243 if (EOF ==- putc (LOCATEDB_OLD_ESCAPE, stdout))
246 if (!putword (stdout,
247 diffcount+LOCATEDB_OLD_OFFSET,
248 GetwordEndianStateNative))
253 if (EOF == putc (diffcount + LOCATEDB_OLD_OFFSET, stdout))
257 /* Look for bigrams in the remainder of the path. */
258 for (pp = path + count; *pp != '\0'; pp += 2)
262 /* No bigram is possible; only one char is left. */
268 /* Linear search for specific bigram in string table. */
269 code = strindex (bigrams, bigram);
271 putchar ((code / 2) | 0200); /* It's a common bigram. */
273 fputs (bigram, stdout); /* Write the text as printable ASCII. */
277 /* Swap path and oldpath and their sizes. */
278 char *tmppath = oldpath;
279 size_t tmppathsize = oldpathsize;
281 oldpathsize = pathsize;
283 pathsize = tmppathsize;