2 * libjsongit2 - trie file functions
4 * Copyright (C) 2018 Andy Green <andy@warmcat.com>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation:
9 * version 2.1 of the License.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
22 #include "core/private.h"
23 #include "misc/fts/private.h"
29 #include <sys/types.h>
32 #define AC_COUNT_STASHED_CHILDREN 8
45 struct ch ch[AC_COUNT_STASHED_CHILDREN];
59 struct linetable *next;
61 int chunk_line_number_start;
62 int chunk_line_number_count;
64 off_t chunk_filepos_start;
66 off_t vli_ofs_in_index;
72 return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
78 return (b[0] << 8) | b[1];
82 lws_fts_filepath(struct lws_fts_file *jtf, int filepath_index, char *result,
83 size_t len, uint32_t *ofs_linetable, uint32_t *lines)
85 unsigned char buf[256 + 15];
91 if (filepath_index > jtf->filepaths)
94 if (lseek(jtf->fd, jtf->filepath_table + (4 * filepath_index),
96 lwsl_err("%s: unable to seek\n", __func__);
101 ra = read(jtf->fd, buf, 4);
105 o = (unsigned int)b32(buf);
106 if (lseek(jtf->fd, o, SEEK_SET) < 0) {
107 lwsl_err("%s: unable to seek\n", __func__);
112 ra = read(jtf->fd, buf, sizeof(buf));
117 bp += rq32(&buf[bp], ofs_linetable);
119 bp += rq32(&buf[bp], &flen);
121 bp += rq32(&buf[bp], lines);
123 bp += rq32(&buf[bp], &flen);
124 bp += rq32(&buf[bp], &flen);
130 strncpy(result, (char *)&buf[bp], m);
132 result[len - 1] = '\0';
138 * returns -1 for fail or fd open on the trie file.
140 * *root is set to the position of the root trie entry.
141 * *flen is set to the length of the whole file
145 lws_fts_adopt(struct lws_fts_file *jtf)
147 unsigned char buf[256];
150 if (read(jtf->fd, buf, TRIE_FILE_HDR_SIZE) != TRIE_FILE_HDR_SIZE) {
151 lwsl_err("%s: unable to read file header\n", __func__);
155 if (buf[0] != 0xca || buf[1] != 0x7a ||
156 buf[2] != 0x5f || buf[3] != 0x75) {
157 lwsl_err("%s: bad magic %02X %02X %02X %02X\n", __func__,
158 buf[0], buf[1], buf[2], buf[3]);
162 jtf->root = b32(&buf[4]);
164 ot = lseek(jtf->fd, 0, SEEK_END);
166 lwsl_err("%s: unable to seek\n", __func__);
172 if (jtf->flen != b32(&buf[8])) {
173 lwsl_err("%s: file size doesn't match expected\n", __func__);
178 jtf->filepath_table = b32(&buf[12]);
179 jtf->filepaths = b32(&buf[16]);
187 struct lws_fts_file *
188 lws_fts_open(const char *filepath)
190 struct lws_fts_file *jtf;
192 jtf = lws_malloc(sizeof(*jtf), "fts open");
196 jtf->fd = open(filepath, O_RDONLY);
198 lwsl_err("%s: unable to open %s\n", __func__, filepath);
202 if (lws_fts_adopt(jtf) < 0)
216 lws_fts_close(struct lws_fts_file *jtf)
222 #define grab(_pos, _size) { \
224 if (lseek(jtf->fd, _pos, SEEK_SET) < 0) { \
225 lwsl_err("%s: unable to seek\n", __func__); \
230 ra = read(jtf->fd, buf, _size); \
235 static struct linetable *
236 lws_fts_cache_chunktable(struct lws_fts_file *jtf, uint32_t ofs_linetable,
237 struct lwsac **linetable_head)
239 struct linetable *lt, *first = NULL, **prev = NULL;
240 unsigned char buf[8];
241 int line = 1, bp, ra;
244 *linetable_head = NULL;
247 grab(ofs_linetable, sizeof(buf));
249 lt = lwsac_use(linetable_head, sizeof(*lt), 0);
260 lt->chunk_line_number_start = line;
261 lt->chunk_line_number_count = b16(&buf[bp + 2]);
262 lt->vli_ofs_in_index = ofs_linetable + 8;
263 lt->chunk_filepos_start = cfs;
265 line += lt->chunk_line_number_count;
267 cfs += b32(&buf[bp + 4]);
268 ofs_linetable += b16(&buf[bp]);
270 } while (b16(&buf[bp]));
275 lwsac_free(linetable_head);
281 lws_fts_getfileoffset(struct lws_fts_file *jtf, struct linetable *ltstart,
282 int line, off_t *_ofs)
284 struct linetable *lt = ltstart;
285 unsigned char buf[LWS_FTS_LINES_PER_CHUNK * 5];
290 /* first figure out which chunk */
293 if (line >= lt->chunk_line_number_start &&
294 line < lt->chunk_line_number_start +
295 lt->chunk_line_number_count)
304 /* we know it's in this chunk */
306 ofs = lt->chunk_filepos_start;
307 line -= lt->chunk_line_number_start;
309 grab(lt->vli_ofs_in_index, sizeof(buf));
313 bp += rq32(&buf[bp], &ll);
318 /* we know the offset it is at in the original file */
325 lwsl_info("%s: bail %d\n", __func__, line);
331 ac_record(struct lws_fts_file *jtf, struct lwsac **results_head,
332 const char *needle, int pos, struct wac *s, int sp,
333 uint32_t instances, uint32_t agg_instances, uint32_t children,
334 struct lws_fts_result_autocomplete ***ppac)
336 struct lws_fts_result_autocomplete *ac;
340 if (!instances && !agg_instances)
344 for (n = 1; n <= sp; n++)
345 m += s[n].ch[s[n].child - 1].name_length;
347 ac = lwsac_use(results_head, sizeof(*ac) + m + 1, 0);
351 p = (char *)(ac + 1);
356 ac->instances = instances;
357 ac->agg_instances = agg_instances;
359 ac->has_children = !!children;
362 memcpy(p, needle, pos);
365 for (n = 1; n <= sp; n++) {
366 int w = s[n].child - 1;
368 memcpy(p, s[n].ch[w].name, s[n].ch[w].name_length);
369 p += s[n].ch[w].name_length;
371 p = (char *)(ac + 1);
375 * deduct this child's instance weight from his antecdents to track
376 * relative path attractiveness dynamically, after we already used its
377 * best results (children are sorted best-first)
379 for (n = sp; n >= 0; n--) {
380 s[n].ch[s[n].child - 1].child_agg -= instances;
381 s[n].agg -= instances;
387 struct lws_fts_result *
388 lws_fts_search(struct lws_fts_file *jtf, struct lws_fts_search_params *ftsp)
390 uint32_t children, instances, co, sl, agg, slt, chunk,
391 fileofs_tif_start, desc, agg_instances;
392 int pos = 0, n, m, nl, bp, base = 0, ra, palm, budget, sp, ofd = -1;
393 unsigned long long tf = lws_now_usecs();
394 struct lws_fts_result_autocomplete **pac = NULL;
395 char stasis, nac = 0, credible, needle[32];
396 struct lws_fts_result_filepath *fp;
397 struct lws_fts_result *result;
398 unsigned char buf[4096];
402 ftsp->results_head = NULL;
407 nl = (int)strlen(ftsp->needle);
408 if ((size_t)nl > sizeof(needle) - 2)
411 result = lwsac_use(&ftsp->results_head, sizeof(*result), 0);
415 /* start with no results... */
417 result->autocomplete_head = NULL;
418 pac = &result->autocomplete_head;
419 result->filepath_head = NULL;
420 result->duration_ms = 0;
421 result->effective_flags = ftsp->flags;
425 for (n = 0; n < nl; n++)
426 needle[n] = tolower(ftsp->needle[n]);
434 grab(o, sizeof(buf));
437 bp += rq32(&buf[bp], &fileofs_tif_start);
438 bp += rq32(&buf[bp], &children);
439 bp += rq32(&buf[bp], &instances);
440 bp += rq32(&buf[bp], &agg_instances);
443 /* the children follow here */
448 if (!fileofs_tif_start)
450 * we matched, but there are no instances of
451 * this, it's actually an intermediate
456 /* we leave with bp positioned at the instance list */
458 o = fileofs_tif_start;
459 grab(o, sizeof(buf));
463 if (ra - bp < 1024) {
466 * We don't have enough. So reload the buffer starting
467 * at where we got to.
471 grab(o + base, sizeof(buf));
474 /* gets set if any child COULD match needle if it went on */
477 for (n = 0; (uint32_t)n < children; n++) {
480 bp += rq32(&buf[bp], &co);
481 bp += rq32(&buf[bp], &inst);
482 bp += rq32(&buf[bp], &agg);
483 bp += rq32(&buf[bp], &desc);
484 bp += rq32(&buf[bp], &sl);
486 if (sl > (uint32_t)(nl - pos)) {
489 * it can't be a match because it's longer than
490 * our needle string (but that leaves it as a
491 * perfectly fine autocomplete candidate)
496 * "credible" means at least one child matches
497 * all the chars in needle up to as many as it
498 * has. If not "credible" this path cannot
501 if (!strncmp((char *)&buf[bp], &needle[pos], g))
505 * deflate the parent agg using the
506 * knowledge this child is not on the
507 * path shown by the remainder of needle
509 agg_instances -= agg;
518 /* the comparison string potentially has huge length */
524 * the strategy is to compare whatever we have
525 * lying around, then bring in more if it didn't
526 * fail to match yet. That way we don't bring
527 * in anything we could already have known was
528 * not needed due to a match fail.
535 if ((chunk == 1 && needle[pos] != buf[bp]) ||
537 memcmp(&needle[pos], &buf[bp], chunk))) {
540 * it doesn't match... so nothing can
541 * autocomplete this...
553 /* so far, it matches */
556 /* we matched the whole thing */
566 * do we have at least buf more to match, or the
567 * remainder of the string, whichever is less?
569 * bp may exceed sizeof(buf) on no match path
575 if (ra - bp >= (int)chunk)
579 * We don't have enough. So reload buf starting
580 * at where we got to.
583 grab(o + base, sizeof(buf));
585 } /* while we are still comparing */
587 } /* for each child */
589 if ((uint32_t)n == children) {
598 result->duration_ms = (int)((lws_now_usecs() - tf) / 1000);
600 if (!instances && !children)
603 /* the match list may easily exceed one read buffer load ... */
608 * Only do the file match list if it was requested in the search flags
611 if (!(ftsp->flags & LWSFTS_F_QUERY_FILES))
615 uint32_t fi, tot, line, ro, ofs_linetable, lines, fplen,
617 struct lwsac *lt_head = NULL;
618 struct linetable *ltst;
624 grab(o, sizeof(buf));
627 bp += rq32(&buf[bp], &_o);
630 assert(!o || o > TRIE_FILE_HDR_SIZE);
632 bp += rq32(&buf[bp], &fi);
633 bp += rq32(&buf[bp], &tot);
635 if (lws_fts_filepath(jtf, fi, path, sizeof(path) - 1,
636 &ofs_linetable, &lines)) {
637 lwsl_err("can't get filepath index %d\n", fi);
641 if (ftsp->only_filepath && strcmp(path, ftsp->only_filepath))
644 ltst = lws_fts_cache_chunktable(jtf, ofs_linetable, <_head);
648 if (ftsp->flags & LWSFTS_F_QUERY_QUOTE_LINE) {
649 ofd = open(path, O_RDONLY);
651 lwsac_free(<_head);
656 fplen = (int)strlen(path);
657 footprint = sizeof(*fp) + fplen + 1;
658 if (ftsp->flags & LWSFTS_F_QUERY_FILE_LINES) {
659 /* line number and offset in file */
660 footprint += 2 * sizeof(uint32_t) * tot;
662 if (ftsp->flags & LWSFTS_F_QUERY_QUOTE_LINE)
663 /* pointer to quote string */
664 footprint += sizeof(void *) * tot;
667 fp = lwsac_use(&ftsp->results_head, footprint, 0);
669 lwsac_free(<_head);
673 fp->filepath_length = fplen;
674 fp->lines_in_file = lines;
676 fp->matches_length = footprint - sizeof(*fp) - (fplen + 1);
677 fp->next = result->filepath_head;
678 result->filepath_head = fp;
680 /* line table first so it can be aligned */
682 u = (uint32_t*)(fp + 1);
684 if (ftsp->flags & LWSFTS_F_QUERY_FILE_LINES) {
686 /* for each line number */
688 for (n = 0; (uint32_t)n < tot; n++) {
690 unsigned char lbuf[256], *p;
697 grab(ro + base, sizeof(buf));
700 bp += rq32(&buf[bp], &line);
703 if (lws_fts_getfileoffset(jtf, ltst, line, &fo))
708 if (!(ftsp->flags & LWSFTS_F_QUERY_QUOTE_LINE))
711 if (lseek(ofd, fo, SEEK_SET) < 0)
714 m = read(ofd, lbuf, sizeof(lbuf) - 1);
717 lbuf[sizeof(lbuf) - 1] = '\0';
719 p = (unsigned char *)strchr((char *)lbuf, '\n');
721 m = lws_ptr_diff(p, lbuf);
723 p = (unsigned char *)strchr((char *)lbuf, '\r');
725 m = lws_ptr_diff(p, lbuf);
728 lws_json_purify(ebuf, (const char *)lbuf,
730 m = (int)strlen(ebuf);
732 p = lwsac_use(&ftsp->results_head, m + 1, 0);
734 lwsac_free(<_head);
740 v = (const char **)u;
741 *v = (const char *)p;
742 u += sizeof(const char *) / sizeof(uint32_t);
746 pp = ((char *)&fp[1]) + fp->matches_length;
747 memcpy(pp, path, fplen);
755 lwsac_free(<_head);
757 if (ftsp->only_filepath)
762 /* sort the instance file list by results density */
765 struct lws_fts_result_filepath **prf, *rf1, *rf2;
769 /* bubble sort keeps going until nothing changed */
771 prf = &result->filepath_head;
777 if (rf2 && rf1->lines_in_file && rf2->lines_in_file &&
778 ((rf1->matches * 1000) / rf1->lines_in_file) <
779 ((rf2->matches * 1000) / rf2->lines_in_file)) {
783 rf1->next = rf2->next;
794 if (!(ftsp->flags & LWSFTS_F_QUERY_AUTOCOMPLETE) || nac)
798 * autocomplete (ie, the descendent paths that yield the most hits)
800 * We actually need to spider the earliest terminal descendents from
801 * the child we definitely got past, and present the first n terminal
802 * strings. The descendents are already sorted in order of highest
803 * aggregated hits in their descendents first, so simply collecting n
804 * earliest leaf children is enough.
806 * The leaf children may be quite deep down in a stack however. So we
807 * have to go through all the walking motions collecting and retaining
808 * child into for when we come back up the walk.
810 * We can completely ignore file instances for this, we just need the
811 * earliest children. And we can restrict how many children we stash
812 * in each stack level to eg, 5.
814 * child_ofs comes in pointing at the start of the trie entry that is
815 * to be the starting point for making suggestions.
818 budget = ftsp->max_autocomplete;
821 pac = &result->autocomplete_head;
823 if (pos > (int)sizeof(s[sp].ch[0].name) - 1)
824 pos = (int)sizeof(s[sp].ch[0].name) - 1;
826 memset(&s[sp], 0, sizeof(s[sp]));
829 s[sp].tifs = fileofs_tif_start;
830 s[sp].self = child_ofs;
831 s[sp].ch[0].effpos = pos;
834 n = ac_record(jtf, &ftsp->results_head, needle, pos, s, 0,
835 instances, agg_instances, children, &pac);
837 while (sp >= 0 && budget) {
839 struct ch *tch = &s[sp].ch[s[sp].child - 1];
841 grab(child_ofs, sizeof(buf));
843 bp += rq32(&buf[bp], &fileofs_tif_start);
844 bp += rq32(&buf[bp], &children);
845 bp += rq32(&buf[bp], &instances);
846 bp += rq32(&buf[bp], &agg_instances);
848 if (sp > 0 && s[sp - 1].done_children &&
849 tch->effpos + tch->name_length >= nl &&
850 tch->inst && fileofs_tif_start) {
851 n = ac_record(jtf, &ftsp->results_head, needle, pos, s,
852 sp, tch->inst, tch->child_agg,
853 tch->descendents, &pac);
861 if (!s[sp].done_children && children) {
862 s[sp].done_children = 1;
864 memset(&s[sp], 0, sizeof(s[sp]));
865 s[sp].tifs = fileofs_tif_start;
866 s[sp].self = child_ofs;
868 for (n = 0; n < (int)children && s[sp].child_count <
869 (int)LWS_ARRAY_SIZE(s[0].ch); n++) {
870 uint32_t slen, cho, agg, inst;
871 int i = s[sp].child_count;
872 struct ch *ch = &s[sp].ch[i];
875 bp += rq32(&buf[bp], &cho);
876 bp += rq32(&buf[bp], &inst);
877 bp += rq32(&buf[bp], &agg);
878 bp += rq32(&buf[bp], &desc);
879 bp += rq32(&buf[bp], &slen);
882 if (max > sizeof(ch->name) - 1)
883 max = sizeof(ch->name) - 1;
885 strncpy(ch->name, (char *)&buf[bp], max);
888 ch->name_length = (int)max;
889 ch->name[sizeof(ch->name) - 1] = '\0';
892 s[sp - 1].ch[s[sp - 1].child - 1].effpos;
895 ch->descendents = desc;
898 * if we have more needle chars than we matched
899 * to get this far, we can only allow potential
900 * matches that are consistent with the
901 * additional unmatched character(s)...
905 if (m > ch->name_length)
909 strncmp(&needle[ch->effpos], ch->name, m))
913 s[sp].ch[s[sp].child_count++].ofs = cho;
918 while (sp >= 0 && s[sp].child >= s[sp].child_count) {
919 s[sp].done_children = 0;
924 * Compare parent remaining agg vs parent's next siblings' still
925 * intact original agg... if the next sibling has more, abandon
926 * the parent path and go with the sibling... this keeps the
927 * autocomplete results related to popularity.
933 struct lws_fts_result_autocomplete *ac =
934 (struct lws_fts_result_autocomplete *)pac;
936 if (s[n].child < s[n].child_count &&
937 s[n].ch[s[n].child - 1].child_agg <
938 s[n].ch[s[n].child].child_agg) {
942 * mark the autocomplete result that
943 * there were more children down his
944 * path that we skipped in these results
948 for (m = n; m < sp + 1; m++)
949 s[m].done_children = 0;
951 child_ofs = s[sp].ch[s[sp].child++].ofs;
958 if (nobump || sp < 0)
961 child_ofs = s[sp].ch[s[sp].child++].ofs;
964 /* let's do a final sort into agg order */
967 struct lws_fts_result_autocomplete *ac1, *ac2;
971 /* bubble sort keeps going until nothing changed */
973 pac = &result->autocomplete_head;
979 if (ac2 && ac1->instances < ac2->instances) {
983 ac1->next = ac2->next;
998 lwsl_info("%s: search ended up at bail\n", __func__);