4 * Copyright (C) 2018 Andy Green <andy@warmcat.com>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation:
9 * version 2.1 of the License.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 * - collecting a concordance of strings from one or more files (eg, a
24 * directory of files) into a single in-memory, lac-backed trie;
26 * - to optimize and serialize the in-memory trie to an fd;
28 * - to very quickly report any instances of a string in any of the files
29 * indexed by the trie, by a seeking around a serialized trie fd, without
30 * having to load it all in memory
33 #include "core/private.h"
34 #include "misc/fts/private.h"
41 #include <sys/types.h>
45 /* notice these are stored in t->lwsac_input_head which has input file scope */
47 struct lws_fts_filepath {
48 struct lws_fts_filepath *next;
49 struct lws_fts_filepath *prev;
52 jg2_file_offset line_table_ofs;
59 /* notice these are stored in t->lwsac_input_head which has input file scope */
61 struct lws_fts_lines {
62 struct lws_fts_lines *lines_next;
64 * amount of line numbers needs to meet average count for best
67 * Line numbers are stored in VLI format since if we don't, around half
68 * the total lac allocation consists of struct lws_fts_lines...
69 * size chosen to maintain 8-byte struct alignment
75 /* this represents the instances of a symbol inside a given filepath */
77 struct lws_fts_instance_file {
78 /* linked-list of tifs generated for current file */
79 struct lws_fts_instance_file *inst_file_next;
80 struct lws_fts_entry *owner;
81 struct lws_fts_lines *lines_list, *lines_tail;
86 * optimization for the common case there's only 1 - ~3 matches, so we
87 * don't have to allocate any lws_fts_lines struct
89 * Using 8 bytes total for this maintains 8-byte struct alignment...
97 * this is the main trie in-memory allocation object
100 struct lws_fts_entry {
101 struct lws_fts_entry *parent;
103 struct lws_fts_entry *child_list;
104 struct lws_fts_entry *sibling;
107 * care... this points to content in t->lwsac_input_head, it goes
108 * out of scope when the input file being indexed completes
110 struct lws_fts_instance_file *inst_file_list;
112 jg2_file_offset ofs_last_inst_file;
114 char *suffix; /* suffix string or NULL if one char (in .c) */
116 uint32_t child_count;
117 uint32_t instance_count;
118 uint32_t agg_inst_count;
119 uint32_t agg_child_count;
124 /* there's only one of these per trie file */
127 struct lwsac *lwsac_head;
128 struct lwsac *lwsac_input_head;
129 struct lws_fts_entry *root;
130 struct lws_fts_filepath *filepath_list;
131 struct lws_fts_filepath *fp;
133 struct lws_fts_entry *parser;
134 struct lws_fts_entry *root_lookup[256];
137 * head of linked-list of tifs generated for current file
138 * care... this points to content in t->lwsac_input_head
140 struct lws_fts_instance_file *tif_list;
142 jg2_file_offset c; /* length of output file so far */
144 uint64_t agg_trie_creation_us;
145 uint64_t agg_raw_input;
146 uint64_t worst_lwsac_input_size;
149 jg2_file_offset last_block_len_ofs;
151 int lines_in_unsealed_linetable;
156 unsigned int agg_pos;
157 unsigned int str_match_pos;
159 unsigned char aggregate;
160 unsigned char agg[128];
163 /* since the kernel case allocates >300MB, no point keeping this too low */
165 #define TRIE_LWSAC_BLOCK_SIZE (1024 * 1024)
167 #define spill(margin, force) \
168 if (bp && ((uint32_t)bp >= (sizeof(buf) - (margin)) || (force))) { \
169 if (write(t->fd, buf, bp) != bp) { \
170 lwsl_err("%s: write %d failed (%d)\n", __func__, \
179 g32(unsigned char *b, uint32_t d)
181 *b++ = (d >> 24) & 0xff;
182 *b++ = (d >> 16) & 0xff;
183 *b++ = (d >> 8) & 0xff;
190 g16(unsigned char *b, int d)
192 *b++ = (d >> 8) & 0xff;
199 wq32(unsigned char *b, uint32_t d)
201 unsigned char *ob = b;
203 if (d > (1 << 28) - 1)
204 *b++ = ((d >> 28) | 0x80) & 0xff;
206 if (d > (1 << 21) - 1)
207 *b++ = ((d >> 21) | 0x80) & 0xff;
209 if (d > (1 << 14) - 1)
210 *b++ = ((d >> 14) | 0x80) & 0xff;
212 if (d > (1 << 7) - 1)
213 *b++ = ((d >> 7) | 0x80) & 0xff;
217 return (int)(b - ob);
221 /* read a VLI, return the number of bytes used */
224 rq32(unsigned char *b, uint32_t *d)
226 unsigned char *ob = b;
231 t = (t << 7) | (*b & 0x7f);
233 t = (t << 7) | (*b & 0x7f);
235 t = (t << 7) | (*b & 0x7f);
237 t = (t << 7) | (*b & 0x7f);
246 return (int)(b - ob);
250 lws_fts_create(int fd)
253 struct lwsac *lwsac_head = NULL;
254 unsigned char buf[TRIE_FILE_HDR_SIZE];
256 t = lwsac_use(&lwsac_head, sizeof(*t), TRIE_LWSAC_BLOCK_SIZE);
260 memset(t, 0, sizeof(*t));
263 t->lwsac_head = lwsac_head;
264 t->root = lwsac_use(&lwsac_head, sizeof(*t->root),
265 TRIE_LWSAC_BLOCK_SIZE);
269 memset(t->root, 0, sizeof(*t->root));
271 t->last_file_index = -1;
273 t->filepath_list = NULL;
275 memset(t->root_lookup, 0, sizeof(*t->root_lookup));
277 /* write the header */
284 /* (these are filled in with correct data at the end) */
286 /* file offset to root trie entry */
288 /* file length when it was created */
290 /* fileoffset to the filepath table */
292 /* count of filepaths */
295 if (write(t->fd, buf, TRIE_FILE_HDR_SIZE) != TRIE_FILE_HDR_SIZE) {
296 lwsl_err("%s: trie header write failed\n", __func__);
300 t->c = TRIE_FILE_HDR_SIZE;
305 lwsac_free(&lwsac_head);
311 lws_fts_destroy(struct lws_fts **trie)
313 struct lwsac *lwsac_head = (*trie)->lwsac_head;
314 lwsac_free(&(*trie)->lwsac_input_head);
315 lwsac_free(&lwsac_head);
320 lws_fts_file_index(struct lws_fts *t, const char *filepath, int filepath_len,
323 struct lws_fts_filepath *fp = t->filepath_list;
326 if (fp->filepath_len == filepath_len &&
327 !strcmp(fp->filepath, filepath))
328 return fp->file_index;
333 fp = lwsac_use(&t->lwsac_head, sizeof(*fp), TRIE_LWSAC_BLOCK_SIZE);
337 fp->next = t->filepath_list;
338 t->filepath_list = fp;
339 strncpy(fp->filepath, filepath, sizeof(fp->filepath) - 1);
340 fp->filepath[sizeof(fp->filepath) - 1] = '\0';
341 fp->filepath_len = filepath_len;
342 fp->file_index = t->next_file_index++;
343 fp->line_table_ofs = t->c;
344 fp->priority = priority;
348 return fp->file_index;
351 static struct lws_fts_entry *
352 lws_fts_entry_child_add(struct lws_fts *t, unsigned char c,
353 struct lws_fts_entry *parent)
355 struct lws_fts_entry *e, **pe;
357 e = lwsac_use(&t->lwsac_head, sizeof(*e), TRIE_LWSAC_BLOCK_SIZE);
361 memset(e, 0, sizeof(*e));
364 parent->child_count++;
368 /* keep the parent child list in ascending sort order for c */
370 pe = &parent->child_list;
372 assert((*pe)->parent == parent);
379 pe = &(*pe)->sibling;
383 /* add it at the end */
392 finalize_per_input(struct lws_fts *t)
394 struct lws_fts_instance_file *tif;
395 unsigned char buf[8192];
396 uint64_t lwsac_input_size;
397 jg2_file_offset temp;
400 bp += g16(&buf[bp], 0);
401 bp += g16(&buf[bp], 0);
402 bp += g32(&buf[bp], 0);
403 if (write(t->fd, buf, bp) != bp)
409 * Write the generated file index + instances (if any)
411 * Notice the next same-parent file instance fileoffset list is
412 * backwards, so it does not require seeks to fill in. The first
413 * entry has 0 but the second entry points to the first entry (whose
414 * fileoffset is known).
416 * After all the file instance structs are finalized,
417 * .ofs_last_inst_file contains the fileoffset of that child's tif
418 * list head in the file.
420 * The file instances are written to disk in the order that the files
421 * were indexed, along with their prev pointers inline.
426 struct lws_fts_lines *i;
428 spill((3 * MAX_VLI) + tif->count, 0);
430 temp = tif->owner->ofs_last_inst_file;
432 tif->owner->ofs_last_inst_file = t->c + bp;
434 assert(!temp || (temp > TRIE_FILE_HDR_SIZE && temp < t->c));
436 /* fileoffset of prev instance file for this entry, or 0 */
437 bp += wq32(&buf[bp], temp);
438 bp += wq32(&buf[bp], tif->file_index);
439 bp += wq32(&buf[bp], tif->total);
441 /* remove any pointers into this disposable lac footprint */
442 tif->owner->inst_file_list = NULL;
444 memcpy(&buf[bp], &tif->vli, tif->count);
450 memcpy(&buf[bp], &i->vli, i->count);
456 tif = tif->inst_file_next;
461 assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
463 if (t->lwsac_input_head) {
464 lwsac_input_size = lwsac_total_alloc(t->lwsac_input_head);
465 if (lwsac_input_size > t->worst_lwsac_input_size)
466 t->worst_lwsac_input_size = lwsac_input_size;
470 * those per-file allocations are all on a separate lac so we can
471 * free it cleanly afterwards
473 lwsac_free(&t->lwsac_input_head);
475 /* and lose the pointer into the deallocated lac */
482 * 0 = punctuation, whitespace, brackets etc
483 * 1 = character inside symbol set
484 * 2 = upper-case character inside symbol set
487 static char classify[] = {
488 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
489 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
491 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
492 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
493 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, //1,
494 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
495 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
496 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
497 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
498 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
499 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
500 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
501 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
502 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
503 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
508 name_entry(struct lws_fts_entry *e1, char *s, int len)
510 struct lws_fts_entry *e2;
518 if ((int)e2->suffix_len < n) {
520 memcpy(&s[n], e2->suffix, e2->suffix_len);
535 * as we parse the input, we create a line length table for the file index.
536 * Only the file header has been written before we start doing this.
540 lws_fts_fill(struct lws_fts *t, uint32_t file_index, const char *buf,
543 unsigned long long tf = lws_now_usecs();
544 unsigned char c, linetable[256], vlibuf[8];
545 struct lws_fts_entry *e, *e1, *dcl;
546 struct lws_fts_instance_file *tif;
547 int bp = 0, sline, chars, m;
548 char *osuff, skipline = 0;
549 struct lws_fts_lines *tl;
550 unsigned int olen, n;
553 if ((int)file_index != t->last_file_index) {
554 if (t->last_file_index >= 0)
555 finalize_per_input(t);
556 t->last_file_index = file_index;
558 t->chars_in_line = 0;
559 t->lines_in_unsealed_linetable = 0;
562 t->agg_raw_input += len;
568 sline = t->line_number;
569 bp += g16(&linetable[bp], 0);
570 bp += g16(&linetable[bp], 0);
571 bp += g32(&linetable[bp], 0);
576 if (t->lines_in_unsealed_linetable >= LWS_FTS_LINES_PER_CHUNK)
581 c = (unsigned char)*buf++;
585 t->filepath_list->total_lines++;
586 t->lines_in_unsealed_linetable++;
589 bp += wq32(&linetable[bp], t->chars_in_line);
590 if ((unsigned int)bp > sizeof(linetable) - 6) {
591 if (write(t->fd, linetable, bp) != bp) {
592 lwsl_err("%s: linetable write failed\n",
598 // assert(lseek(t->fd, 0, SEEK_END) == t->c);
601 chars += t->chars_in_line;
602 t->chars_in_line = 0;
605 * Detect overlength lines and skip them (eg, BASE64
612 while (n < 200 && m < 80 && buf[n] != '\n') {
613 if (buf[n] == ' ' || buf[n] == '\t')
619 /* 80 lines no whitespace, or >=200-char line */
621 if (m == 80 || n == 200)
630 m = classify[(int)c];
639 * We created a trie entry for an earlier char in this
640 * symbol already. So we know at the moment, any
641 * further chars in the symbol are the only children.
643 * Aggregate them and add them as a string suffix to
644 * the trie symbol at the end (when we know how much to
648 if (t->agg_pos < sizeof(t->agg) - 1)
649 /* symbol is not too long to stash */
650 t->agg[t->agg_pos++] = c;
655 if (t->str_match_pos) {
660 /* zeroth-iteration child matching */
662 if (t->parser == t->root) {
663 e = t->root_lookup[(int)c];
670 /* look for the char amongst the children */
672 e = t->parser->child_list;
675 /* since they're alpha ordered... */
684 t->str_match_pos = 1;
697 * we are blazing a new trail, add a new child representing
698 * the whole suffix that couldn't be matched until now.
701 e = lws_fts_entry_child_add(t, c, t->parser);
703 lwsl_err("%s: lws_fts_entry_child_add failed\n",
708 /* if it's the root node, keep the root_lookup table in sync */
710 if (t->parser == t->root)
711 t->root_lookup[(int)c] = e;
713 /* follow the new path */
717 struct lws_fts_entry **pe = &e->child_list;
719 assert((*pe)->parent == e);
721 pe = &(*pe)->sibling;
726 * If there are any more symbol characters coming, just
727 * create a suffix string on t->parser instead of what must
728 * currently be single-child nodes, since we just created e
729 * as a child with a single character due to no existing match
730 * on that single character... so if no match on 'h' with this
731 * guy's parent, we created e that matches on the single char
732 * 'h'. If the symbol continues ... 'a' 'p' 'p' 'y', then
733 * instead of creating singleton child nodes under e,
734 * modify e to match on the whole string suffix "happy".
736 * If later "hoppy" appears, we will remove the suffix on e,
737 * so it reverts to a char match for 'h', add singleton children
738 * for 'a' and 'o', and attach a "ppy" suffix child to each of
741 * We want to do this so we don't have to allocate trie entries
742 * for every char in the string to save memory and consequently
745 * Don't try this optimization if the parent is the root node...
746 * it's not compatible with it's root_lookup table and it's
747 * highly likely children off the root entry are going to have
751 if (e->parent != t->root) {
759 if (t->str_match_pos) {
762 * We're partway through matching an elaborated string
763 * on a child, not just a character. String matches
764 * only exist when we met a child entry that only had
765 * one path until now... so we had an 'h', and the
766 * only child had a string "hello".
768 * We are following the right path and will not need
769 * to back up, but we may find as we go we have the
770 * first instance of a second child path, eg, "help".
772 * When we get to the 'p', we have to split what was
773 * the only string option "hello" into "hel" and then
774 * two child entries, for "lo" and 'p'.
777 if (c == t->parser->suffix[t->str_match_pos++]) {
778 if (t->str_match_pos < t->parser->suffix_len)
782 * We simply matched everything, continue
783 * parsing normally from this trie entry.
786 t->str_match_pos = 0;
791 * So... we hit a mismatch somewhere... it means we
792 * have to split this string entry.
794 * We know the first char actually matched in order to
795 * start down this road. So for the current trie entry,
796 * we need to truncate his suffix at the char before
797 * this mismatched one, where we diverged (if the
798 * second char, simply remove the suffix string from the
799 * current trie entry to turn it back to a 1-char match)
801 * The original entry, which becomes the lhs post-split,
805 olen = t->parser->suffix_len;
806 osuff = t->parser->suffix;
808 if (t->str_match_pos == 2)
809 t->parser->suffix = NULL;
811 t->parser->suffix_len = t->str_match_pos - 1;
814 * Then we need to create a new child trie entry that
815 * represents the remainder of the original string
816 * path that we didn't match. For the "hello" /
817 * "help" case, this guy will have "lo".
819 * Any instances or children (not siblings...) that were
820 * attached to the original trie entry must be detached
821 * first and then migrate to this new guy that completes
822 * the original string.
825 dcl = t->parser->child_list;
826 m = t->parser->child_count;
828 t->parser->child_list = NULL;
829 t->parser->child_count = 0;
831 e = lws_fts_entry_child_add(t,
832 osuff[t->str_match_pos - 1], t->parser);
834 lwsl_err("%s: lws_fts_entry_child_add fail1\n",
842 * any children we took over must point to us as the
843 * parent now they appear on our child list
852 * We detached any children, gave them to the new guy
853 * and replaced them with just our new guy
855 t->parser->child_count = 1;
856 t->parser->child_list = e;
859 * any instances that belonged to the original entry we
860 * are splitting now must be reassigned to the end
864 e->inst_file_list = t->parser->inst_file_list;
865 if (e->inst_file_list)
866 e->inst_file_list->owner = e;
867 t->parser->inst_file_list = NULL;
868 e->instance_count = t->parser->instance_count;
869 t->parser->instance_count = 0;
871 e->ofs_last_inst_file = t->parser->ofs_last_inst_file;
872 t->parser->ofs_last_inst_file = 0;
874 if (t->str_match_pos != olen) {
875 /* we diverged partway */
876 e->suffix = &osuff[t->str_match_pos - 1];
877 e->suffix_len = olen - (t->str_match_pos - 1);
881 * if the current char is a terminal, skip creating a
885 if (classify[(int)c]) {
888 * Lastly we need to create a new child trie
889 * entry that represents the new way forward
890 * from the point that we diverged. For the
891 * "hello" / "help" case, this guy will start
892 * as a child of "hel" with the single
893 * character match 'p'.
895 * Since he becomes the current parser context,
896 * more symbol characters may be coming to make
897 * him into, eg, "helping", in which case he
898 * will acquire a suffix eventually of "ping"
899 * via the aggregation stuff
902 e = lws_fts_entry_child_add(t, c, t->parser);
904 lwsl_err("%s: child_add fail2\n",
910 /* go on following this path */
916 t->str_match_pos = 0;
921 /* this is intended to be a seal */
927 if (t->aggregate && t->agg_pos) {
929 /* if nothing in agg[]: leave as single char match */
931 /* otherwise copy out the symbol aggregation */
932 t->parser->suffix = lwsac_use(&t->lwsac_head,
934 TRIE_LWSAC_BLOCK_SIZE);
935 if (!t->parser->suffix) {
936 lwsl_err("%s: lac for suffix failed\n",
941 /* add the first char at the beginning */
942 *t->parser->suffix = t->parser->c;
943 /* and then add the agg buffer stuff */
944 memcpy(t->parser->suffix + 1, t->agg, t->agg_pos);
945 t->parser->suffix_len = t->agg_pos + 1;
949 if (t->parser == t->root) /* multiple terminal chars */
952 if (!t->parser->inst_file_list ||
953 t->parser->inst_file_list->file_index != file_index) {
954 tif = lwsac_use(&t->lwsac_input_head, sizeof(*tif),
955 TRIE_LWSAC_BLOCK_SIZE);
957 lwsl_err("%s: lac for tif failed\n",
962 tif->file_index = file_index;
963 tif->owner = t->parser;
964 tif->lines_list = NULL;
965 tif->lines_tail = NULL;
968 tif->inst_file_next = t->tif_list;
971 t->parser->inst_file_list = tif;
975 * A naive allocation strategy for this leads to 50% of the
976 * total inmem lac allocation being for line numbers...
978 * It's mainly solved by only holding the instance and line
979 * number tables for the duration of a file being input, as soon
980 * as one input file is finished it is written to disk.
982 * For the common case of 1 - ~3 matches the line number are
983 * stored in a small VLI array inside the filepath inst. If the
984 * next one won't fit, it allocates a line number struct with
985 * more vli space and continues chaining those if needed.
988 n = wq32(vlibuf, t->line_number);
989 tif = t->parser->inst_file_list;
991 if (!tif->lines_list) {
992 /* we are still trying to use the file inst vli */
993 if (LWS_ARRAY_SIZE(tif->vli) - tif->count >= n) {
994 tif->count += wq32(tif->vli + tif->count,
998 /* we are going to have to allocate */
1001 /* can we add to an existing line numbers struct? */
1002 if (tif->lines_tail &&
1003 LWS_ARRAY_SIZE(tif->lines_tail->vli) -
1004 tif->lines_tail->count >= n) {
1005 tif->lines_tail->count += wq32(tif->lines_tail->vli +
1006 tif->lines_tail->count,
1011 /* either no existing line numbers struct at tail, or full */
1013 /* have to create a(nother) line numbers struct */
1014 tl = lwsac_use(&t->lwsac_input_head, sizeof(*tl),
1015 TRIE_LWSAC_BLOCK_SIZE);
1017 lwsl_err("%s: lac for tl failed\n", __func__);
1020 tl->lines_next = NULL;
1021 if (tif->lines_tail)
1022 tif->lines_tail->lines_next = tl;
1024 tif->lines_tail = tl;
1025 if (!tif->lines_list)
1026 tif->lines_list = tl;
1028 tl->count = wq32(tl->vli, t->line_number);
1034 const char *ne = name_entry(t->parser, s, sizeof(s));
1036 if (!strcmp(ne, "describ")) {
1037 lwsl_err(" %s %d\n", ne, t->str_match_pos);
1038 write(1, buf - 10, 20);
1042 t->parser->instance_count++;
1043 t->parser = t->root;
1044 t->str_match_pos = 0;
1047 /* seal off the line length table block */
1050 if (write(t->fd, linetable, bp) != bp)
1056 if (lseek(t->fd, lbh, SEEK_SET) < 0) {
1057 lwsl_err("%s: seek to 0x%llx failed\n", __func__,
1058 (unsigned long long)lbh);
1062 g16(linetable, t->c - lbh);
1063 g16(linetable + 2, t->line_number - sline);
1064 g32(linetable + 4, chars);
1065 if (write(t->fd, linetable, 8) != 8) {
1066 lwsl_err("%s: write linetable header failed\n", __func__);
1070 assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
1072 if (lseek(t->fd, t->c, SEEK_SET) < 0) {
1073 lwsl_err("%s: end seek failed\n", __func__);
1080 t->lines_in_unsealed_linetable = 0;
1084 /* dump the collected per-input instance and line data, and free it */
1086 t->agg_trie_creation_us += lws_now_usecs() - tf;
1091 /* refer to ./README.md */
1094 lws_fts_serialize(struct lws_fts *t)
1096 struct lws_fts_filepath *fp = t->filepath_list, *ofp;
1097 unsigned long long tf = lws_now_usecs();
1098 struct lws_fts_entry *e, *e1, *s[256];
1099 unsigned char buf[8192], stasis;
1100 int n, bp, sp = 0, do_parent;
1103 finalize_per_input(t);
1106 * Compute aggregated instance counts (parents should know the total
1107 * number of instances below each child path)
1112 * (root) -> (c1) -> (c2)
1115 * we need to visit the nodes in the order
1126 /* aggregate in every antecedent */
1128 for (n = 0; n <= sp; n++) {
1129 s[n]->agg_inst_count += s[sp]->instance_count;
1130 s[n]->agg_child_count += s[sp]->child_count;
1133 /* handle any children before the parent */
1135 if (s[sp]->child_list) {
1136 if (sp + 1 == LWS_ARRAY_SIZE(s)) {
1137 lwsl_err("Stack too deep\n");
1142 s[sp + 1] = s[sp]->child_list;
1148 if (s[sp]->sibling) {
1149 s[sp] = s[sp]->sibling;
1156 /* dump the filepaths and set prev */
1158 fp = t->filepath_list;
1163 fp->ofs = t->c + bp;
1164 n = (int)strlen(fp->filepath);
1167 bp += wq32(&buf[bp], fp->line_table_ofs);
1168 bp += wq32(&buf[bp], fp->total_lines);
1169 bp += wq32(&buf[bp], n);
1170 memcpy(&buf[bp], fp->filepath, n);
1180 /* record the fileoffset of the filepath map and filepath count */
1182 if (lseek(t->fd, 0xc, SEEK_SET) < 0)
1185 g32(buf, t->c + bp);
1186 g32(buf + 4, t->next_file_index);
1187 if (write(t->fd, buf, 8) != 8)
1190 if (lseek(t->fd, t->c + bp, SEEK_SET) < 0)
1193 /* dump the filepath map, starting from index 0, which is at the tail */
1199 g32(buf + bp, fp->ofs);
1206 * The trie entries in reverse order... because of the reversal, we have
1207 * always written children first, and marked them with their file offset
1208 * before we come to refer to them.
1217 /* handle any children before the parent */
1219 if (!do_parent && s[sp]->child_list) {
1221 if (sp + 1 == LWS_ARRAY_SIZE(s)) {
1222 lwsl_err("Stack too deep\n");
1227 s[sp + 1] = s[sp]->child_list;
1232 /* leaf nodes with no children */
1237 /* write the trie entry header */
1239 spill((3 * MAX_VLI), 0);
1241 bp += wq32(&buf[bp], e->ofs_last_inst_file);
1242 bp += wq32(&buf[bp], e->child_count);
1243 bp += wq32(&buf[bp], e->instance_count);
1244 bp += wq32(&buf[bp], e->agg_inst_count);
1246 /* sort the children in order of highest aggregate hits first */
1249 struct lws_fts_entry **pe, *te1, *te2;
1253 /* bubble sort keeps going until nothing changed */
1255 pe = &e->child_list;
1261 if (te2 && te1->agg_inst_count <
1262 te2->agg_inst_count) {
1266 te1->sibling = te2->sibling;
1270 pe = &(*pe)->sibling;
1275 /* write the children */
1279 spill((5 * MAX_VLI) + e1->suffix_len + 1, 0);
1281 bp += wq32(&buf[bp], e1->ofs);
1282 bp += wq32(&buf[bp], e1->instance_count);
1283 bp += wq32(&buf[bp], e1->agg_inst_count);
1284 bp += wq32(&buf[bp], e1->agg_child_count);
1286 if (e1->suffix) { /* string */
1287 bp += wq32(&buf[bp], e1->suffix_len);
1288 memmove(&buf[bp], e1->suffix, e1->suffix_len);
1289 bp += e1->suffix_len;
1291 bp += wq32(&buf[bp], 1);
1295 if (e1->suffix && e1->suffix_len == 3 &&
1296 !memcmp(e1->suffix, "cri", 3)) {
1297 struct lws_fts_entry *e2;
1302 lwsl_notice("%s\n", e2->suffix);
1304 lwsl_notice("%c\n", e2->c);
1309 lwsl_err("*** %c CRI inst %d ch %d\n", e1->parent->c,
1310 e1->instance_count, e1->child_count);
1316 /* if there are siblings, do those next */
1324 s[sp] = s[sp]->sibling;
1326 /* if there are no siblings, do the parent */
1328 s[sp] = s[sp]->parent;
1334 assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
1336 /* drop the correct root trie offset + file length into the header */
1338 if (lseek(t->fd, 4, SEEK_SET) < 0) {
1339 lwsl_err("%s: unable to seek\n", __func__);
1344 g32(buf, t->root->ofs);
1346 if (write(t->fd, buf, 0x8) != 0x8)
1349 lwsl_notice("%s: index %d files (%uMiB) cpu time %dms, "
1350 "alloc: %dKiB + %dKiB, "
1351 "serialize: %dms, file: %dKiB\n", __func__,
1353 (int)(t->agg_raw_input / (1024 * 1024)),
1354 (int)(t->agg_trie_creation_us / 1000),
1355 (int)(lwsac_total_alloc(t->lwsac_head) / 1024),
1356 (int)(t->worst_lwsac_input_size / 1024),
1357 (int)((lws_now_usecs() - tf) / 1000),
1358 (int)(t->c / 1024));
1363 lwsl_err("%s: problem seekings\n", __func__);