From 6a1513fb5fc41f7a3d965e26e094d9c7b6579212 Mon Sep 17 00:00:00 2001 From: Michael Matz Date: Thu, 6 Dec 2007 03:03:29 +0000 Subject: [PATCH 1/1] prefix-code the string store, difference-code the dependency idarrays. V1 solv files still can be loaded. --- src/pool.h | 2 +- src/pooltypes.h | 2 + src/repo_solv.c | 86 ++++++++++++++++++++------ tools/repo_write.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++------- 4 files changed, 224 insertions(+), 40 deletions(-) diff --git a/src/pool.h b/src/pool.h index 602a971..dcdc805 100644 --- a/src/pool.h +++ b/src/pool.h @@ -125,7 +125,7 @@ struct _Pool { #define TYPE_IDARRAY 2 #define TYPE_STR 3 #define TYPE_U32 4 -#define TYPE_BITMAP 128 +#define TYPE_REL_IDARRAY 5 //----------------------------------------------- diff --git a/src/pooltypes.h b/src/pooltypes.h index df6145f..afacf62 100644 --- a/src/pooltypes.h +++ b/src/pooltypes.h @@ -16,8 +16,10 @@ /* version number for .solv files */ #define SOLV_VERSION_0 0 #define SOLV_VERSION_1 1 +#define SOLV_VERSION_2 2 #define SOLV_FLAG_PACKEDSIZES 1 #define SOLV_FLAG_VERTICAL 2 +#define SOLV_FLAG_PREFIX_POOL 4 struct _Stringpool; typedef struct _Stringpool Stringpool; diff --git a/src/repo_solv.c b/src/repo_solv.c index 57b4f9c..4927e65 100644 --- a/src/repo_solv.c +++ b/src/repo_solv.c @@ -29,7 +29,7 @@ #define INTERESTED_START SOLVABLE_NAME #define INTERESTED_END SOLVABLE_FRESHENS -Pool *mypool; /* for pool_debug... */ +static Pool *mypool; /* for pool_debug... */ /*-----------------------------------------------------------------*/ /* .solv read functions */ @@ -116,10 +116,11 @@ read_id(FILE *fp, Id max) */ static Id * -read_idarray(FILE *fp, Id max, Id *map, Id *store, Id *end) +read_idarray(FILE *fp, Id max, Id *map, Id *store, Id *end, int relative) { unsigned int x = 0; int c; + Id old = 0; for (;;) { c = getc(fp); @@ -131,18 +132,31 @@ read_idarray(FILE *fp, Id max, Id *map, Id *store, Id *end) if ((c & 128) == 0) { x = (x << 6) | (c & 63); - if (x >= max) + if (!relative || x != 1) { - pool_debug(mypool, SAT_FATAL, "read_idarray: id too large (%u/%u)\n", x, max); - exit(1); + if (relative) + { + x--; + x += old; + old = x; + } + if (x >= max) + { + pool_debug(mypool, SAT_FATAL, "read_idarray: id too large (%u/%u)\n", x, max); + exit(1); + } + if (map) + x = map[x]; } + else + /* (relative && x==1) : + Ugly PREREQ handling. See repo_write.c. */ + x = SOLVABLE_PREREQMARKER, old = 1; if (store == end) { pool_debug(mypool, SAT_FATAL, "read_idarray: array overflow\n"); exit(1); } - if (map) - x = map[x]; *store++ = x; if ((c & 64) == 0) { @@ -204,6 +218,7 @@ repo_add_solv(Repo *repo, FILE *fp) Offset ido; Solvable *s; unsigned int solvflags; + unsigned int solvversion; struct key *keys; Id *schemadata, *schemadatap, *schemadataend; Id *schemata, key; @@ -215,10 +230,15 @@ repo_add_solv(Repo *repo, FILE *fp) pool_debug(pool, SAT_FATAL, "not a SOLV file\n"); exit(1); } - if (read_u32(fp) != SOLV_VERSION_1) + solvversion = read_u32(fp); + switch (solvversion) { - pool_debug(pool, SAT_FATAL, "unsupported SOLV version\n"); - exit(1); + case SOLV_VERSION_1: + case SOLV_VERSION_2: + break; + default: + pool_debug(pool, SAT_FATAL, "unsupported SOLV version\n"); + exit(1); } pool_freeidhashes(pool); @@ -264,10 +284,38 @@ repo_add_solv(Repo *repo, FILE *fp) * read new repo at end of pool */ - if (fread(strsp, sizeid, 1, fp) != 1) + if ((solvflags & SOLV_FLAG_PREFIX_POOL) == 0) { - pool_debug(pool, SAT_FATAL, "read error while reading strings\n"); - exit(1); + if (fread(strsp, sizeid, 1, fp) != 1) + { + pool_debug(pool, SAT_FATAL, "read error while reading strings\n"); + exit(1); + } + } + else + { + unsigned int pfsize = read_u32 (fp); + char *prefix = xmalloc (pfsize); + char *pp = prefix; + char *old_str = ""; + char *dest = strsp; + if (fread (prefix, pfsize, 1, fp) != 1) + { + pool_debug(pool, SAT_FATAL, "read error while reading strings\n"); + exit(1); + } + for (i = 1; i < numid; i++) + { + int same = (unsigned char)*pp++; + size_t len = strlen (pp) + 1; + if (same) + memcpy (dest, old_str, same); + memcpy (dest + same, pp, len); + pp += len; + old_str = dest; + dest += same + len; + } + xfree (prefix); } strsp[sizeid] = 0; /* make string space \0 terminated */ sp = strsp; @@ -449,7 +497,7 @@ repo_add_solv(Repo *repo, FILE *fp) for (i = 0; i < numschemata; i++) { schemata[i] = schemadatap - schemadata; - schemadatap = read_idarray(fp, numid, 0, schemadatap, schemadataend); + schemadatap = read_idarray(fp, numid, 0, schemadatap, schemadataend, 0); } /******* Part 5: Info ***********************************************/ @@ -473,6 +521,7 @@ repo_add_solv(Repo *repo, FILE *fp) ; break; case TYPE_IDARRAY: + case TYPE_REL_IDARRAY: while ((read_u8(fp) & 0xc0) != 0) ; break; @@ -497,7 +546,8 @@ repo_add_solv(Repo *repo, FILE *fp) for (i = 1; i < numkeys; i++) { id = keys[i].name; - if (keys[i].type == TYPE_IDARRAY && id >= INTERESTED_START && id <= INTERESTED_END) + if ((keys[i].type == TYPE_IDARRAY || keys[i].type == TYPE_REL_IDARRAY) + && id >= INTERESTED_START && id <= INTERESTED_END) size_idarray += keys[i].size; } @@ -574,6 +624,7 @@ repo_add_solv(Repo *repo, FILE *fp) ; break; case TYPE_IDARRAY: + case TYPE_REL_IDARRAY: if (id < INTERESTED_START || id > INTERESTED_END) { /* not interested in array */ @@ -582,7 +633,7 @@ repo_add_solv(Repo *repo, FILE *fp) break; } ido = idarraydatap - repo->idarraydata; - idarraydatap = read_idarray(fp, numid + numrel, idmap, idarraydatap, idarraydataend); + idarraydatap = read_idarray(fp, numid + numrel, idmap, idarraydatap, idarraydataend, type == TYPE_REL_IDARRAY); if (id == SOLVABLE_PROVIDES) s->provides = ido; else if (id == SOLVABLE_OBSOLETES) @@ -653,6 +704,7 @@ repo_add_solv(Repo *repo, FILE *fp) ; break; case TYPE_IDARRAY: + case TYPE_REL_IDARRAY: if (id < INTERESTED_START || id > INTERESTED_END) { /* not interested in array */ @@ -661,7 +713,7 @@ repo_add_solv(Repo *repo, FILE *fp) break; } ido = idarraydatap - repo->idarraydata; - idarraydatap = read_idarray(fp, numid + numrel, idmap, idarraydatap, idarraydataend); + idarraydatap = read_idarray(fp, numid + numrel, idmap, idarraydatap, idarraydataend, keys[key].type == TYPE_REL_IDARRAY); if (id == SOLVABLE_PROVIDES) s->provides = ido; else if (id == SOLVABLE_OBSOLETES) diff --git a/tools/repo_write.c b/tools/repo_write.c index 85f6b23..d16b6c0 100644 --- a/tools/repo_write.c +++ b/tools/repo_write.c @@ -21,11 +21,14 @@ #include #include #include +#include #include "pool.h" #include "util.h" #include "repo_write.h" +/* #define IGNORE_NEED_FREQUENCY */ + /*------------------------------------------------------------------*/ /* Id map optimizations */ @@ -95,6 +98,32 @@ needid_cmp_need(const void *ap, const void *bp) return a->map - b->map; } +static Pool *cmp_pool; +static int +needid_cmp_need_s(const void *ap, const void *bp) +{ + const NeedId *a = ap; + const NeedId *b = bp; + if (a == b) + return 0; +#ifdef IGNORE_NEED_FREQUENCY + if (a->need == 0) + return 1; + else if (b->need == 0) + return -1; +#else + int r; + r = b->need - a->need; + if (r) + return r; +#endif + char *as = cmp_pool->ss.stringspace + cmp_pool->ss.strings[a->map]; + char *bs = cmp_pool->ss.stringspace + cmp_pool->ss.strings[b->map]; + size_t alen = strlen (as); + size_t blen = strlen (bs); + return memcmp (as, bs, alen < blen ? alen + 1 : blen + 1); +} + /*------------------------------------------------------------------*/ /* output routines */ @@ -188,6 +217,82 @@ write_idarray(FILE *fp, Pool *pool, NeedId *needid, Id *ids) } } +static int +cmp_ids (const void *pa, const void *pb) +{ + Id a = *(Id *)pa; + Id b = *(Id *)pb; + return a - b; +} + +static void +write_idarray_sort(FILE *fp, Pool *pool, NeedId *needid, Id *ids) +{ + int len, i; + if (!ids) + return; + if (!*ids) + { + write_u8 (fp, 0); + return; + } + /* If we ever share idarrays we can't do this in-place. */ + for (len = 0; ids[len]; len++) + { + Id id = ids[len]; + if (needid) + ids[len] = needid[ISRELDEP(id) ? RELOFF(id) : id].need; + } + + /* That bloody solvable:prereqmarker needs to stay in position :-( */ + Id prereq = SOLVABLE_PREREQMARKER; + if (needid) + prereq = needid[prereq].need; + for (i = 0; i < len; i++) + if (ids[i] == prereq) + break; + if (i > 1) + qsort (ids, i, sizeof (Id), cmp_ids); + if ((len - i) > 2) + qsort (ids + i + 1, len - i - 1, sizeof (Id), cmp_ids); + + Id old = 0; + for (i = 0; i < len; i++) + /* Ugly PREREQ handling. A "difference" of 1 is the prereq marker, + hence all real differences are offsetted by 1. Otherwise we would + have to handle negative differences, which would cost code space for + the encoding of the sign. We loose the exact mapping of prereq here, + but we know the result, so we can recover from that in the reader. */ + if (ids[i] == prereq) + old = ids[i] = 1; + else + { + ids[i] -= old; + old = ids[i] + old; + /* difference is zero --> we have multiple equal elements in the list */ + assert (ids[i] > 0); + ids[i]++; + } + + /* The differencing above produces many runs of ones and twos. I tried + fairly elaborate schemes to RLE those, but they give only very mediocre + improvements in compression, as coding the escapes costs quite some + space. Even if they are coded only as bits in IDs. The best improvement + was about 2.7% for the whole .solv file. It's probably better to + invest some complexity into sharing idarrays, than RLEing. */ + for (;;) + { + Id id = *ids++; + if (id >= 64) + id = (id & 63) | ((id & ~63) << 1); + if (!*ids) + { + write_id(fp, id); + return; + } + write_id(fp, id | 64); + } +} /* * Repo @@ -202,6 +307,7 @@ repo_write(Repo *repo, FILE *fp) NeedId *needid; int nstrings, nrels, nkeys, nschemata; unsigned int sizeid; + unsigned int solv_flags; Reldep *ran; Id *idarraydata; @@ -275,7 +381,8 @@ repo_write(Repo *repo, FILE *fp) for (i = 0; i < pool->ss.nstrings + pool->nrels; i++) needid[i].map = i; - qsort(needid + 1, pool->ss.nstrings - 1, sizeof(*needid), needid_cmp_need); + cmp_pool = pool; + qsort(needid + 1, pool->ss.nstrings - 1, sizeof(*needid), needid_cmp_need_s); qsort(needid + pool->ss.nstrings, pool->nrels, sizeof(*needid), needid_cmp_need); sizeid = 0; @@ -400,7 +507,7 @@ repo_write(Repo *repo, FILE *fp) /* write file header */ write_u32(fp, 'S' << 24 | 'O' << 16 | 'L' << 8 | 'V'); - write_u32(fp, SOLV_VERSION_1); + write_u32(fp, SOLV_VERSION_2); /* write counts */ write_u32(fp, nstrings); @@ -409,25 +516,48 @@ repo_write(Repo *repo, FILE *fp) write_u32(fp, nkeys); write_u32(fp, nschemata); write_u32(fp, 0); /* no info block */ + solv_flags = 0; + solv_flags |= SOLV_FLAG_PREFIX_POOL; #if 0 - write_u32(fp, SOLV_FLAG_VERTICAL); /* no flags */ -#else - write_u32(fp, 0); /* no flags */ + solv_flags |= SOLV_FLAG_VERTICAL; #endif + write_u32(fp, solv_flags); + + /* Build the prefix-encoding of the string pool. We need to know + the size of that before writing it to the file, so we have to + build a separate buffer for that. As it's temporarily possible + that this actually is an expansion we can't easily reuse the + stringspace for this. The max expansion per string is 1 byte, + so it will fit into sizeid+nstrings bytes. */ + char *prefix = xmalloc (sizeid + nstrings); + char *pp = prefix; + char *old_str = ""; + for (i = 1; i < nstrings; i++) + { + char *str = pool->ss.stringspace + pool->ss.strings[needid[i].map]; + int same; + size_t len; + for (same = 0; same < 255; same++) + if (old_str[same] != str[same]) + break; + *pp++ = same; + len = strlen (str + same) + 1; + memcpy (pp, str + same, len); + pp += len; + old_str = str; + } /* * write strings */ write_u32(fp, sizeid); - for (i = 1; i < nstrings; i++) + write_u32(fp, pp - prefix); + if (fwrite(prefix, pp - prefix, 1, fp) != 1) { - char *str = pool->ss.stringspace + pool->ss.strings[needid[i].map]; - if (fwrite(str, strlen(str) + 1, 1, fp) != 1) - { - perror("write error"); - exit(1); - } + perror("write error"); + exit(1); } + xfree (prefix); /* * write RelDeps @@ -449,7 +579,7 @@ repo_write(Repo *repo, FILE *fp) continue; write_id(fp, needid[i].need); if (i >= SOLVABLE_PROVIDES && i <= SOLVABLE_FRESHENS) - write_id(fp, TYPE_IDARRAY); + write_id(fp, TYPE_REL_IDARRAY); else if (i == RPM_RPMDBID) write_id(fp, TYPE_U32); else @@ -564,23 +694,23 @@ repo_write(Repo *repo, FILE *fp) if (s->vendor) write_id(fp, needid[s->vendor].need); if (s->provides) - write_idarray(fp, pool, needid, idarraydata + s->provides); + write_idarray_sort(fp, pool, needid, idarraydata + s->provides); if (s->obsoletes) - write_idarray(fp, pool, needid, idarraydata + s->obsoletes); + write_idarray_sort(fp, pool, needid, idarraydata + s->obsoletes); if (s->conflicts) - write_idarray(fp, pool, needid, idarraydata + s->conflicts); + write_idarray_sort(fp, pool, needid, idarraydata + s->conflicts); if (s->requires) - write_idarray(fp, pool, needid, idarraydata + s->requires); + write_idarray_sort(fp, pool, needid, idarraydata + s->requires); if (s->recommends) - write_idarray(fp, pool, needid, idarraydata + s->recommends); + write_idarray_sort(fp, pool, needid, idarraydata + s->recommends); if (s->suggests) - write_idarray(fp, pool, needid, idarraydata + s->suggests); + write_idarray_sort(fp, pool, needid, idarraydata + s->suggests); if (s->supplements) - write_idarray(fp, pool, needid, idarraydata + s->supplements); + write_idarray_sort(fp, pool, needid, idarraydata + s->supplements); if (s->enhances) - write_idarray(fp, pool, needid, idarraydata + s->enhances); + write_idarray_sort(fp, pool, needid, idarraydata + s->enhances); if (s->freshens) - write_idarray(fp, pool, needid, idarraydata + s->freshens); + write_idarray_sort(fp, pool, needid, idarraydata + s->freshens); if (repo->rpmdbid) write_u32(fp, repo->rpmdbid[i - repo->start]); } -- 2.7.4