"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
Re: optimise reading the file index
To:
Omar Polo <op@omarpolo.com>
Cc:
Mark Jamsek <mark@jamsek.com>, "Todd C. Miller" <millert@openbsd.org>, Christian Weisgerber <naddy@mips.inka.de>, gameoftrees@openbsd.org
Date:
Sun, 30 Jul 2023 12:34:52 +0200

Download raw body.

Thread
  • Mark Jamsek:

    optimise reading the file index

  • On Sun, Jul 30, 2023 at 11:45:45AM +0200, Omar Polo wrote:
    > Another approach would be to use a different data structure.
    
    If you are interested in exploring this further:
    
    I have already adapted code from lib/object_idset.c into a generic
    hash table data structure in order to solve an unrelated issue in
    nsh (storing environment variables). This adapted code is below.
    
    If you want to drop that into lib/hashtable.c and use it instead of an
    RB tree you would get O(1) file index lookups, and you could use a
    custom routine to populate an initial table from the file index, pre-sized
    such that you can insert everything without table re-allocations.
    
    This hash table can be used with C structs or C strings as keys/values.
    Just keep in mind that the caller has to manage memory for keys and values.
    
    Table creation:
    
    	t = hashtable_alloc();
    	if (t == NULL) {
    		printf("hashtable_alloc: %s", strerror(errno));
    		return -1;
    	}
    
    Insertion as used in nsh (to store environment variables):
    
    	void *name0;
    	char *name = strdup("key=value");
    	char *eq = strchr(name, '=');
    	char *value = eq + 1;
    
    	*eq = '\0';
    
    	/* Try to remove first in case of updating an existing entry. */
    	if (hashtable_remove(t, &name0, NULL, NULL,
    	    name, strlen(name)) == 0)
    		free(name0);
    
    	if (hashtable_add(t, name, strlen(name), value, strlen(value))) {
    		printf("%s: hashtable_add(\"%s\", \"%s\") failed\n",
    		    __func__, name, value);
    		free(name);
    	}
    
    Iterate:
    
    	static int
    	show_entry(void *keyptr, size_t keysize, void *value, size_t valsize,
    	    void *arg)
    	{
    		FILE *f = arg;
    		char *name = keyptr;
    		char *val = value;
    		int ret;
    
    		ret = fprintf(f, "%s=%s\n", name, val);
    		if (ret != keysize + valsize + 2)
    			return -1;
    		return 0;
    	}
    
    	[...]
    
    	hashtable_foreach(t, show_entry, stdout);
    
    Patch against the nsh repo, apply to Got manually as needed.
    
    Might want to re-add got-style error handling which I had to rip out.
    And if we add this hash table into Got we might as well rewrite
    lib/object_idset.c on top of this. I could do all of that later.
     
    -----------------------------------------------
     add a hash table implementation
     
    diff 574d5c2d6a16ae5600d65fedc157d03e7fcfdd43 455c4f1f69829f815e61e1fba31766f8289ddaba
    commit - 574d5c2d6a16ae5600d65fedc157d03e7fcfdd43
    commit + 455c4f1f69829f815e61e1fba31766f8289ddaba
    blob - d0ea33ff297130b499703d8f068c954783abcdb3
    blob + 452d823c43405e462983e5c13eee67d7081cf956
    --- Makefile
    +++ Makefile
    @@ -30,7 +30,7 @@ SRCS+=helpcommands.c makeargv.c
     SRCS+=bridge.c tunnel.c media.c sysctl.c passwd.c pfsync.c carp.c
     SRCS+=trunk.c who.c more.c stringlist.c utils.c sqlite3.c ppp.c prompt.c
     SRCS+=nopt.c pflow.c wg.c nameserver.c ndp.c umb.c utf8.c cmdargs.c ctlargs.c
    -SRCS+=helpcommands.c makeargv.c
    +SRCS+=helpcommands.c makeargv.c hashtable.c
     CLEANFILES+=compile.c
     LDADD=-lutil -ledit -ltermcap -lsqlite3 -L/usr/local/lib #-static
     
    blob - a1f2ecea9d8252e146f3e086eba407199cdc685d
    blob + bd1b0df0dfb34f5aa2493ebcd17c49cfc2d6ba1b
    --- externs.h
    +++ externs.h
    @@ -563,3 +563,19 @@ char **step_optreq(char **, char **, int, char **, int
     /* ctlargs.c */
     int pr_prot1(int, char **);
     char **step_optreq(char **, char **, int, char **, int);
    +
    +/* hashtable.c */
    +struct hashtable;
    +struct hashtable *hashtable_alloc(void);
    +void hashtable_free(struct hashtable *);
    +int hashtable_add(struct hashtable *, void *key, size_t keysize,
    +    void *value, size_t valsize);
    +void *hashtable_get_keyptr(struct hashtable *, void *key, size_t keysize);
    +void *hashtable_get_value(struct hashtable *, void *key, size_t keysize);
    +int hashtable_remove(struct hashtable *, void **keyptr, void **value,
    +    size_t *valsize, void *key, size_t keysize);
    +int hashtable_contains(struct hashtable *, void *key, size_t keysize);
    +int hashtable_foreach(struct hashtable *,
    +    int (*cb)(void *, size_t, void *, size_t, void *),
    +    void *cb_arg);
    +int hashtable_num_entries(struct hashtable *);
    blob - /dev/null
    blob + c9dd913ad5963e10318a72705c3dc7f25e00c8c3 (mode 644)
    --- /dev/null
    +++ hashtable.c
    @@ -0,0 +1,304 @@
    +/*
    + * Copyright (c) 2023 Stefan Sperling <stsp@openbsd.org>
    + *
    + * Permission to use, copy, modify, and distribute this software for any
    + * purpose with or without fee is hereby granted, provided that the above
    + * copyright notice and this permission notice appear in all copies.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
    + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
    + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
    + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
    + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
    + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
    + */
    +
    +#include <sys/queue.h>
    +
    +#include <stdio.h>
    +#include <string.h>
    +#include <stdlib.h>
    +#include <limits.h>
    +#include <errno.h>
    +#include <siphash.h>
    +
    +#include "externs.h"
    +
    +struct hashentry {
    +	STAILQ_ENTRY(hashentry) entry;
    +	void *key;
    +	size_t keysize;
    +	void *value;
    +	size_t valsize;
    +};
    +
    +STAILQ_HEAD(hashbucket, hashentry);
    +
    +#define HASHTABLE_MIN_BUCKETS	64
    +
    +struct hashtable {
    +	struct hashbucket *buckets;
    +	size_t nbuckets;
    +	unsigned int nentries;
    +	unsigned int flags;
    +#define HASHTABLE_F_TRAVERSAL	0x01
    +#define HASHTABLE_F_NOMEM	0x02
    +	SIPHASH_KEY siphash_key;
    +};
    +
    +static uint64_t
    +entry_hash(struct hashtable *t, void *key, size_t keysize)
    +{
    +	return SipHash24(&t->siphash_key, key, keysize);
    +}
    +
    +struct hashtable *
    +hashtable_alloc(void)
    +{
    +	struct hashtable *t;
    +	const size_t nbuckets = HASHTABLE_MIN_BUCKETS;
    +	size_t i;
    +
    +	t = calloc(1, sizeof(*t));
    +	if (t == NULL)
    +		return NULL;
    +
    +	t->buckets = calloc(nbuckets, sizeof(t->buckets[0]));
    +	if (t->buckets == NULL) {
    +		free(t);
    +		return NULL;
    +	}
    +
    +	for (i = 0; i < nbuckets; i++)
    +		STAILQ_INIT(&t->buckets[i]);
    +	t->nbuckets = nbuckets;
    +	arc4random_buf(&t->siphash_key, sizeof(t->siphash_key));
    +
    +	return t;
    +}
    +
    +void
    +hashtable_free(struct hashtable *t)
    +{
    +	size_t i;
    +	struct hashentry *e;
    +
    +	for (i = 0; i < t->nbuckets; i++) {
    +		while (!STAILQ_EMPTY(&t->buckets[i])) {
    +			e = STAILQ_FIRST(&t->buckets[i]);
    +			STAILQ_REMOVE(&t->buckets[i], e, hashentry, entry);
    +			free(e);
    +		}
    +	}
    +	/* Storage for keys and values should be freed by caller. */
    +	free(t->buckets);
    +	free(t);
    +}
    +
    +static int
    +table_resize(struct hashtable *t, size_t nbuckets)
    +{
    +	struct hashbucket *buckets;
    +	size_t i;
    +
    +	buckets = calloc(nbuckets, sizeof(buckets[0]));
    +	if (buckets == NULL) {
    +		if (errno != ENOMEM)
    +			return -1;
    +		/* Proceed with our current amount of hash buckets. */
    +		t->flags |= HASHTABLE_F_NOMEM;
    +		return 0;
    +	}
    +
    +	for (i = 0; i < nbuckets; i++)
    +		STAILQ_INIT(&buckets[i]);
    +
    +	arc4random_buf(&t->siphash_key, sizeof(t->siphash_key));
    +
    +	for (i = 0; i < t->nbuckets; i++) {
    +		while (!STAILQ_EMPTY(&t->buckets[i])) {
    +			struct hashentry *e;
    +			uint64_t idx;
    +			e = STAILQ_FIRST(&t->buckets[i]);
    +			STAILQ_REMOVE(&t->buckets[i], e, hashentry, entry);
    +			idx = entry_hash(t, e->key, e->keysize) % nbuckets;
    +			STAILQ_INSERT_HEAD(&buckets[idx], e, entry);
    +		}
    +	}
    +
    +	free(t->buckets);
    +	t->buckets = buckets;
    +	t->nbuckets = nbuckets;
    +	return 0;
    +}
    +
    +static int
    +table_grow(struct hashtable *t)
    +{
    +	size_t nbuckets;
    +
    +	if (t->flags & HASHTABLE_F_NOMEM)
    +		return 0;
    +
    +	if (t->nbuckets >= UINT_MAX / 2)
    +		nbuckets = UINT_MAX;
    +	else
    +		nbuckets = t->nbuckets * 2;
    +
    +	return table_resize(t, nbuckets);
    +}
    +
    +int
    +hashtable_add(struct hashtable *t, void *key, size_t keysize,
    +    void *value, size_t valsize)
    +{
    +	struct hashentry *e;
    +	uint64_t idx;
    +	struct hashbucket *bucket;
    +
    +	/*
    +	 * Do not allow adding more entries during traversal.
    +	 * This function may resize the table.
    +	 */
    +	if (t->flags & HASHTABLE_F_TRAVERSAL)
    +		return -1;
    +
    +	if (t->nentries == UINT_MAX)
    +		return -1;
    +
    +	idx = entry_hash(t, key, keysize) % t->nbuckets;
    +	bucket = &t->buckets[idx];
    +
    +	/* Require unique keys. */
    +	STAILQ_FOREACH(e, bucket, entry) {
    +		if (e->keysize == keysize && memcmp(e->key, key, keysize) == 0)
    +			return -1;
    +	}
    +
    +	e = calloc(1, sizeof(*e));
    +	if (e == NULL)
    +		return -1;
    +
    +	e->key = key;
    +	e->keysize = keysize;
    +	e->value = value;
    +	e->valsize = valsize;
    +
    +	STAILQ_INSERT_HEAD(bucket, e, entry);
    +	t->nentries++;
    +
    +	if (t->nbuckets < t->nentries)
    +		if (table_grow(t) == -1)
    +			return -1;
    +
    +	return 0;
    +}
    +
    +static struct hashentry *
    +find_entry(struct hashtable *t, void *key, size_t keysize)
    +{
    +	uint64_t idx = entry_hash(t, key, keysize) % t->nbuckets;
    +	struct hashbucket *bucket = &t->buckets[idx];
    +	struct hashentry *e;
    +
    +	STAILQ_FOREACH(e, bucket, entry) {
    +		if (e->keysize == keysize && memcmp(e->key, key, keysize) == 0)
    +			return e;
    +	}
    +
    +	return NULL;
    +}
    +
    +void *
    +hashtable_get_keyptr(struct hashtable *t, void *key, size_t keysize)
    +{
    +	struct hashentry *e = find_entry(t, key, keysize);
    +	return e ? e->key : NULL;
    +}
    +
    +void *
    +hashtable_get_value(struct hashtable *t, void *key, size_t keysize)
    +{
    +	struct hashentry *e = find_entry(t, key, keysize);
    +	return e ? e->value : NULL;
    +}
    +
    +int
    +hashtable_remove(struct hashtable *t, void **keyptr, void **value,
    +    size_t *valsize, void *key, size_t keysize)
    +{
    +	uint64_t idx;
    +	struct hashbucket *bucket;
    +	struct hashentry *e;
    +
    +	if (keyptr)
    +		*keyptr = NULL;
    +	if (value)
    +		*value = NULL;
    +	if (valsize)
    +		*valsize = 0;
    +
    +	if (t->nentries == 0)
    +		return -1;
    +
    +	idx = entry_hash(t, key, keysize) % t->nbuckets;
    +	bucket = &t->buckets[idx];
    +	STAILQ_FOREACH(e, bucket, entry) {
    +		if (e->keysize == keysize && memcmp(e->key, key, keysize) == 0)
    +			break;
    +	}
    +	if (e == NULL)
    +		return -1;
    +
    +	if (keyptr)
    +		*keyptr = e->key;
    +	if (value)
    +		*value = e->value;
    +	if (valsize)
    +		*valsize = e->valsize;
    +
    +	STAILQ_REMOVE(bucket, e, hashentry, entry);
    +	free(e);
    +	t->nentries--;
    +
    +	return 0;
    +}
    +
    +int
    +hashtable_contains(struct hashtable *t, void *key, size_t keysize)
    +{
    +	struct hashentry *e = find_entry(t, key, keysize);
    +	return e ? 1 : 0;
    +}
    +
    +int
    +hashtable_foreach(struct hashtable *t,
    +    int (*cb)(void *, size_t, void *, size_t, void *),
    +    void *cb_arg)
    +{
    +	struct hashbucket *bucket;
    +	struct hashentry *e, *tmp;
    +	size_t i;
    +	int ret = 0;
    +
    +	t->flags |= HASHTABLE_F_TRAVERSAL;
    +	for (i = 0; i < t->nbuckets; i++) {
    +		bucket = &t->buckets[i];
    +		STAILQ_FOREACH_SAFE(e, bucket, entry, tmp) {
    +			ret = (*cb)(e->key, e->keysize, e->value, e->valsize,
    +			    cb_arg);
    +			if (ret)
    +				goto done;
    +		}
    +	}
    +done:
    +	t->flags &= ~HASHTABLE_F_TRAVERSAL;
    +	return ret;
    +}
    +
    +int
    +hashtable_num_entries(struct hashtable *t)
    +{
    +	return t->nentries;
    +}
    
    
  • Mark Jamsek:

    optimise reading the file index