include (untested) hash

2022-09-25 19:31:11 +02:00
parent bb5ddc1772
commit 17cabaddb7
6 changed files with 472 additions and 2 deletions
--- a/c/hash.c
+++ b/c/hash.c
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*  inspired from kernel's <fs/namei.h>
+ */
+#include "hash.h"
+
+/* Return the hash of a string of known length */
+unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
+{
+	unsigned long hash = init_name_hash(salt);
+	while (len--)
+		hash = partial_name_hash((unsigned char)*name++, hash);
+	return end_name_hash(hash);
+}
+
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+u64 hashlen_string(const void *salt, const char *name)
+{
+	unsigned long hash = init_name_hash(salt);
+	unsigned long len = 0, c;
+
+	c = (unsigned char)*name;
+	while (c) {
+		len++;
+		hash = partial_name_hash(c, hash);
+		c = (unsigned char)name[len];
+	}
+	return hashlen_create(end_name_hash(hash), len);
+}
--- a/c/include/bits.h
+++ b/c/include/bits.h
@@ -262,6 +262,90 @@ static __always_inline int popcount32(u32 n)
 #   endif
 }

+/*  rolXX are taken from kernel's <linux/bitops.h> are are:
+ * SPDX-License-Identifier: GPL-2.0
+ */
+
+/**
+ * rol64 - rotate a 64-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline u64 rol64(u64 word, unsigned int shift)
+{
+	return (word << (shift & 63)) | (word >> ((-shift) & 63));
+}
+
+/**
+ * ror64 - rotate a 64-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline u64 ror64(u64 word, unsigned int shift)
+{
+	return (word >> (shift & 63)) | (word << ((-shift) & 63));
+}
+
+/**
+ * rol32 - rotate a 32-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline u32 rol32(u32 word, unsigned int shift)
+{
+	return (word << (shift & 31)) | (word >> ((-shift) & 31));
+}
+
+/**
+ * ror32 - rotate a 32-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline u32 ror32(u32 word, unsigned int shift)
+{
+	return (word >> (shift & 31)) | (word << ((-shift) & 31));
+}
+
+/**
+ * rol16 - rotate a 16-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline u16 rol16(u16 word, unsigned int shift)
+{
+	return (word << (shift & 15)) | (word >> ((-shift) & 15));
+}
+
+/**
+ * ror16 - rotate a 16-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline u16 ror16(u16 word, unsigned int shift)
+{
+	return (word >> (shift & 15)) | (word << ((-shift) & 15));
+}
+
+/**
+ * rol8 - rotate an 8-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline u8 rol8(u8 word, unsigned int shift)
+{
+	return (word << (shift & 7)) | (word >> ((-shift) & 7));
+}
+
+/**
+ * ror8 - rotate an 8-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline u8 ror8(u8 word, unsigned int shift)
+{
+	return (word >> (shift & 7)) | (word << ((-shift) & 7));
+}
+


 /** bit_for_each - iterate over an u64/u32 bits
--- a/c/include/br.h
+++ b/c/include/br.h
@@ -0,0 +1,205 @@
+/* bits.h - bits functions.
+ *
+ * Copyright (C) 2021-2022 Bruno Raoult ("br")
+ * Licensed under the GNU General Public License v3.0 or later.
+ * Some rights reserved. See COPYING.
+ *
+ * You should have received a copy of the GNU General Public License along with this
+ * program. If not, see <https://www.gnu.org/licenses/gpl-3.0-standalone.html>.
+ *
+ * SPDX-License-Identifier: GPL-3.0-or-later <https://spdx.org/licenses/GPL-3.0-or-later.html>
+ *
+ * Some parts are taken from Linux's kernel <linux/kernel.h> and others, and are :
+ * SPDX-License-Identifier: GPL-2.0
+ *
+ * This header contains generic stuff.
+ */
+
+#ifndef _BR_H
+#define _BR_H
+
+/* generate a (maybe) unique id.
+ */
+#define ___PASTE(x, y)      x##y
+#define __PASTE(x, y)       ___PASTE(x, y)
+#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
+//__##prefix##__COUNTER__
+
+/* see https://lkml.org/lkml/2018/3/20/845 for explanation of this monster
+ */
+#define __is_constexpr(x) \
+	(sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8)))
+
+/*
+ * min()/max()/clamp() macros must accomplish three things:
+ *
+ * - avoid multiple evaluations of the arguments (so side-effects like
+ *   "x++" happen only once) when non-constant.
+ * - perform strict type-checking (to generate warnings instead of
+ *   nasty runtime surprises). See the "unnecessary" pointer comparison
+ *   in __typecheck().
+ * - retain result as a constant expressions when called with only
+ *   constant expressions (to avoid tripping VLA warnings in stack
+ *   allocation usage).
+ */
+#define __typecheck(x, y) \
+	(!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
+
+#define __no_side_effects(x, y) \
+		(__is_constexpr(x) && __is_constexpr(y))
+
+#define __safe_cmp(x, y) \
+		(__typecheck(x, y) && __no_side_effects(x, y))
+
+#define __cmp(x, y, op)	((x) op (y) ? (x) : (y))
+
+#define __cmp_once(x, y, unique_x, unique_y, op) ({	\
+		typeof(x) unique_x = (x);		\
+		typeof(y) unique_y = (y);		\
+		__cmp(unique_x, unique_y, op); })
+
+#define __careful_cmp(x, y, op) \
+	__builtin_choose_expr(__safe_cmp(x, y), \
+		__cmp(x, y, op), \
+		__cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op))
+
+#define __pure   __attribute__((__pure__))
+
+/**
+ * min - return minimum of two values of the same or compatible types
+ * @x: first value
+ * @y: second value
+ */
+#define min(x, y)	__careful_cmp(x, y, <)
+
+/**
+ * max - return maximum of two values of the same or compatible types
+ * @x: first value
+ * @y: second value
+ */
+#define max(x, y)	__careful_cmp(x, y, >)
+
+/**
+ * min3 - return minimum of three values
+ * @x: first value
+ * @y: second value
+ * @z: third value
+ */
+#define min3(x, y, z) min((typeof(x))min(x, y), z)
+
+/**
+ * max3 - return maximum of three values
+ * @x: first value
+ * @y: second value
+ * @z: third value
+ */
+#define max3(x, y, z) max((typeof(x))max(x, y), z)
+
+/**
+ * min_not_zero - return the minimum that is _not_ zero, unless both are zero
+ * @x: value1
+ * @y: value2
+ */
+#define min_not_zero(x, y) ({			\
+	typeof(x) __x = (x);			\
+	typeof(y) __y = (y);			\
+	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
+
+/**
+ * clamp - return a value clamped to a given range with strict typechecking
+ * @val: current value
+ * @lo: lowest allowable value
+ * @hi: highest allowable value
+ *
+ * This macro does strict typechecking of @lo/@hi to make sure they are of the
+ * same type as @val.  See the unnecessary pointer comparisons.
+ */
+#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
+
+/*
+ * ..and if you can't take the strict
+ * types, you can specify one yourself.
+ *
+ * Or not use min/max/clamp at all, of course.
+ */
+
+/**
+ * min_t - return minimum of two values, using the specified type
+ * @type: data type to use
+ * @x: first value
+ * @y: second value
+ */
+#define min_t(type, x, y)	__careful_cmp((type)(x), (type)(y), <)
+
+/**
+ * max_t - return maximum of two values, using the specified type
+ * @type: data type to use
+ * @x: first value
+ * @y: second value
+ */
+#define max_t(type, x, y)	__careful_cmp((type)(x), (type)(y), >)
+
+/**
+ * clamp_t - return a value clamped to a given range using a given type
+ * @type: the type of variable to use
+ * @val: current value
+ * @lo: minimum allowable value
+ * @hi: maximum allowable value
+ *
+ * This macro does no typechecking and uses temporary variables of type
+ * @type to make all the comparisons.
+ */
+#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi)
+
+/**
+ * clamp_val - return a value clamped to a given range using val's type
+ * @val: current value
+ * @lo: minimum allowable value
+ * @hi: maximum allowable value
+ *
+ * This macro does no typechecking and uses temporary variables of whatever
+ * type the input argument @val is.  This is useful when @val is an unsigned
+ * type and @lo and @hi are literals that will otherwise be assigned a signed
+ * integer type.
+ */
+#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)
+
+/**
+ * swap - swap values of @a and @b
+ * @a: first value
+ * @b: second value
+ */
+#define swap(a, b) \
+	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
+/**
+ * ARRAY_SIZE - get the number of elements in array @arr
+ * @arr: array to be sized
+ */
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+/**
+ * abs - return absolute value of an argument
+ * @x: the value.  If it is unsigned type, it is converted to signed type first.
+ *     char is treated as if it was signed (regardless of whether it really is)
+ *     but the macro's return type is preserved as char.
+ *
+ * Return: an absolute value of x.
+ */
+#define abs(x)	__abs_choose_expr(x, long long,				\
+		__abs_choose_expr(x, long,				\
+		__abs_choose_expr(x, int,				\
+		__abs_choose_expr(x, short,				\
+		__abs_choose_expr(x, char,				\
+		__builtin_choose_expr(					\
+			__builtin_types_compatible_p(typeof(x), char),	\
+			(char)({ signed char __x = (x); __x<0?-__x:__x; }), \
+			((void)0)))))))
+
+#define __abs_choose_expr(x, type, other) __builtin_choose_expr(	\
+	__builtin_types_compatible_p(typeof(x),   signed type) ||	\
+	__builtin_types_compatible_p(typeof(x), unsigned type),		\
+	({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)
+
+
+#endif  /* _BR_H */
--- a/c/include/bug.h
+++ b/c/include/bug.h
@@ -8,7 +8,8 @@
 #include "likely.h"
 #include "debug.h"

-/* inspired by Linux kernel's <asm/bug.h> */
+/* BUG functions inspired by Linux kernel's <asm/bug.h>
+ */

 #define panic() exit(0xff)

--- a/c/include/hash.h
+++ b/c/include/hash.h
@@ -2,7 +2,7 @@

 #ifndef _BR_HASH_H
 #define _BR_HASH_H
-/* adaptation of Linux kernel's <linux/hash.h>
+/* adaptation of Linux kernel's <linux/hash.h> and <linux/stringhash.h>
 */

 /* Fast hashing routine for ints,  longs and pointers.
@@ -11,6 +11,8 @@
 #include <asm/types.h>
 #include <asm/bitsperlong.h>
 #include "bits.h"
+#include "br.h"
+
 /*
 * The "GOLDEN_RATIO_PRIME" is used in ifs/btrfs/brtfs_inode.h and
 * fs/inode.c.  It's not actually prime any more (the previous primes
@@ -98,4 +100,74 @@ static inline u32 hash32_ptr(const void *ptr)
 	return (u32)val;
 }

+/*
+ * Routines for hashing strings of bytes to a 32-bit hash value.
+ *
+ * These hash functions are NOT GUARANTEED STABLE between kernel
+ * versions, architectures, or even repeated boots of the same kernel.
+ * (E.g. they may depend on boot-time hardware detection or be
+ * deliberately randomized.)
+ *
+ * They are also not intended to be secure against collisions caused by
+ * malicious inputs; much slower hash functions are required for that.
+ *
+ * They are optimized for pathname components, meaning short strings.
+ * Even if a majority of files have longer names, the dynamic profile of
+ * pathname components skews short due to short directory names.
+ * (E.g. /usr/lib/libsesquipedalianism.so.3.141.)
+ */
+
+/*
+ * Version 1: one byte at a time.  Example of use:
+ *
+ * unsigned long hash = init_name_hash;
+ * while (*p)
+ *	hash = partial_name_hash(tolower(*p++), hash);
+ * hash = end_name_hash(hash);
+ *
+ * Although this is designed for bytes, fs/hfsplus/unicode.c
+ * abuses it to hash 16-bit values.
+ */
+
+/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */
+#define init_name_hash(salt)		(unsigned long)(salt)
+
+/* partial hash update function. Assume roughly 4 bits per character */
+static inline unsigned long
+partial_name_hash(unsigned long c, unsigned long prevhash)
+{
+	return (prevhash + (c << 4) + (c >> 4)) * 11;
+}
+
+/*
+ * Finally: cut down the number of bits to a int value (and try to avoid
+ * losing bits).  This also has the property (wanted by the dcache)
+ * that the msbits make a good hash table index.
+ */
+static inline unsigned int end_name_hash(unsigned long hash)
+{
+	return hash_long(hash, 32);
+}
+
+/*
+ * Version 2: One word (32 or 64 bits) at a time.
+ * If CONFIG_DCACHE_WORD_ACCESS is defined (meaning <asm/word-at-a-time.h>
+ * exists, which describes major Linux platforms like x86 and ARM), then
+ * this computes a different hash function much faster.
+ *
+ * If not set, this falls back to a wrapper around the preceding.
+ */
+extern unsigned int __pure full_name_hash(const void *salt, const char *, unsigned int);
+
+/*
+ * A hash_len is a u64 with the hash of a string in the low
+ * half and the length in the high half.
+ */
+#define hashlen_hash(hashlen) ((u32)(hashlen))
+#define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
+#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash))
+
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+extern u64 __pure hashlen_string(const void *salt, const char *name);
+
 #endif /* _BR_HASH_H */
--- a/c/include/stringhash.h
+++ b/c/include/stringhash.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_STRINGHASH_H
+#define __LINUX_STRINGHASH_H
+
+#include <linux/compiler.h>	/* For __pure */
+#include <linux/types.h>	/* For u32, u64 */
+#include <linux/hash.h>
+
+/*
+ * Routines for hashing strings of bytes to a 32-bit hash value.
+ *
+ * These hash functions are NOT GUARANTEED STABLE between kernel
+ * versions, architectures, or even repeated boots of the same kernel.
+ * (E.g. they may depend on boot-time hardware detection or be
+ * deliberately randomized.)
+ *
+ * They are also not intended to be secure against collisions caused by
+ * malicious inputs; much slower hash functions are required for that.
+ *
+ * They are optimized for pathname components, meaning short strings.
+ * Even if a majority of files have longer names, the dynamic profile of
+ * pathname components skews short due to short directory names.
+ * (E.g. /usr/lib/libsesquipedalianism.so.3.141.)
+ */
+
+/*
+ * Version 1: one byte at a time.  Example of use:
+ *
+ * unsigned long hash = init_name_hash;
+ * while (*p)
+ *	hash = partial_name_hash(tolower(*p++), hash);
+ * hash = end_name_hash(hash);
+ *
+ * Although this is designed for bytes, fs/hfsplus/unicode.c
+ * abuses it to hash 16-bit values.
+ */
+
+/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */
+#define init_name_hash(salt)		(unsigned long)(salt)
+
+/* partial hash update function. Assume roughly 4 bits per character */
+static inline unsigned long
+partial_name_hash(unsigned long c, unsigned long prevhash)
+{
+	return (prevhash + (c << 4) + (c >> 4)) * 11;
+}
+
+/*
+ * Finally: cut down the number of bits to a int value (and try to avoid
+ * losing bits).  This also has the property (wanted by the dcache)
+ * that the msbits make a good hash table index.
+ */
+static inline unsigned int end_name_hash(unsigned long hash)
+{
+	return hash_long(hash, 32);
+}
+
+/*
+ * Version 2: One word (32 or 64 bits) at a time.
+ * If CONFIG_DCACHE_WORD_ACCESS is defined (meaning <asm/word-at-a-time.h>
+ * exists, which describes major Linux platforms like x86 and ARM), then
+ * this computes a different hash function much faster.
+ *
+ * If not set, this falls back to a wrapper around the preceding.
+ */
+extern unsigned int __pure full_name_hash(const void *salt, const char *, unsigned int);
+
+/*
+ * A hash_len is a u64 with the hash of a string in the low
+ * half and the length in the high half.
+ */
+#define hashlen_hash(hashlen) ((u32)(hashlen))
+#define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
+#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash))
+
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+extern u64 __pure hashlen_string(const void *salt, const char *name);
+
+#endif	/* __LINUX_STRINGHASH_H */