From 2c9512402037aceeb7ccfbf41a6addb3995307c0 Mon Sep 17 00:00:00 2001
From: Bruno Raoult <braoult@gmail.com>
Date: Sat, 6 Jan 2024 18:23:29 +0100
Subject: [PATCH] bitops: all macros (hmmm, mistake ?), more tests

---
 .dir-locals.el.wrong                  |   3 -
 .gitignore                            |   1 +
 include/bitops-emulated/generic-clz.h |  45 ++
 include/bitops-emulated/generic-ctz.h | 130 ++++++
 include/bitops.h                      | 575 ++++++++------------------
 test/bitops-test.c                    | 162 +++++++-
 6 files changed, 504 insertions(+), 412 deletions(-)
 delete mode 100644 .dir-locals.el.wrong
 create mode 100644 include/bitops-emulated/generic-clz.h
 create mode 100644 include/bitops-emulated/generic-ctz.h

diff --git a/.dir-locals.el.wrong b/.dir-locals.el.wrong
deleted file mode 100644
index 63c1c1b..0000000
--- a/.dir-locals.el.wrong
+++ /dev/null
@@ -1,3 +0,0 @@
-((nil .
-      ((projectile-project-root-functions . 'projectile-root-local)
-       )))
diff --git a/.gitignore b/.gitignore
index 17229b8..81cb317 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ core
 /.ccls-cache/
 /test/test/
 /test/cutest/
+/tmp/
 # created when building
 /bin/
 /obj/
diff --git a/include/bitops-emulated/generic-clz.h b/include/bitops-emulated/generic-clz.h
new file mode 100644
index 0000000..1b6f033
--- /dev/null
+++ b/include/bitops-emulated/generic-clz.h
@@ -0,0 +1,45 @@
+/* generic-clz.h - generic clz implementations.
+ *
+ * Copyright (C) 2024 Bruno Raoult ("br")
+ * Licensed under the GNU General Public License v3.0 or later.
+ * Some rights reserved. See COPYING.
+ *
+ * You should have received a copy of the GNU General Public License along with this
+ * program. If not, see <https://www.gnu.org/licenses/gpl-3.0-standalone.html>.
+ *
+ * SPDX-License-Identifier: GPL-3.0-or-later <https://spdx.org/licenses/GPL-3.0-or-later.html>
+ *
+ */
+#ifndef _GENERIC_CLZ_H_
+#define _GENERIC_CLZ_H_
+
+#include "br.h"
+
+/* Binary search of the MSB; 31 is returned for n == 0 (as for n == 1).
+ * Adapted from: http://www-graphics.stanford.edu/%7Eseander/bithacks.html */
+static __always_inline int __clz32_emulated(u32 n)
+{
+    uint msb = 0;
+    for (uint step = 16; step > 1; step >>= 1) {
+        uint shift = (n >> step) ? step : 0;
+        n >>= shift;
+        msb |= shift;
+    }
+    msb |= n >> 1;
+    return 31 - msb;
+}
+
+/* 64 bits flavour of the MSB binary search; 63 is returned for n == 0. */
+static __always_inline int __clz64_emulated(u64 n)
+{
+    uint msb = 0;
+    for (uint step = 32; step > 1; step >>= 1) {
+        uint shift = (n >> step) ? step : 0;
+        n >>= shift;
+        msb |= shift;
+    }
+    msb |= n >> 1;
+    return 63 - msb;
+}
+
+#endif  /* _GENERIC_CLZ_H_ */
diff --git a/include/bitops-emulated/generic-ctz.h b/include/bitops-emulated/generic-ctz.h
new file mode 100644
index 0000000..4045e35
--- /dev/null
+++ b/include/bitops-emulated/generic-ctz.h
@@ -0,0 +1,130 @@
+/* generic-ctz.h - generic ctz implementations.
+ *
+ * Copyright (C) 2024 Bruno Raoult ("br")
+ * Licensed under the GNU General Public License v3.0 or later.
+ * Some rights reserved. See COPYING.
+ *
+ * You should have received a copy of the GNU General Public License along with this
+ * program. If not, see <https://www.gnu.org/licenses/gpl-3.0-standalone.html>.
+ *
+ * SPDX-License-Identifier: GPL-3.0-or-later <https://spdx.org/licenses/GPL-3.0-or-later.html>
+ *
+ */
+#ifndef _GENERIC_CTZ_H_
+#define _GENERIC_CTZ_H_
+
+#include "br.h"
+
+/**
+ * __ctz32_emulated() - count trailing zeros, portable implementation.
+ * @n: unsigned 32 bits integer.
+ *
+ * Adapted from: http://www-graphics.stanford.edu/%7Eseander/bithacks.html
+ *
+ * Binary search of the lowest set bit: the window is halved at each step
+ * (16, 8, 4, 2, 1), and whenever the low window of @n is empty it is
+ * shifted out and its width added to the result.
+ *
+ * Return: number of trailing zeros in @n, or 31 if @n is 0.
+ */
+static __always_inline int __ctz32_emulated(u32 n)
+{
+    int zeros = 0;
+
+    for (int width = 16; width > 0; width >>= 1) {
+        u32 window = ((u32)1 << width) - 1;
+
+        if (n & window) {
+            continue;           /* lowest set bit is inside the window */
+        }
+        n >>= width;
+        zeros += width;
+    }
+    return zeros;
+}
+
+/* Macro flavour of __ctz32_emulated(): works on a private copy of @n,
+ * which is evaluated once and left unmodified. Yields 31 for n == 0. */
+#define __ctz32_emulated2(n) ({   \
+    u32 ___n = (n);               \
+    int ___r = 0;                 \
+    if (!(___n & 0xffff)) {       \
+        ___n >>= 16;              \
+        ___r += 16;               \
+    }                             \
+    if (!(___n & 0xff)) {         \
+        ___n >>= 8;               \
+        ___r += 8;                \
+    }                             \
+    if (!(___n & 0xf)) {          \
+        ___n >>= 4;               \
+        ___r += 4;                \
+    }                             \
+    if (!(___n & 3)) {            \
+        ___n >>= 2;               \
+        ___r += 2;                \
+    }                             \
+    ___r += !(___n & 1);          \
+    ___r;                         \
+})
+
+/**
+ * __ctz64_emulated() - count trailing zeros, portable implementation.
+ * @n: unsigned 64 bits integer.
+ *
+ * Same binary search as the 32 bits version, starting with a 32 bits
+ * window: the window is halved at each step (32, 16, 8, 4, 2, 1), and
+ * whenever the low window of @n is empty it is shifted out and its
+ * width added to the result.
+ *
+ * Return: number of trailing zeros in @n, or 63 if @n is 0.
+ */
+static __always_inline int __ctz64_emulated(u64 n)
+{
+    int zeros = 0;
+
+    for (int width = 32; width > 0; width >>= 1) {
+        u64 window = ((u64)1 << width) - 1;
+
+        if (n & window) {
+            /* lowest set bit is inside the window: keep searching it */
+            continue;
+        }
+        /* low window is empty: drop it and account for its width */
+        n >>= width;
+        zeros += width;
+    }
+
+    return zeros;
+}
+
+/* Macro flavour of __ctz64_emulated(): works on a private copy of @n,
+ * which is evaluated once and left unmodified. Yields 63 for n == 0. */
+#define __ctz64_emulated2(n) ({   \
+    u64 ___n = (n);               \
+    int ___r = 0;                 \
+    if (!(___n & 0xffffffff)) {   \
+        ___n >>= 32;              \
+        ___r += 32;               \
+    }                             \
+    if (!(___n & 0xffff)) {       \
+        ___n >>= 16;              \
+        ___r += 16;               \
+    }                             \
+    if (!(___n & 0xff)) {         \
+        ___n >>= 8;               \
+        ___r += 8;                \
+    }                             \
+    if (!(___n & 0xf)) {          \
+        ___n >>= 4;               \
+        ___r += 4;                \
+    }                             \
+    if (!(___n & 3)) {            \
+        ___n >>= 2;               \
+        ___r += 2;                \
+    }                             \
+    ___r += !(___n & 1);          \
+    ___r;                         \
+})
+
+#endif  /* _GENERIC_CTZ_H_ */
diff --git a/include/bitops.h b/include/bitops.h
index 0f8cf53..e3ffbfc 100644
--- a/include/bitops.h
+++ b/include/bitops.h
@@ -14,6 +14,24 @@
 #define _BITS_H
 
 #include "br.h"
+#include "bitops-emulated/generic-ctz.h"
+#include "bitops-emulated/generic-clz.h"
+
+/* determine which native builtins are available: each 32 bits builtin found
+ * sets a HAS_xxx flag, which gates both the 32 and 64 bits native macros. */
+#if __has_builtin(__builtin_popcount)
+#   define HAS_POPCOUNT
+#endif
+#if __has_builtin(__builtin_ctz)
+#   define HAS_CTZ
+#endif
+#if __has_builtin(__builtin_clz)
+#   define HAS_CLZ
+#endif
+#if __has_builtin(__builtin_ffs)
+#   define HAS_FFS
+#endif
+
 
 /**
  * print_bitops_impl() - print bitops implementation.
@@ -33,459 +51,210 @@ void print_bitops_impl(void);
  * #endif
  */
 
-/*  lsb, msb: least/most significant bit: 10101000
- *                                msb = 7 ^   ^ lsb = 3
+/**
+ * lsb, msb: least/most significant bit: 10101000
+ *                               msb = 7 ^   ^ lsb = 3
+ *
  */
 #define lsb64(x) (ctz64(x))
 #define lsb32(x) (ctz32(x))
 #define msb64(x) (63 ^ clz64(x))
 #define msb32(x) (31 ^ clz32(x))
 
-/* count set bits:  10101000 -> 3
- *                  ^ ^ ^
+/**
+ * popcount32, popcount64 - count set bits:  10101000 -> 3
+ * @num: unsigned 32 or 64 bits integer.
+ *
  */
-#if __has_builtin(__builtin_popcountll)
-#define ___popcount64_native(n)  __builtin_popcountll(n)
+#if defined(HAS_POPCOUNT)
+#   define __popcount32_native(n) __builtin_popcount(n)
+#   define __popcount64_native(n) __builtin_popcountll(n)
+
+#   define popcount64(n) __popcount64_native(n)
+#   define popcount32(n) __popcount32_native(n)
+
+/* see ctz section below: (n & -n) - 1 has one bit set per trailing zero */
+#   define __ctz32_popcount(n) (__builtin_popcount(((n) & -(n)) - 1))
+#   define __ctz64_popcount(n) (__builtin_popcountll(((n) & -(n)) - 1))
+
+/* see ffs section below - 0 is special-cased, as 0 ^ ~-0 is all ones */
+#   define __ffs32_popcount(n) ((n) ? __builtin_popcount((n) ^ ~-(n)) : 0)
+#   define __ffs64_popcount(n) ((n) ? __builtin_popcountll((n) ^ ~-(n)) : 0)
+
 #endif
-#if __has_builtin(__builtin_popcount)
-#define ___popcount32_native(n)  __builtin_popcount(n)
-#endif
-#define ___popcount_emulated(n)  ({     \
+/* Brian Kernighan's algorithm - pretty efficient for likely sparse values.
+ * Works on a private copy, so @n is evaluated once and left unmodified. */
+#define __popcount_emulated(n)  ({      \
-    int ___count = 0;                   \
-    while (n) {                         \
-        ___count++;                     \
-        n &= (n - 1);                   \
-    }                                   \
-    ___count; })
+    __typeof__((n) + 0) ___n = (n);     \
+    int ___count = 0;                   \
+    for (; ___n; ___n &= ___n - 1) {    \
+        ___count++;                     \
+    }                                   \
+    ___count; })
-
-#ifdef ___popcount64_native
-#define ppcount64(n) ___popcount64_native(n)
-#else
-#define ppcount64(n) ___popcount_emulated(n)
+#if !defined(popcount32)
+#   define popcount32(n) __popcount_emulated(n)
+#endif
+#if !defined(popcount64)
+#   define popcount64(n) __popcount_emulated(n)
 #endif
 
-static __always_inline int popcount64(u64 n)
-{
-#   if __has_builtin(__builtin_popcountll)
-    return __builtin_popcountll(n);
-
-#   else
-    int count = 0;
-    while (n) {
-        count++;
-        n &= (n - 1);
-    }
-    return count;
-#   endif
-}
-
-static __always_inline int popcount32(u32 n)
-{
-#   if __has_builtin(__builtin_popcount)
-    return __builtin_popcount(n);
-
-#   else
-    int count = 0;
-    while (n) {
-        count++;
-        n &= (n - 1);
-    }
-    return count;
-#   endif
-}
-
-/* count trailing zeroes : 00101000 -> 3
- *                              ^^^
+/**
+ * ctz32, ctz64 - count trailing zeros: 00101000 -> 3
+ * @num: unsigned 32 or 64 bits integer.
+ *
+ * Not defined if no bit set, so check for non-zero before calling this.
+ * This is similar to FFS (Find First Set) minus 1; FFS, however, has FFS(0) = 0.
  */
-static __always_inline int ctz64(u64 n)
-{
-#   if __has_builtin(__builtin_ctzll)
-    return __builtin_ctzll(n);
+#if defined(HAS_CTZ)
+#   define __ctz32_native(n)  __builtin_ctz(n)
+#   define __ctz64_native(n)  __builtin_ctzll(n)
+#   define ctz32(n) __ctz32_native(n)
+#   define ctz64(n) __ctz64_native(n)
 
-#   elif __has_builtin(__builtin_clzll)
-    return __WORDSIZE - (__builtin_clzll(n & -n) + 1);
+/* see ffs section below - ctz(0) being undefined, 0 is special-cased */
+#   define __ffs32_ctz(n) ((n) ? __builtin_ctz(n) + 1 : 0)
+#   define __ffs64_ctz(n) ((n) ? __builtin_ctzll(n) + 1 : 0)
+#endif
+#if !defined(ctz32) && defined(__ctz32_popcount)
+#   define ctz32(n) __ctz32_popcount(n)
+#   define ctz64(n) __ctz64_popcount(n)
+#endif
+#if !defined(ctz32)
+#   define ctz32(n) __ctz32_emulated(n)
+#   define ctz64(n) __ctz64_emulated(n)
+#endif
 
-#   else
-    return popcount64((n & -n) - 1);
-#   endif
-}
-
-static __always_inline int ctz32(u32 n)
-{
-#   if __has_builtin(__builtin_ctz)
-    return __builtin_ctz(n);
-
-#   elif __has_builtin(__builtin_clz)
-    return __WORDSIZE - (__builtin_clz(n & -n) + 1);
-
-#   else
-    return popcount32((n & -n) - 1);
-#   endif
-}
-
-/* clz - count leading zeroes : 00101000 -> 2
- *                              ^^
+/**
+ * clz32, clz64 - count leading zeros: 00101000 -> 2
+ *
+ * @num: unsigned 32 or 64 bits integer.
+ *
+ * Not defined if no bit set, so check for non-zero before calling this.
  */
-static __always_inline int clz64(u64 n)
-{
-#   if __has_builtin(__builtin_clzll)
-    return __builtin_clzll(n);
+#if defined (HAS_CLZ)
+#   define __clz32_native(n)  __builtin_clz(n)
+#   define __clz64_native(n)  __builtin_clzll(n)
+#   define clz32(n) __clz32_native(n)
+#   define clz64(n) __clz64_native(n)
+#endif
+#if !defined(clz32)
+#   define clz32(n) __clz32_emulated(n)
+#   define clz64(n) __clz64_emulated(n)
+#endif
 
-#   else
-    u64 r, q;
-
-    r = (n > 0xFFFFFFFF) << 5; n >>= r;
-    q = (n > 0xFFFF)     << 4; n >>= q; r |= q;
-    q = (n > 0xFF  )     << 3; n >>= q; r |= q;
-    q = (n > 0xF   )     << 2; n >>= q; r |= q;
-    q = (n > 0x3   )     << 1; n >>= q; r |= q;
-    r |= (n >> 1);
-    return 64 - r - 1;
-#   endif
-}
-
-static __always_inline int clz32(u32 n)
-{
-#   if __has_builtin(__builtin_clz)
-    return __builtin_clz(n);
-
-#   else
-    u32 r, q;
-
-    r = (n > 0xFFFF)     << 4; n >>= r;
-    q = (n > 0xFF  )     << 3; n >>= q; r |= q;
-    q = (n > 0xF   )     << 2; n >>= q; r |= q;
-    q = (n > 0x3   )     << 1; n >>= q; r |= q;
-    r |= (n >> 1);
-    return 32 - r - 1;
-#   endif
-}
-
-/* fls - return one plus msb : 00101000 -> 6
- *                               ^
+/**
+ * ffs32, ffs64 - find first bit set, indexed from 1: 00101000 -> 4
+ * ffz32, ffz64 - find first bit unset, indexed from 1: 00101000 -> 1
+ * @num: unsigned 32 or 64 bits integer.
+ *
+ * ffs(n) is ctz(n) + 1 for non-zero n; the native and ctz-based versions
+ * return 0 for n == 0.
+ * ffz(n) is ffs(~n), so an all-ones n plays the role of n == 0.
  */
-static __always_inline int fls64(u64 n)
-{
-    if (!n)
-        return 0;
-    return 64 - clz64(n);
-}
+#if defined(HAS_FFS)
+#   define __ffs32_native(n)  __builtin_ffs(n)
+#   define __ffs64_native(n)  __builtin_ffsll(n)
+#   define ffs32(n) __ffs32_native(n)
+#   define ffs64(n) __ffs64_native(n)
+#endif
+#define __ffs32_emulated(n) ((n) ? popcount32((n) ^ ~-(n)) : 0)  /* ffs(0) = 0 */
+#define __ffs64_emulated(n) ((n) ? popcount64((n) ^ ~-(n)) : 0)  /* ffs(0) = 0 */
+#if !defined(ffs32) && defined(__ffs32_popcount)
+#   define ffs32(n) __ffs32_popcount(n)
+#   define ffs64(n) __ffs64_popcount(n)
+#endif
+#if !defined(ffs32) && defined(__ffs32_ctz)
+#   define ffs32(n) __ffs32_ctz(n)
+#   define ffs64(n) __ffs64_ctz(n)
+#endif
+#if !defined(ffs32)
+#   define ffs32(n) __ffs32_emulated(n)
+#   define ffs64(n) __ffs64_emulated(n)
+#endif
+#define ffz32(n)  ffs32(~(n))
+#define ffz64(n)  ffs64(~(n))
 
-static __always_inline int fls32(u32 n)
-{
-    if (!n)
-        return 0;
-    return 32 - clz32(n);
-}
-
-/* ffs - return one plus lsb index:  00101000 -> 4
- *                                       ^
+/**
+ * fls32, fls64 - return one plus MSB index: 00101000 -> 6
+ * @num: unsigned 32 or 64 bits integer.
+ *
+ * Similar to nbits(n) - clz(n), but returns 0 if n == 0;
  */
-static __always_inline uint ffs64(u64 n)
-{
-#   if __has_builtin(__builtin_ffsll)
-    return __builtin_ffsll(n);
-
-#   elif __has_builtin(__builtin_ctzll)
-    if (n == 0)
-        return (0);
-    return __builtin_ctzll(n) + 1;
-
-#   else
-    return popcount64(n ^ ~-n);
-#   endif
-}
-
-static __always_inline uint ffs32(u32 n)
-{
-#   if __has_builtin(__builtin_ffs)
-    return __builtin_ffs(n);
-
-#   elif __has_builtin(__builtin_ctz)
-    if (n == 0)
-        return (0);
-    return __builtin_ctz(n) + 1;
-
-#   else
-    return popcount32(n ^ ~-n);
-#   endif
-}
+#define fls32(n) ((n)? 32 - clz32(n): 0)
+#define fls64(n) ((n)? 64 - clz64(n): 0)
 
 /* rolXX/rorXX are taken from kernel's <linux/bitops.h> are are:
  * SPDX-License-Identifier: GPL-2.0
  */
-
 /**
- * rol64 - rotate a 64-bit value left
- * @word: value to rotate
- * @shift: bits to roll
+ * rol8, rol16, rol32, rol64 - rotate left
+ * @num: unsigned 8, 16, 32 or 64 bits integer
+ * @n: bits to roll
  */
-static inline u64 rol64(u64 word, unsigned int shift)
-{
-        return (word << (shift & 63)) | (word >> ((-shift) & 63));
-}
+#define rol8(num, n)  ((u8) (((u8) (num) << ((n) &  7)) | ((u8) (num) >> (-(n) &  7))))
+#define rol16(num, n) ((u16)(((u16)(num) << ((n) & 15)) | ((u16)(num) >> (-(n) & 15))))
+#define rol32(num, n) ((u32)(((u32)(num) << ((n) & 31)) | ((u32)(num) >> (-(n) & 31))))
+#define rol64(num, n) ((u64)(((u64)(num) << ((n) & 63)) | ((u64)(num) >> (-(n) & 63))))
 
 /**
- * ror64 - rotate a 64-bit value right
- * @word: value to rotate
- * @shift: bits to roll
+ * ror8, ror16, ror32, ror64 - rotate right
+ * @num: unsigned 8, 16, 32 or 64 bits integer
+ * @n: bits to roll
  */
-static inline u64 ror64(u64 word, unsigned int shift)
-{
-        return (word >> (shift & 63)) | (word << ((-shift) & 63));
-}
+#define ror8(num, n)  ((u8) (((u8) (num) >> ((n) &  7)) | ((u8) (num) << (-(n) &  7))))
+#define ror16(num, n) ((u16)(((u16)(num) >> ((n) & 15)) | ((u16)(num) << (-(n) & 15))))
+#define ror32(num, n) ((u32)(((u32)(num) >> ((n) & 31)) | ((u32)(num) << (-(n) & 31))))
+#define ror64(num, n) ((u64)(((u64)(num) >> ((n) & 63)) | ((u64)(num) << (-(n) & 63))))
 
 /**
- * rol32 - rotate a 32-bit value left
- * @word: value to rotate
- * @shift: bits to roll
+ * ilog2 - log base 2
+ * @n: unsigned 32 or 64 bits integer.
  */
-static inline u32 rol32(u32 word, unsigned int shift)
-{
-        return (word << (shift & 31)) | (word >> ((-shift) & 31));
-}
+#define ilog2_32(n) (fls32(n) - 1)
+#define ilog2_64(n) (fls64(n) - 1)
 
 /**
- * ror32 - rotate a 32-bit value right
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline u32 ror32(u32 word, unsigned int shift)
-{
-        return (word >> (shift & 31)) | (word << ((-shift) & 31));
-}
-
-/**
- * rol16 - rotate a 16-bit value left
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline u16 rol16(u16 word, unsigned int shift)
-{
-        return (word << (shift & 15)) | (word >> ((-shift) & 15));
-}
-
-/**
- * ror16 - rotate a 16-bit value right
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline u16 ror16(u16 word, unsigned int shift)
-{
-        return (word >> (shift & 15)) | (word << ((-shift) & 15));
-}
-
-/**
- * rol8 - rotate an 8-bit value left
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline u8 rol8(u8 word, unsigned int shift)
-{
-        return (word << (shift & 7)) | (word >> ((-shift) & 7));
-}
-
-/**
- * ror8 - rotate an 8-bit value right
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline u8 ror8(u8 word, unsigned int shift)
-{
-        return (word >> (shift & 7)) | (word << ((-shift) & 7));
-}
-
-/**
- * __ilog2 - non-constant log of base 2 calculators
- * - the arch may override these in asm/bitops.h if they can be implemented
- *   more efficiently than using fls() and fls64()
- * - the arch is not required to handle n==0 if implementing the fallback
- */
-static __always_inline __attribute__((const))
-int __ilog2_u64(u64 n)
-{
-        return fls64(n) - 1;
-}
-
-static __always_inline __attribute__((const))
-int __ilog2_u32(u32 n)
-{
-        return fls32(n) - 1;
-}
-
-/**
- * is_power_of_2() - check if a value is a power of two
+ * is_pow2() - check if number is a power of two
  * @n: the value to check
  *
- * Determine whether some value is a power of two, where zero is
- * *not* considered a power of two.
- * Return: true if @n is a power of 2, otherwise false.
+ * Zero is *not* considered a power of two.
  */
-static inline __attribute__((const))
-bool is_power_of_2(unsigned long n)
-{
-        return (n != 0 && ((n & (n - 1)) == 0));
-}
+#define is_pow2(n) ((n) != 0 && (((n) & ((n) - 1)) == 0))
 
 /**
- * __roundup_pow_of_two() - round up to nearest power of two
- * @n: value to round up
- */
-static inline __attribute__((const))
-u64 __roundup_pow_of_two(u64 n)
-{
-        return 1UL << fls64(n - 1);
-}
-
-/**
- * __rounddown_pow_of_two() - round down to nearest power of two
- * @n: value to round down
- */
-static inline __attribute__((const)) u64 __rounddown_pow_of_two(u64 n)
-{
-        return 1UL << (fls64(n) - 1);
-}
-
-/**
- * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value
- * @n: parameter
+ * bit_for_eachXX - iterate over an integer bits (0-indexed)
+ * bit_for_eachXX_ffs - iterate over an integer bits (1-indexed)
+ * @pos:  int used as current bit
+ * @tmp:  temp u64/u32 used as temporary storage
+ * @ul:   u32/u64 to loop over
  *
- * constant-capable log of base 2 calculation
- * - this can be used to initialise global variables from constant data, hence
- * the massive ternary operator construction
+ * Bits are 0-indexed with bit_for_eachXX, and 1-indexed with
+ * bit_for_eachXX_ffs.
  *
- * selects the appropriately-sized optimised version depending on sizeof(n)
- */
-#define ilog2(n)                        \
-(                                       \
-        __builtin_constant_p(n) ?       \
-        ((n) < 2 ? 0 :                  \
-         63 - __builtin_clzll(n)) :     \
-        (sizeof(n) <= 4) ?              \
-        __ilog2_u32(n) :                \
-        __ilog2_u64(n)                  \
- )
-
-/**
- * roundup_pow_of_two - round the given value up to nearest power of two
- * @n: parameter
- *
- * round the given value up to the nearest power of two
- * - the result is undefined when n == 0
- * - this can be used to initialise global variables from constant data
- */
-#define roundup_pow_of_two(n)                   \
-(                                               \
-        __builtin_constant_p(n) ? (             \
-                ((n) == 1) ? 1 :                \
-                (1UL << (ilog2((n) - 1) + 1))   \
-                                   ) :          \
-        __roundup_pow_of_two(n)                 \
- )
-
-/**
- * rounddown_pow_of_two - round the given value down to nearest power of two
- * @n: parameter
- *
- * round the given value down to the nearest power of two
- * - the result is undefined when n == 0
- * - this can be used to initialise global variables from constant data
- */
-#define rounddown_pow_of_two(n)                 \
-(                                               \
-        __builtin_constant_p(n) ? (             \
-                (1UL << ilog2(n))) :            \
-        __rounddown_pow_of_two(n)               \
- )
-
-static inline __attribute_const__ int __order_base_2(unsigned long n)
-{
-        return n > 1 ? ilog2(n - 1) + 1 : 0;
-}
-
-/**
- * order_base_2 - calculate the (rounded up) base 2 order of the argument
- * @n: parameter
- *
- * The first few values calculated by this routine:
- *  ob2(0) = 0
- *  ob2(1) = 0
- *  ob2(2) = 1
- *  ob2(3) = 2
- *  ob2(4) = 2
- *  ob2(5) = 3
- *  ... and so on.
- */
-#define order_base_2(n)                         \
-(                                               \
-        __builtin_constant_p(n) ? (             \
-            ((n) == 0 || (n) == 1) ?            \
-            0 :                                 \
-            ilog2((n) - 1) + 1) :               \
-        __order_base_2(n)                       \
-)
-
-static inline __attribute__((const)) int __bits_per(unsigned long n)
-{
-        if (n < 2)
-            return 1;
-        if (is_power_of_2(n))
-            return order_base_2(n) + 1;
-        return order_base_2(n);
-}
-
-/**
- * bits_per - calculate the number of bits required for the argument
- * @n: parameter
- *
- * This is constant-capable and can be used for compile time
- * initializations, e.g bitfields.
- *
- * The first few values calculated by this routine:
- * bf(0) = 1
- * bf(1) = 1
- * bf(2) = 2
- * bf(3) = 2
- * bf(4) = 3
- * ... and so on.
- */
-#define bits_per(n)                             \
-(                                               \
-        __builtin_constant_p(n) ? (             \
-            ((n) == 0 || (n) == 1) ?            \
-            1 :                                 \
-            ilog2(n) + 1 :                      \
-            __bits_per(n)                       \
-)
-
-/**
- * bit_for_each - iterate over an u64/u32 bits
- * @pos:        an int used as current bit
- * @tmp:        a temp u64/u32 used as temporary storage
- * @ul:         the u64/u32 to loop over
- *
- * Usage:
- * u64 u=139, _t;     //  u=b10001011
+ * Example:
+ * u64 u=139, _t;           // u=b10001011
  * int cur;
  * bit_for_each64(cur, _t, u) {
  *     printf("%d\n", cur);
  * }
- * This will display the position of each bit set in ul: 1, 2, 4, 8
+ * This will display the position of each bit set in ul: 0, 1, 3, 7
  *
- * I should probably re-think the implementation...
  */
-#define bit_for_each64(pos, tmp, ul)                                  \
-    for (tmp = ul, pos = ctz64(tmp); tmp; tmp ^= 1UL << pos, pos = ctz64(tmp))
-
-#define bit_for_each32(pos, tmp, ul)                                  \
-    for (tmp = ul, pos = ctz32(tmp); tmp; tmp ^= 1U << pos, pos = ctz32(tmp))
-
-/** or would it be more useful (counting bits from zero instead of 1) ?
- */
-#define bit_for_each64_1(pos, tmp, ul)                                    \
-    for (tmp = ul, pos = ffs64(tmp); tmp; tmp &= (tmp - 1),  pos = ffs64(tmp))
-
-#define bit_for_each32_1(pos, tmp, ul)                                    \
-    for (tmp = ul, pos = ffs32(tmp); tmp; tmp &= (tmp - 1),  pos = ffs32(tmp))
+/* ctz() is only evaluated on a non-zero @tmp (its result is undefined for */
+/* zero), and the lowest set bit is cleared without any type-sized shift.  */
+#define bit_for_each32(pos, tmp, ul)                 \
+    for (tmp = (ul); tmp && ((pos = ctz32(tmp)), 1); tmp &= tmp - 1)
+/* 64 bits version of the above */
+#define bit_for_each64(pos, tmp, ul)                 \
+    for (tmp = (ul); tmp && ((pos = ctz64(tmp)), 1); tmp &= tmp - 1)
+/* 1-indexed versions (positions as returned by ffs) */
+#define bit_for_each64_ffs(pos, tmp, ul)             \
+    for (tmp = (ul), pos = ffs64(tmp);               \
+         tmp;                                        \
+         tmp &= (tmp - 1),  pos = ffs64(tmp))
+#define bit_for_each32_ffs(pos, tmp, ul)             \
+    for (tmp = (ul), pos = ffs32(tmp);               \
+         tmp;                                        \
+         tmp &= (tmp - 1),  pos = ffs32(tmp))
 
 #endif  /* _BITS_H */
diff --git a/test/bitops-test.c b/test/bitops-test.c
index 5603738..c83660f 100644
--- a/test/bitops-test.c
+++ b/test/bitops-test.c
@@ -24,23 +24,173 @@ static void test_popcount()
     u64 t64[] = { 0x0ll, 0x8880000000000101LL, 0xffffffffffffffffll };
 
     for (uint i = 0; i < ARRAY_SIZE(t32); ++i) {
-        printf("popcount 32 (%#x): ", t32[i]);
+        printf("popcount32 (%#x): ", t32[i]);
 #       ifdef ___popcount32_native
-        printf("native:%d ", ___popcount32_native(t32[i]));
+        printf("native:%d ", __popcount32_native(t32[i]));
+#       else
+        printf("native:XXX ");
 #       endif
-        printf("emulated:%d\n", ___popcount_emulated(t32[i]));
+        printf("emulated:%d ", __popcount_emulated(t32[i]));
+        printf("\n");
     }
+        printf("\n");
     for (uint i = 0; i < ARRAY_SIZE(t64); ++i) {
-        printf("popcount 64 (%#lx): ", t64[i]);
+        printf("popcount64 (%#lx): ", t64[i]);
 #       ifdef ___popcount64_native
-        printf("native:%d ", ___popcount64_native(t64[i]));
+        printf("native:%d ", __popcount64_native(t64[i]));
+#       else
+        printf("native:XXX ");
 #       endif
-        printf("emulated:%d\n", ___popcount_emulated(t64[i]));
+        printf("emulated:%d ", __popcount_emulated(t64[i]));
+        printf("\n");
     }
+    printf("\n");
+}
+
+static void test_ctz()
+{
+    u32 t32[] = {
+        0x88800101,
+        0xffffffff,
+        0x800,
+        0x80000000,
+        0x00800000
+    };
+    u64 t64[] = {
+        0x8880000000000101LL,
+        0xffffffffffffffffll,
+        0x800ll,
+        0x8000000000000000LL,
+        0x0080000000000000LL};
+
+    for (uint i = 0; i < ARRAY_SIZE(t32); ++i) {
+        printf("ctz32 (%#x): ", t32[i]);
+#       ifdef __ctz32_native
+        printf("native:%d ", __ctz32_native(t32[i]));
+#       else
+        printf("native:XXX ");
+#       endif
+        printf("emulated1:%d ", __ctz32_emulated(t32[i]));
+        printf("emulated2:%d ", __ctz32_emulated2(t32[i]));
+        /* emulated1 is the inline function, emulated2 the macro version */
+        /* of the same algorithm: both should print the same value.      */
+        printf("\n");
+    }
+    printf("\n");
+    for (uint i = 0; i < ARRAY_SIZE(t64); ++i) {
+        printf("ctz64 (%#lx): ", t64[i]);
+#       ifdef __ctz64_native
+        printf("native:%d ", __ctz64_native(t64[i]));
+#       else
+        printf("native:XXX ");
+#       endif
+        printf("emulated1:%d ", __ctz64_emulated(t64[i]));
+        printf("emulated2:%d ", __ctz64_emulated2(t64[i]));
+        /* emulated1 is the inline function, emulated2 the macro version */
+        /* of the same algorithm: both should print the same value.      */
+        printf("\n");
+    }
+    printf("\n");
+}
+
+static void test_clz()
+{
+    u32 t32[] = {
+        0x88800101,
+        0xffffffff,
+        0x800,
+        0x80000000,
+        0x00800000
+    };
+    u64 t64[] = {
+        0x8880000000000101LL,
+        0xffffffffffffffffll,
+        0x800ll,
+        0x8000000000000000LL,
+        0x0080000000000000LL};
+
+    for (uint i = 0; i < ARRAY_SIZE(t32); ++i) {
+        printf("clz32 (%#x): ", t32[i]);
+#       ifdef __clz32_native
+        printf("native:%d ", __clz32_native(t32[i]));
+#       else
+        printf("native:XXX ");
+#       endif
+        printf("emulated1:%d ", __clz32_emulated(t32[i]));
+        /* a single emulated clz implementation is available, printed
+         * as "emulated1"; native shows XXX when __clz32_native is not
+         * defined. */
+        printf("\n");
+    }
+    printf("\n");
+    for (uint i = 0; i < ARRAY_SIZE(t64); ++i) {
+        printf("clz64 (%#lx): ", t64[i]);
+#       ifdef __clz64_native
+        printf("native:%d ", __clz64_native(t64[i]));
+#       else
+        printf("native:XXX ");
+#       endif
+        printf("emulated1:%d ", __clz64_emulated(t64[i]));
+        /* a single emulated clz implementation is available, printed
+         * as "emulated1"; native shows XXX when __clz64_native is not
+         * defined. */
+        printf("\n");
+    }
+    printf("\n");
+}
+
+static void test_ffs()
+{
+    u32 t32[] = {
+        0x88800101,
+        0xffffffff,
+        0x800,
+        0x80000000,
+        0x00800000
+    };
+    u64 t64[] = {
+        0x8880000000000101LL,
+        0xffffffffffffffffll,
+        0x800ll,
+        0x8000000000000000LL,
+        0x0080000000000000LL};
+
+    for (uint i = 0; i < ARRAY_SIZE(t32); ++i) {
+        printf("ffs32 (%#x): ", t32[i]);
+#       ifdef __ffs32_native
+        printf("native:%d ", __ffs32_native(t32[i]));
+#       else
+        printf("native:XXX ");
+#       endif
+        printf("popcount:%d ", __ffs32_popcount(t32[i]));
+        printf("ctz:%d ", __ffs32_ctz(t32[i]));
+        printf("emulated:%d ", __ffs32_emulated(t32[i]));
+        /* NOTE: popcount/ctz variants need HAS_POPCOUNT and HAS_CTZ */
+        printf("\n");
+    }
+    printf("\n");
+    for (uint i = 0; i < ARRAY_SIZE(t64); ++i) {
+        printf("ffs64 (%#lx): ", t64[i]);
+#       ifdef __ffs64_native
+        printf("native:%d ", __ffs64_native(t64[i]));
+#       else
+        printf("native:XXX ");
+#       endif
+        printf("popcount:%d ", __ffs64_popcount(t64[i]));
+        printf("ctz:%d ", __ffs64_ctz(t64[i]));
+        printf("emulated:%d ", __ffs64_emulated(t64[i]));
+        /* NOTE: popcount/ctz variants need HAS_POPCOUNT and HAS_CTZ */
+        printf("\n");
+    }
+    printf("\n");
+    printf("\n");
+}
 
 int main()
 {
     test_popcount();
+    test_ctz();
+    test_clz();
+    test_ffs();
     exit(0);
 }