Re: [PATCH v16 4/5] random: introduce generic vDSO getrandom() implementation
From: Thomas Gleixner <hidden>
Date: 2024-06-05 21:03:03
Also in:
linux-crypto, linux-patches, lkml
Jason! On Tue, May 28 2024 at 14:19, Jason A. Donenfeld wrote:
quoted hunk ↗ jump to hunk
diff --git a/include/vdso/getrandom.h b/include/vdso/getrandom.h index e3ceb1976386..7dc93d5f72dc 100644 --- a/include/vdso/getrandom.h +++ b/include/vdso/getrandom.h@@ -6,11 +6,39 @@ #ifndef _VDSO_GETRANDOM_H #define _VDSO_GETRANDOM_H +#include <crypto/chacha.h>
Can you please split the required defines into a seperate header preferrably in include/vdso/ and include that from crypto/chacha.h The point is that VDSO is very intentionally not using anything outside include/uapi/ and include/vdso/ except for include/linux/compiler.h and include/linux/types.h. We've had too much trouble of random include chains which magically break the build dependent on architectures and configurations. VDSO is a userspace library after all.
+#include <vdso/types.h>
+
/**
* struct vgetrandom_state - State used by vDSO getrandom() and allocated by vgetrandom_alloc().
*
- * Currently empty, as the vDSO getrandom() function has not yet been implemented.
+ * @batch: One and a half ChaCha20 blocks of buffered RNG output.
+ *
+ * @key: Key to be used for generating next batch.
+ *
+ * @batch_key: Union of the prior two members, which is exactly two full
+ * ChaCha20 blocks in size, so that @batch and @key can be filled
+ * together.
+ *
+ * @generation: Snapshot of @rng_info->generation in the vDSO data page at
+ * the time @key was generated.
+ *
+ * @pos: Offset into @batch of the next available random byte.
+ *
+ * @in_use: Reentrancy guard for reusing a state within the same thread
+ * due to signal handlers.
*/
-struct vgetrandom_state { int placeholder; };
+struct vgetrandom_state {
+ union {
+ struct {
+ u8 batch[CHACHA_BLOCK_SIZE * 3 / 2];
+ u32 key[CHACHA_KEY_SIZE / sizeof(u32)];CHACHA_STATE_WORDS ?
+ }; + u8 batch_key[CHACHA_BLOCK_SIZE * 2];
Lot's of magic constants here *3/2 *2 ....
+ }; + vdso_kernel_ulong generation; + u8 pos;
What does the u8 buy here over a simple unsigned int?
quoted hunk ↗ jump to hunk
+ bool in_use; +}; #endif /* _VDSO_GETRANDOM_H */diff --git a/include/vdso/types.h b/include/vdso/types.h new file mode 100644 index 000000000000..ce131463aeff --- /dev/null +++ b/include/vdso/types.h@@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */
Why does this need an extra header when it's clearly getrandom specific? Please put this into getrandom.h
+/* + * Copyright (C) 2022 Jason A. Donenfeld [off-list ref]. All Rights Reserved. + */ +#ifndef __VDSO_TYPES_H +#define __VDSO_TYPES_H + +#include <linux/types.h> + +/** + * type vdso_kernel_ulong - unsigned long type that matches kernel's unsigned long + * + * Data shared between userspace and the kernel must operate the same way in both 64-bit code and in + * 32-bit compat code, over the same potentially 64-bit kernel. This type represents the size of an + * unsigned long as used by kernel code. This isn't necessarily the same as an unsigned long as used + * by userspace, however.
This is confusing at best. First of all 64-bit code can run only on a 64-bit kernel, so what does 'the same potentially 64-bit kernel' even mean in that sentence? What means: 'This type represents the size of an unsigned long as used by kernel code'?
+ * +-------------------+-------------------+------------------+-------------------+ + * | 32-bit userspace | 32-bit userspace | 64-bit userspace | 64-bit userspace | + * | unsigned long | vdso_kernel_ulong | unsigned long | vdso_kernel_ulong | + * +---------------+-------------------+-------------------+------------------+-------------------+ + * | 32-bit kernel | ✓ same size | ✓ same size | + * | unsigned long | | | + * +---------------+-------------------+-------------------+------------------+-------------------+ + * | 64-bit kernel | ✘ different size! | ✓ same size | ✓ same size | ✓ same size | + * | unsigned long | | | | | + * +---------------+-------------------+-------------------+------------------+-------------------+
I have no idea what this table tries to tell me, but I clearly can see what you are trying to achieve here:
+ */ +#ifdef CONFIG_64BIT +typedef u64 vdso_kernel_ulong; +#else +typedef u32 vdso_kernel_ulong; +#endif
All of this is pointless because if a 32-bit application runs on a 64-bit kernel it has to use the 64-bit 'generation'. So why on earth do we need magic here for a 32-bit kernel? Just use u64 for both and spare all this voodoo. We're seriously not "optimizing" for 32-bit kernels.
+/**
+ * __cvdso_getrandom_data - Generic vDSO implementation of getrandom() syscall.
+ * @rng_info: Describes state of kernel RNG, memory shared with kernel.
+ * @buffer: Destination buffer to fill with random bytes.
+ * @len: Size of @buffer in bytes.
+ * @flags: Zero or more GRND_* flags.
+ * @opaque_state: Pointer to an opaque state area.
+ *
+ * This implements a "fast key erasure" RNG using ChaCha20, in the same way that the kernel's
+ * getrandom() syscall does. It periodically reseeds its key from the kernel's RNG, at the same
+ * schedule that the kernel's RNG is reseeded. If the kernel's RNG is not ready, then this always
+ * calls into the syscall.
+ *
+ * @opaque_state *must* be allocated using the vgetrandom_alloc() syscall. Unless external locking
+ * is used, one state must be allocated per thread, as it is not safe to call this function
+ * concurrently with the same @opaque_state. However, it is safe to call this using the same
+ * @opaque_state that is shared between main code and signal handling code, within the same thread.
+ *
+ * Returns the number of random bytes written to @buffer, or a negative value indicating an error.
+ */
+static __always_inline ssize_t
+__cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_t len,
+ unsigned int flags, void *opaque_state)
+{
+ ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len);We really need to allow reading almost 2GB of random data in one go?
+ struct vgetrandom_state *state = opaque_state;
+ size_t batch_len, nblocks, orig_len = len;
+ unsigned long current_generation;
+ void *orig_buffer = buffer;
+ u32 counter[2] = { 0 };
+ bool in_use, have_retried = false;Please keep the reverse fir tree order.
+ /* The state must not straddle a page, since pages can be zeroed at any time. */ + if (unlikely(((unsigned long)opaque_state & ~PAGE_MASK) + sizeof(*state) > PAGE_SIZE)) + goto fallback_syscall; + + /* + * If the kernel's RNG is not yet ready, then it's not possible to provide random bytes from + * userspace, because A) the various @flags require this to block, or not, depending on + * various factors unavailable to userspace, and B) the kernel's behavior before the RNG is + * ready is to reseed from the entropy pool at every invocation. + */ + if (unlikely(!READ_ONCE(rng_info->is_ready))) + goto fallback_syscall; + + /* + * This condition is checked after @rng_info->is_ready, because before the kernel's RNG is + * initialized, the @flags parameter may require this to block or return an error, even when + * len is zero. + */ + if (unlikely(!len)) + return 0; + + /* + * @state->in_use is basic reentrancy protection against this running in a signal handler + * with the same @opaque_state, but obviously not atomic wrt multiple CPUs or more than one + * level of reentrancy. If a signal interrupts this after reading @state->in_use, but before + * writing @state->in_use, there is still no race, because the signal handler will run to + * its completion before returning execution.
Can you please add an explanation that the syscall does not touch the state and just fills the buffer?
+ */
+ in_use = READ_ONCE(state->in_use);
+ if (unlikely(in_use))
+ goto fallback_syscall;
+ WRITE_ONCE(state->in_use, true);
+
+retry_generation:
+ /*
+ * @rng_info->generation must always be read here, as it serializes @state->key with the
+ * kernel's RNG reseeding schedule.
+ */
+ current_generation = READ_ONCE(rng_info->generation);
+
+ /*
+ * If @state->generation doesn't match the kernel RNG's generation, then it means the
+ * kernel's RNG has reseeded, and so @state->key is reseeded as well.
+ */
+ if (unlikely(state->generation != current_generation)) {
+ /*
+ * Write the generation before filling the key, in case of fork. If there is a fork
+ * just after this line, the two forks will get different random bytes from thethe two forks? You mean the parent and the child, no?
+ * syscall, which is good. However, were this line to occur after the getrandom
+ * syscall, then both child and parent could have the same bytes and the same
+ * generation counter, so the fork would not be detected. Therefore, write
+ * @state->generation before the call to the getrandom syscall.
+ */
+ WRITE_ONCE(state->generation, current_generation);
+
+ /* Prevent the syscall from being reordered wrt current_generation. */
+ barrier();
+
+ /* Reseed @state->key using fresh bytes from the kernel. */
+ if (getrandom_syscall(state->key, sizeof(state->key), 0) != sizeof(state->key)) {
+ /*
+ * If the syscall failed to refresh the key, then @state->key is now
+ * invalid, so invalidate the generation so that it is not used again, and
+ * fallback to using the syscall entirely.
+ */
+ WRITE_ONCE(state->generation, 0);
+
+ /*
+ * Set @state->in_use to false only after the last write to @state in the
+ * line above.
+ */
+ WRITE_ONCE(state->in_use, false);So here you rely on the compiler not reordering vs. WRITE_ONCE(), i.e. volatile, but above you have a barrier() to prevent the write being reordered vs. the syscall, confused. But even when the compiler does not reorder, what prevents a weakly ordered CPU from doing so?
+ goto fallback_syscall;
+ }
+
+ /*
+ * Set @state->pos to beyond the end of the batch, so that the batch is refilled
+ * using the new key.
+ */
+ state->pos = sizeof(state->batch);
+ }
+
+ /* Set len to the total amount of bytes that this function is allowed to read, ret. */
+ len = ret;
+more_batch:
+ /*
+ * First use bytes out of @state->batch, which may have been filled by the last call to this
+ * function.
+ */
+ batch_len = min_t(size_t, sizeof(state->batch) - state->pos, len);
+ if (batch_len) {
+ /* Zeroing at the same time as memcpying helps preserve forward secrecy. */
+ memcpy_and_zero_src(buffer, state->batch + state->pos, batch_len);
+ state->pos += batch_len;
+ buffer += batch_len;
+ len -= batch_len;
+ }
+
+ if (!len) {
+ /* Prevent the loop from being reordered wrt ->generation. */
+ barrier();Same question as above.
+ /*
+ * Since @rng_info->generation will never be 0, re-read @state->generation, rather
+ * than using the local current_generation variable, to learn whether a fork
+ * occurred or if @state was zeroed due to memory pressure. Primarily, though, this
+ * indicates whether the kernel's RNG has reseeded, in which case generate a new key
+ * and start over.
+ */
+ if (unlikely(READ_ONCE(state->generation) != READ_ONCE(rng_info->generation))) {
+ /*
+ * Prevent this from looping forever in case of low memory or racing with a
+ * user force-reseeding the kernel's RNG using the ioctl.
+ */
+ if (have_retried) {
+ WRITE_ONCE(state->in_use, false);
+ goto fallback_syscall;
+ }
+
+ have_retried = true;
+ buffer = orig_buffer;
+ goto retry_generation;
+ }
+
+ /*
+ * Set @state->in_use to false only when there will be no more reads or writes of
+ * @state.
+ */
+ WRITE_ONCE(state->in_use, false);
+ return ret;
+ }
+
+ /* Generate blocks of RNG output directly into @buffer while there's enough room left. */
+ nblocks = len / CHACHA_BLOCK_SIZE;
+ if (nblocks) {
+ __arch_chacha20_blocks_nostack(buffer, state->key, counter, nblocks);
+ buffer += nblocks * CHACHA_BLOCK_SIZE;
+ len -= nblocks * CHACHA_BLOCK_SIZE;
+ }
+
+ BUILD_BUG_ON(sizeof(state->batch_key) % CHACHA_BLOCK_SIZE != 0);
+
+ /* Refill the batch and then overwrite the key, in order to preserve forward secrecy. */'and then overwrite'? Isn't this overwriting it implicitely because batch_key and key are at the same place in the union?
+ __arch_chacha20_blocks_nostack(state->batch_key, state->key, counter, + sizeof(state->batch_key) / CHACHA_BLOCK_SIZE); + + /* Since the batch was just refilled, set the position back to 0 to indicate a full batch. */ + state->pos = 0; + goto more_batch;
Thanks,
tglx