Re: [PATCH 1/2] tls: remove close callback sock unlock/lock and flush_sync

From: John Fastabend <john.fastabend@gmail.com>
Date: 2019-06-28 14:12:20
Also in: bpf

Jakub Kicinski wrote:

On Thu, 27 Jun 2019 10:36:42 -0700, John Fastabend wrote:

quoted

The tls close() callback currently drops the sock lock, makes a
cancel_delayed_work_sync() call, and then relocks the sock. This
seems suspect at best. The lock_sock() is applied to stop concurrent
operations on the socket while tearing the sock down. Further we
will need to add support for unhash() shortly and this complicates
matters because the lock may or may not be held then.

So to fix the above situation and simplify the next patch to add
unhash this patch creates a function tls_sk_proto_cleanup() that
tears down the socket without calling lock_sock/release_sock. In
order to flush the workqueue then we do the following,

  - Add a new bit to ctx, BIT_TX_CLOSING that is set when the
    tls resources are being removed.
  - Check this bit before scheduling any new work. This way we
    avoid queueing new work after tear down has started.
  - With the BIT_TX_CLOSING ensuring no new work is being added
    convert the cancel_delayed_work_sync to flush_delayed_work()
  - Finally call tlx_tx_records() to complete any available records
    before,
  - releasing and removing tls ctx.

The above is implemented for the software case namely any of
the following configurations from build_protos,

   prot[TLS_SW][TLS_BASE]
   prot[TLS_BASE][TLS_SW]
   prot[TLS_SW][TLS_SW]

The implication is a follow up patch is needed to resolve the
hardware offload case.

Tested with net selftests and bpf selftests.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/net/tls.h  |    4 ++--
 net/tls/tls_main.c |   54 ++++++++++++++++++++++++++--------------------------
 net/tls/tls_sw.c   |   50 ++++++++++++++++++++++++++++++++----------------
 3 files changed, 62 insertions(+), 46 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 4a55ce6a303f..6fe1f5c96f4a 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h

@@ -105,9 +105,7 @@ struct tls_device {
 enum {
 	TLS_BASE,
 	TLS_SW,
-#ifdef CONFIG_TLS_DEVICE
 	TLS_HW,
-#endif
 	TLS_HW_RECORD,
 	TLS_NUM_CONFIG,
 };

@@ -160,6 +158,7 @@ struct tls_sw_context_tx {
 	int async_capable;
 
 #define BIT_TX_SCHEDULED	0

BTW do you understand why we track this bit separately?  Just to avoid
the irq operations in the workqueue code?

Sorry not sure I understand. You mean vs simply scheduling the work
without checking the bit? Presumably its better to avoid scheduling
unnecessary work.

quoted

+#define BIT_TX_CLOSING		1

But since we do have the above, and I think it's tested everywhere,
wouldn't setting SCHEDULED without accentually scheduling have
effectively the same result?

It would block a send from calling tls_tx_records() but I guess that is
OK because this is a tear down operation and we are about to call
tls_tx_records anyways.

Sure we can do it this way might be slightly nicer to avoid checking
two bits.

quoted

 	unsigned long tx_bitmask;
 };

@@ -327,6 +326,7 @@ void tls_sw_close(struct sock *sk, long timeout);
 void tls_sw_free_resources_tx(struct sock *sk);
 void tls_sw_free_resources_rx(struct sock *sk);
 void tls_sw_release_resources_rx(struct sock *sk);
+void tls_sw_release_strp_rx(struct tls_context *tls_ctx);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		   int nonblock, int flags, int *addr_len);
 bool tls_sw_stream_read(const struct sock *sk);

diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index fc81ae18cc44..51cb19e24dd9 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c

@@ -261,24 +261,9 @@ static void tls_ctx_free(struct tls_context *ctx)
 	kfree(ctx);
 }
 
-static void tls_sk_proto_close(struct sock *sk, long timeout)
+static void tls_sk_proto_cleanup(struct sock *sk,
+				 struct tls_context *ctx, long timeo)
 {
-	struct tls_context *ctx = tls_get_ctx(sk);
-	long timeo = sock_sndtimeo(sk, 0);
-	void (*sk_proto_close)(struct sock *sk, long timeout);
-	bool free_ctx = false;
-
-	lock_sock(sk);
-	sk_proto_close = ctx->sk_proto_close;
-
-	if (ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD)
-		goto skip_tx_cleanup;
-
-	if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE) {
-		free_ctx = true;
-		goto skip_tx_cleanup;
-	}
-
 	if (!tls_complete_pending_work(sk, ctx, 0, &timeo))
 		tls_handle_open_record(sk, 0);

@@ -299,22 +284,37 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 #ifdef CONFIG_TLS_DEVICE
 	if (ctx->rx_conf == TLS_HW)
 		tls_device_offload_cleanup_rx(sk);
-
-	if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW) {
-#else
-	{
 #endif
-		tls_ctx_free(ctx);
-		ctx = NULL;
+}
+
+static void tls_sk_proto_close(struct sock *sk, long timeout)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	long timeo = sock_sndtimeo(sk, 0);
+	void (*sk_proto_close)(struct sock *sk, long timeout);
+	bool free_ctx = false;

Set but not used?

Removed in second patch but right should be removed here.

quoted

+
+	lock_sock(sk);
+	sk_proto_close = ctx->sk_proto_close;
+
+	if (ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD)
+		goto skip_tx_cleanup;
+
+	if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE) {
+		free_ctx = true;
+		goto skip_tx_cleanup;
 	}
 
+	tls_sk_proto_cleanup(sk, ctx, timeo);
+
 skip_tx_cleanup:
 	release_sock(sk);
+	if (ctx->rx_conf == TLS_SW)
+		tls_sw_release_strp_rx(ctx);
 	sk_proto_close(sk, timeout);
-	/* free ctx for TLS_HW_RECORD, used by tcp_set_state
-	 * for sk->sk_prot->unhash [tls_hw_unhash]
-	 */
-	if (free_ctx)
+
+	if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW &&
+	    ctx->tx_conf != TLS_HW_RECORD && ctx->rx_conf != TLS_HW_RECORD)
 		tls_ctx_free(ctx);
 }

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 455a782c7658..d234a6b818e6 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c

@@ -473,7 +473,8 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err)
 		return;
 
 	/* Schedule the transmission */
-	if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
+	if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask) &&
+	    !test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask))

Probably doesn't matter but seems like CLOSING test should be before
the test_and_set().

Yea, looks like we can drop CLOSING bit and use SCHEDULED bit makes
these a bit nicer.

quoted

 		schedule_delayed_work(&ctx->tx_work.work, 1);
 }

@@ -2058,16 +2059,26 @@ void tls_sw_free_resources_tx(struct sock *sk)
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	struct tls_rec *rec, *tmp;
 
+	/* Set TX CLOSING bit to stop tx_work from being scheduled
+	 * while tearing down TX context. We will flush any pending
+	 * work before free'ing ctx anyways. If already set then
+	 * another call is already free'ing resources.
+	 */

Oh, can we get multiple calls here?  Is this prep for unhash?

It was prep for unhash() but there is a nicer way to get this so
we can drop it and just ensure we reset the prot callbacks before.

quoted

+	if (test_and_set_bit(BIT_TX_CLOSING, &ctx->tx_bitmask))
+		return;
+
 	/* Wait for any pending async encryptions to complete */
 	smp_store_mb(ctx->async_notify, true);
 	if (atomic_read(&ctx->encrypt_pending))
 		crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
 
-	release_sock(sk);
-	cancel_delayed_work_sync(&ctx->tx_work.work);
-	lock_sock(sk);
-
-	/* Tx whatever records we can transmit and abandon the rest */
+	/* Flush work queue and then Tx whatever records we can
+	 * transmit and abandon the rest, lock_sock(sk) must be
+	 * held here. We ensure no further work is enqueue by
+	 * checking CLOSING bit before queueing new work and
+	 * setting it above.
+	 */
+	flush_delayed_work(&ctx->tx_work.work);
 	tls_tx_records(sk, -1);
 
 	/* Free up un-sent records in tx_list. First, free

@@ -2111,22 +2122,22 @@ void tls_sw_release_resources_rx(struct sock *sk)
 		write_lock_bh(&sk->sk_callback_lock);
 		sk->sk_data_ready = ctx->saved_data_ready;
 		write_unlock_bh(&sk->sk_callback_lock);
-		release_sock(sk);
-		strp_done(&ctx->strp);
-		lock_sock(sk);
 	}
 }
 
-void tls_sw_free_resources_rx(struct sock *sk)
+void tls_sw_release_strp_rx(struct tls_context *tls_ctx)
 {
-	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 
-	tls_sw_release_resources_rx(sk);
-
+	strp_done(&ctx->strp);
 	kfree(ctx);
 }
 
+void tls_sw_free_resources_rx(struct sock *sk)
+{
+	tls_sw_release_resources_rx(sk);
+}

I don't understand the RX side well enough, but perhaps a separate
patch would make sense here?

sure. Its actually its own fix I guess.

quoted

 /* The work handler to transmitt the encrypted records in tx_list */
 static void tx_work_handler(struct work_struct *work)
 {

@@ -2140,9 +2151,14 @@ static void tx_work_handler(struct work_struct *work)
 	if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
 		return;
 
-	lock_sock(sk);
+	/* If we are running from a socket close operation then the
+	 * lock is already held so we do not need to hold it.
+	 */
+	if (likely(!test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask)))
+		lock_sock(sk);

	CPU 0 (free)		CPU 1 (wq)
				test_bit()
	lock(sk)
	set_bit()
				lock(sk)
	flush_work()

No?

Yeah seems possible although never seen in my testing. So I'll
move the test_bit() inside the lock and do a ctx check to ensure
still have the reference.

  CPU 0 (free)           CPU 1 (wq)

  lock(sk)
                         lock(sk)
  set_bit()
  cancel_work()
  release
                         ctx = tls_get_ctx(sk)
                         unlikely(!ctx) <- we may have free'd 
                         test_bit()
                         ...
                         release()

or

  CPU 0 (free)           CPU 1 (wq)

                         lock(sk)
  lock(sk)
                         ctx = tls_get_ctx(sk)
                         unlikely(!ctx)
                         test_bit()
                         ...
                         release()
  set_bit()
  cancel_work()
  release

quoted

 	tls_tx_records(sk, -1);
-	release_sock(sk);
+	if (likely(!test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask)))
+		release_sock(sk);
 }
 
 void tls_sw_write_space(struct sock *sk, struct tls_context *ctx)

@@ -2152,8 +2168,8 @@ void tls_sw_write_space(struct sock *sk, struct tls_context *ctx)
 	/* Schedule the transmission if tx list is ready */
 	if (is_tx_ready(tx_ctx) && !sk->sk_write_pending) {
 		/* Schedule the transmission */
-		if (!test_and_set_bit(BIT_TX_SCHEDULED,
-				      &tx_ctx->tx_bitmask))
+		if (!test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask) &&
+		    !test_bit(BIT_TX_CLOSING, &tx_ctx->tx_bitmask))
 			schedule_delayed_work(&tx_ctx->tx_work.work, 0);
 	}
 }

Thanks,
John

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help