[Openvpn-devel,ovpn,net] ovpn: tcp - fix packet extraction from stream

Message ID 20260204141441.528655-1-ralf@mandelbit.com
State New
Headers show
Series [Openvpn-devel,ovpn,net] ovpn: tcp - fix packet extraction from stream | expand

Commit Message

Ralf Lici Feb. 4, 2026, 2:14 p.m. UTC
When processing TCP stream data in ovpn_tcp_recv, we receive large
cloned skbs from __strp_rcv that may contain multiple coalesced packets.
The current implementation has two bugs:

1. Header offset overflow: Using pskb_pull with large offsets on
   coalesced skbs causes skb->data - skb->head to exceed the u16 storage
   of skb->network_header. This causes skb_reset_network_header to fail
   on the inner decapsulated packet, resulting in packet drops.

2. Unaligned protocol headers: Extracting packets from arbitrary
   positions within the coalesced TCP stream provides no alignment
   guarantees for the packet data causing performance penalties on
   architectures without efficient unaligned access. Additionally,
   openvpn's 2-byte length prefix on TCP packets causes the subsequent
   4-byte opcode and packet ID fields to be inherently misaligned.

Fix both issues by allocating a new skb for each openvpn packet and
using skb_copy_bits to extract only the packet content into the new
buffer, skipping the 2-byte length prefix. Also, check the length before
invoking the function that performs the allocation to avoid creating an
invalid skb.

If the packet has to be forwarded to userspace the 2-byte prefix can be
pushed to the head safely, without misalignment.

As a side effect, this approach also avoids the expensive linearization
that pskb_pull triggers on cloned skbs with page fragments. In testing,
this resulted in TCP throughput improvements of up to 74%.

Fixes: 11851cbd60ea ("ovpn: implement TCP transport")
Signed-off-by: Ralf Lici <ralf@mandelbit.com>
---
 drivers/net/ovpn/tcp.c | 53 ++++++++++++++++++++++++++++--------------
 1 file changed, 36 insertions(+), 17 deletions(-)

Comments

Ralf Lici Feb. 4, 2026, 2:40 p.m. UTC | #1
On Wed, 2026-02-04 at 15:14 +0100, Ralf Lici wrote:
> When processing TCP stream data in ovpn_tcp_recv, we receive large
> cloned skbs from __strp_rcv that may contain multiple coalesced
> packets.
> The current implementation has two bugs:
> 
> 1. Header offset overflow: Using pskb_pull with large offsets on
>    coalesced skbs causes skb->data - skb->head to exceed the u16
> storage
>    of skb->network_header. This causes skb_reset_network_header to
> fail
>    on the inner decapsulated packet, resulting in packet drops.
> 
> 2. Unaligned protocol headers: Extracting packets from arbitrary
>    positions within the coalesced TCP stream provides no alignment
>    guarantees for the packet data causing performance penalties on
>    architectures without efficient unaligned access. Additionally,
>    openvpn's 2-byte length prefix on TCP packets causes the subsequent
>    4-byte opcode and packet ID fields to be inherently misaligned.
> 
> Fix both issues by allocating a new skb for each openvpn packet and
> using skb_copy_bits to extract only the packet content into the new
> buffer, skipping the 2-byte length prefix. Also, check the length
> before
> invoking the function that performs the allocation to avoid creating
> an
> invalid skb.
> 
> If the packet has to be forwarded to userspace the 2-byte prefix can
> be
> pushed to the head safely, without misalignment.
> 
> As a side effect, this approach also avoids the expensive
> linearization
> that pskb_pull triggers on cloned skbs with page fragments. In
> testing,
> this resulted in TCP throughput improvements of up to 74%.
> 
> Fixes: 11851cbd60ea ("ovpn: implement TCP transport")
> Signed-off-by: Ralf Lici <ralf@mandelbit.com>
> ---
>  drivers/net/ovpn/tcp.c | 53 ++++++++++++++++++++++++++++-------------
> -
>  1 file changed, 36 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c
> index b7348da9b040..ffe8db69d76d 100644
> --- a/drivers/net/ovpn/tcp.c
> +++ b/drivers/net/ovpn/tcp.c
> @@ -70,37 +70,56 @@ static void ovpn_tcp_to_userspace(struct ovpn_peer
> *peer, struct sock *sk,
>  	peer->tcp.sk_cb.sk_data_ready(sk);
>  }
>  
> -static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
> +static struct sk_buff *ovpn_tcp_skb_packet(const struct ovpn_peer
> *peer,
> +					   struct sk_buff *orig_skb,
> +					   const int pkt_len, const
> int pkt_off)
>  {
> -	struct ovpn_peer *peer = container_of(strp, struct ovpn_peer,
> tcp.strp);
> -	struct strp_msg *msg = strp_msg(skb);
> -	size_t pkt_len = msg->full_len - 2;
> -	size_t off = msg->offset + 2;
> -	u8 opcode;
> +	struct sk_buff *ovpn_skb;
> +	int err;
>  
> -	/* ensure skb->data points to the beginning of the openvpn
> packet */
> -	if (!pskb_pull(skb, off)) {
> -		net_warn_ratelimited("%s: packet too small for peer
> %u\n",
> -				     netdev_name(peer->ovpn->dev),
> peer->id);
> +	/* create a new skb with only the content of the current
> packet */
> +	ovpn_skb = netdev_alloc_skb(peer->ovpn->dev, pkt_len);
> +	if (unlikely(!ovpn_skb))
>  		goto err;
> -	}
>  
> -	/* strparser does not trim the skb for us, therefore we do it
> now */
> -	if (pskb_trim(skb, pkt_len) != 0) {
> -		net_warn_ratelimited("%s: trimming skb failed for
> peer %u\n",
> +	skb_copy_header(ovpn_skb, orig_skb);
> +	err = skb_copy_bits(orig_skb, pkt_off, skb_put(ovpn_skb,
> pkt_len),
> +			    pkt_len);
> +	if (unlikely(err)) {
> +		net_warn_ratelimited("%s: skb_copy_bits failed for
> peer %u\n",
>  				     netdev_name(peer->ovpn->dev),
> peer->id);
> +		kfree_skb(ovpn_skb);
>  		goto err;
>  	}
>  
> -	/* we need the first 4 bytes of data to be accessible
> +	consume_skb(orig_skb);
> +	return ovpn_skb;
> +err:
> +	kfree_skb(orig_skb);
> +	return NULL;
> +}
> +
> +static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
> +{
> +	struct ovpn_peer *peer = container_of(strp, struct ovpn_peer,
> tcp.strp);
> +	struct strp_msg *msg = strp_msg(skb);
> +	int pkt_len = msg->full_len - 2;
> +	u8 opcode;
> +
> +	/* we need at least 4 bytes of data in the packet
>  	 * to extract the opcode and the key ID later on
>  	 */
> -	if (!pskb_may_pull(skb, OVPN_OPCODE_SIZE)) {
> +	if (unlikely(pkt_len < OVPN_OPCODE_SIZE)) {
>  		net_warn_ratelimited("%s: packet too small to fetch
> opcode for peer %u\n",
>  				     netdev_name(peer->ovpn->dev),
> peer->id);
>  		goto err;
>  	}
>  
> +	/* extract the packet into a new skb */
> +	skb = ovpn_tcp_skb_packet(peer, skb, pkt_len, msg->offset +
> 2);
> +	if (unlikely(!skb))
> +		goto err;
> +
>  	/* DATA_V2 packets are handled in kernel, the rest goes to
> user space */
>  	opcode = ovpn_opcode_from_skb(skb, 0);
>  	if (unlikely(opcode != OVPN_DATA_V2)) {
> @@ -113,7 +132,7 @@ static void ovpn_tcp_rcv(struct strparser *strp,
> struct sk_buff *skb)
>  		/* The packet size header must be there when sending
> the packet
>  		 * to userspace, therefore we put it back
>  		 */
> -		skb_push(skb, 2);
> +		*((__force __be16 *)__skb_push(skb, 2)) =
> cpu_to_be16(pkt_len);
>  		ovpn_tcp_to_userspace(peer, strp->sk, skb);
>  		return;
>  	}

This issue manifests as the following warning (on
skb_reset_network_header):


[  126.764066] ------------[ cut here ]------------
[  126.764541] WARNING: CPU: 1 PID: 22 at ./include/linux/skbuff.h:3122
ovpn_decrypt_post+0x531/0x7b0 [ovpn]
[  126.765698] Modules linked in: ovpn ip6_udp_tunnel udp_tunnel chacha
libchacha chacha20poly1305 libpoly1305 veth [last unloaded:
ip6_udp_tunnel]
[  126.767085] CPU: 1 UID: 0 PID: 22 Comm: ksoftirqd/1 Not tainted
6.17.0+ #222 PREEMPT(none) 
[  126.768053] RIP: 0010:ovpn_decrypt_post+0x531/0x7b0 [ovpn]
[  126.768769] Code: cc cc cc cc 48 2b 93 c8 00 00 00 83 c2 28 0f 88 62
02 00 00 89 c8 29 f0 39 d0 0f 82 2e 02 00 00 b8 86 dd ff ff e9 68 fe ff
ff <0f> 0b e9 20 fc ff ff 0f 0b e9 35 fc ff ff 39 c1 0f 82 5b fc ff ff
[  126.770834] RSP: 0018:ffffc900000d77f0 EFLAGS: 00010216
[  126.771503] RAX: 000000000001034c RBX: ffff88810c7ac100 RCX:
0000000000000588
[  126.772386] RDX: ffff888116ee0000 RSI: 0000000000000000 RDI:
ffffffff8359f040
[  126.773251] RBP: ffffc900000d7820 R08: 00000000000005a0 R09:
ffff888116ef034c
[  126.774126] R10: 0000000000000000 R11: 0000000000000001 R12:
ffff888116bec800
[  126.774980] R13: ffff888108118000 R14: 0000000000000018 R15:
ffff888103a6cc00
[  126.775871] FS:  0000000000000000(0000) GS:ffff8882f446b000(0000)
knlGS:0000000000000000
[  126.776839] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  126.777605] CR2: 00007f5f9d2fc000 CR3: 000000011659b000 CR4:
0000000000750eb0
[  126.778434] PKRU: 55555554
[  126.778887] Call Trace:
[  126.779324]  <TASK>
[  126.779728]  ovpn_recv+0x15b/0x3a0 [ovpn]
[  126.780299]  ovpn_tcp_rcv+0xbd/0x320 [ovpn]
[  126.780886]  __strp_recv+0x172/0x640
[  126.781430]  strp_recv+0x27/0x30
[  126.781931]  __tcp_read_sock+0x92/0x260
[  126.782483]  ? __pfx_strp_recv+0x10/0x10
[  126.783055]  tcp_read_sock+0x1b/0x30
[  126.783586]  strp_read_sock+0xce/0xe0
[  126.784162]  strp_data_ready+0x6a/0xa0
[  126.784721]  ovpn_tcp_data_ready+0x90/0x220 [ovpn]
[  126.785441]  tcp_data_ready+0x30/0xd0
[  126.785992]  tcp_data_queue+0x8a4/0x1000
[  126.786560]  tcp_rcv_established+0x328/0xca0
[  126.787170]  tcp_v4_do_rcv+0x18f/0x3c0
[  126.787726]  tcp_v4_rcv+0xf51/0x1520
[  126.788264]  ip_protocol_deliver_rcu+0x4a/0x200
[  126.788898]  ? process_backlog+0x14b/0x630
[  126.789552]  ip_local_deliver_finish+0xd7/0x220
[  126.790193]  ip_local_deliver+0x63/0x250
[  126.790769]  ? lock_is_held_type+0x9c/0x110
[  126.791366]  ? process_backlog+0x14b/0x630
[  126.791995]  ip_rcv+0x26f/0x340
[  126.792516]  ? process_backlog+0x14b/0x630
[  126.793114]  ? process_backlog+0x14b/0x630
[  126.793714]  __netif_receive_skb_one_core+0x6b/0x80
[  126.794358]  __netif_receive_skb+0x16/0x60
[  126.794951]  process_backlog+0x18b/0x630
[  126.795515]  ? process_backlog+0x6a/0x630
[  126.796109]  __napi_poll.constprop.0+0x2e/0x1e0
[  126.796721]  net_rx_action+0x384/0x420
[  126.797252]  ? local_clock_noinstr+0x13/0xf0
[  126.797861]  handle_softirqs+0xc8/0x3f0
[  126.798401]  ? __pfx_smpboot_thread_fn+0x10/0x10
[  126.799024]  run_ksoftirqd+0x36/0x50
[  126.799548]  smpboot_thread_fn+0xfe/0x230
[  126.800110]  kthread+0x103/0x210
[  126.800602]  ? __pfx_kthread+0x10/0x10
[  126.801145]  ret_from_fork+0x18a/0x1e0
[  126.801705]  ? __pfx_kthread+0x10/0x10
[  126.802284]  ret_from_fork_asm+0x1a/0x30
[  126.802879]  </TASK>
[  126.803288] irq event stamp: 41024
[  126.803799] hardirqs last  enabled at (41032): [<ffffffff8137676c>]
__up_console_sem+0x5c/0x70
[  126.804799] hardirqs last disabled at (41039): [<ffffffff81376751>]
__up_console_sem+0x41/0x70
[  126.805812] softirqs last  enabled at (33936): [<ffffffff812e6486>]
run_ksoftirqd+0x36/0x50
[  126.806819] softirqs last disabled at (33941): [<ffffffff812e6486>]
run_ksoftirqd+0x36/0x50
[  126.807822] ---[ end trace 0000000000000000 ]---
Sabrina Dubroca Feb. 4, 2026, 7:46 p.m. UTC | #2
Sorry Ralf, December and January flew by and I didn't get a chance to
dive deeper into your previous RFC patch.

2026-02-04, 15:14:41 +0100, Ralf Lici wrote:
> When processing TCP stream data in ovpn_tcp_recv, we receive large
> cloned skbs from __strp_rcv that may contain multiple coalesced packets.
> The current implementation has two bugs:
> 
> 1. Header offset overflow: Using pskb_pull with large offsets on
>    coalesced skbs causes skb->data - skb->head to exceed the u16 storage
>    of skb->network_header. This causes skb_reset_network_header to fail
>    on the inner decapsulated packet, resulting in packet drops.
> 
> 2. Unaligned protocol headers: Extracting packets from arbitrary
>    positions within the coalesced TCP stream provides no alignment
>    guarantees for the packet data causing performance penalties on
>    architectures without efficient unaligned access. Additionally,
>    openvpn's 2-byte length prefix on TCP packets causes the subsequent
>    4-byte opcode and packet ID fields to be inherently misaligned.
> 
> Fix both issues by allocating a new skb for each openvpn packet and
> using skb_copy_bits to extract only the packet content into the new
> buffer, skipping the 2-byte length prefix. Also, check the length before
> invoking the function that performs the allocation to avoid creating an
> invalid skb.
> 
> If the packet has to be forwarded to userspace the 2-byte prefix can be
> pushed to the head safely, without misalignment.
> 
> As a side effect, this approach also avoids the expensive linearization
> that pskb_pull triggers on cloned skbs with page fragments. In testing,

We don't really avoid the linearization (because
netdev_alloc_skb+skb_copy_bits is pretty much doing that) but we're
only linearizing pkt_len instead of offset + pkt_len?

> this resulted in TCP throughput improvements of up to 74%.

Wow.

[...]
> +static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
> +{
> +	struct ovpn_peer *peer = container_of(strp, struct ovpn_peer, tcp.strp);
> +	struct strp_msg *msg = strp_msg(skb);
> +	int pkt_len = msg->full_len - 2;
> +	u8 opcode;
> +
> +	/* we need at least 4 bytes of data in the packet
>  	 * to extract the opcode and the key ID later on
>  	 */
> -	if (!pskb_may_pull(skb, OVPN_OPCODE_SIZE)) {
> +	if (unlikely(pkt_len < OVPN_OPCODE_SIZE)) {
>  		net_warn_ratelimited("%s: packet too small to fetch opcode for peer %u\n",
>  				     netdev_name(peer->ovpn->dev), peer->id);
>  		goto err;
>  	}
>  
> +	/* extract the packet into a new skb */
> +	skb = ovpn_tcp_skb_packet(peer, skb, pkt_len, msg->offset + 2);
> +	if (unlikely(!skb))
> +		goto err;


err:
	/* take reference for deferred peer deletion. should never fail */
	if (WARN_ON(!ovpn_peer_hold(peer)))
		goto err_nopeer;
	schedule_work(&peer->tcp.defer_del_work);
	dev_dstats_rx_dropped(peer->ovpn->dev);

I'm thinking that we may not need to abort the connection if we failed
to allocate the new skb or copy the message, since we have a message
boundary so we can still parse the following messages out of the
stream? But we won't be able to validate that boundary since we can't
try to decrypt the packet, so maybe it's better to still abort.
(if the "don't abort" idea is wanted, it can be done at a later time,
doesn't have to be part of this patch)

> +
>  	/* DATA_V2 packets are handled in kernel, the rest goes to user space */
>  	opcode = ovpn_opcode_from_skb(skb, 0);
>  	if (unlikely(opcode != OVPN_DATA_V2)) {
> @@ -113,7 +132,7 @@ static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
>  		/* The packet size header must be there when sending the packet
>  		 * to userspace, therefore we put it back
>  		 */
> -		skb_push(skb, 2);
> +		*((__force __be16 *)__skb_push(skb, 2)) = cpu_to_be16(pkt_len);

Why switch to __skb_push here? The skb_under_panic() would be nice in
the (unexpected) case we have an invalid skb.

Here we rely on the extra room that netdev_alloc_skb reserves
(NET_SKB_PAD) in case we have to re-push the packet length, right?
I'm a bit concerned about the silent assumptions that could lead to
writing to arbitrary addresses in memory.

>  		ovpn_tcp_to_userspace(peer, strp->sk, skb);
>  		return;
>  	}
> -- 
> 2.52.0
>
Antonio Quartulli Feb. 4, 2026, 10:42 p.m. UTC | #3
Hi Sabrina!

On 04/02/2026 20:46, Sabrina Dubroca wrote:
>> this resulted in TCP throughput improvements of up to 74%.
> 
> Wow.

I was also astonished by this result!

Actually Ralf tested various options (i.e. netdev_alloc_skb vs 
pskb_extract, on all skbs vs on large ones only) and this solution 
turned to be the one giving the highest boost.

In my tests yesterday, TCP jumped from ~4.3Gbps to ~7.3Gbps after 
applying this patch, which is the same throughput I am getting over UDP!


> 
> [...]
>> +static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
>> +{
>> +	struct ovpn_peer *peer = container_of(strp, struct ovpn_peer, tcp.strp);
>> +	struct strp_msg *msg = strp_msg(skb);
>> +	int pkt_len = msg->full_len - 2;
>> +	u8 opcode;
>> +
>> +	/* we need at least 4 bytes of data in the packet
>>   	 * to extract the opcode and the key ID later on
>>   	 */
>> -	if (!pskb_may_pull(skb, OVPN_OPCODE_SIZE)) {
>> +	if (unlikely(pkt_len < OVPN_OPCODE_SIZE)) {
>>   		net_warn_ratelimited("%s: packet too small to fetch opcode for peer %u\n",
>>   				     netdev_name(peer->ovpn->dev), peer->id);
>>   		goto err;
>>   	}
>>   
>> +	/* extract the packet into a new skb */
>> +	skb = ovpn_tcp_skb_packet(peer, skb, pkt_len, msg->offset + 2);
>> +	if (unlikely(!skb))
>> +		goto err;
> 
> 
> err:
> 	/* take reference for deferred peer deletion. should never fail */
> 	if (WARN_ON(!ovpn_peer_hold(peer)))
> 		goto err_nopeer;
> 	schedule_work(&peer->tcp.defer_del_work);
> 	dev_dstats_rx_dropped(peer->ovpn->dev);
> 
> I'm thinking that we may not need to abort the connection if we failed
> to allocate the new skb or copy the message, since we have a message
> boundary so we can still parse the following messages out of the
> stream? But we won't be able to validate that boundary since we can't
> try to decrypt the packet, so maybe it's better to still abort.
> (if the "don't abort" idea is wanted, it can be done at a later time,
> doesn't have to be part of this patch)



> 
>> +
>>   	/* DATA_V2 packets are handled in kernel, the rest goes to user space */
>>   	opcode = ovpn_opcode_from_skb(skb, 0);
>>   	if (unlikely(opcode != OVPN_DATA_V2)) {
>> @@ -113,7 +132,7 @@ static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
>>   		/* The packet size header must be there when sending the packet
>>   		 * to userspace, therefore we put it back
>>   		 */
>> -		skb_push(skb, 2);
>> +		*((__force __be16 *)__skb_push(skb, 2)) = cpu_to_be16(pkt_len);
> 
> Why switch to __skb_push here? The skb_under_panic() would be nice in
> the (unexpected) case we have an invalid skb.
> 

We just created the skb, so it should be safe to assume that the skb is 
valid, no?

> Here we rely on the extra room that netdev_alloc_skb reserves
> (NET_SKB_PAD) in case we have to re-push the packet length, right?

The comment at 
https://elixir.bootlin.com/linux/v6.19-rc5/source/include/linux/skbuff.h#L3282 
about NET_SKB_PAD seems to suggest that this is actual headroom that 
networking drivers can assume to have, no?
And it should never be less than 32 bytes.


Regards,

> I'm a bit concerned about the silent assumptions that could lead to
> writing to arbitrary addresses in memory.
> 
>>   		ovpn_tcp_to_userspace(peer, strp->sk, skb);
>>   		return;
>>   	}
>> -- 
>> 2.52.0
>>
>
Sabrina Dubroca Feb. 5, 2026, 10:42 a.m. UTC | #4
2026-02-04, 23:42:31 +0100, Antonio Quartulli wrote:
> Hi Sabrina!
> 
> On 04/02/2026 20:46, Sabrina Dubroca wrote:
> > > this resulted in TCP throughput improvements of up to 74%.
> > 
> > Wow.
> 
> I was also astonished by this result!
> 
> Actually Ralf tested various options (i.e. netdev_alloc_skb vs pskb_extract,
> on all skbs vs on large ones only) and this solution turned to be the one
> giving the highest boost.
> 
> In my tests yesterday, TCP jumped from ~4.3Gbps to ~7.3Gbps after applying
> this patch, which is the same throughput I am getting over UDP!

Very nice :)


> > > @@ -113,7 +132,7 @@ static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
> > >   		/* The packet size header must be there when sending the packet
> > >   		 * to userspace, therefore we put it back
> > >   		 */
> > > -		skb_push(skb, 2);
> > > +		*((__force __be16 *)__skb_push(skb, 2)) = cpu_to_be16(pkt_len);
> > 
> > Why switch to __skb_push here? The skb_under_panic() would be nice in
> > the (unexpected) case we have an invalid skb.
> > 
> 
> We just created the skb, so it should be safe to assume that the skb is
> valid, no?

I meant invalid in the sense of "not having the expected room to push
the header".

> > Here we rely on the extra room that netdev_alloc_skb reserves
> > (NET_SKB_PAD) in case we have to re-push the packet length, right?
> 
> The comment at https://elixir.bootlin.com/linux/v6.19-rc5/source/include/linux/skbuff.h#L3282
> about NET_SKB_PAD seems to suggest that this is actual headroom that
> networking drivers can assume to have, no?

I guess so. But I'd feel a bit safer if we had something enforcing
that at least this push can't fail, more as a guarantee against an
accidental code change in ovpn. But maybe we'd have bigger problems
through the rest of the stack if ovpn skbs get bridged/forwarded and
we don't have all that headroom.

> And it should never be less than 32 bytes.
> 
> 
> Regards,
Antonio Quartulli Feb. 5, 2026, 1:42 p.m. UTC | #5
On 05/02/2026 11:42, Sabrina Dubroca wrote:
> 2026-02-04, 23:42:31 +0100, Antonio Quartulli wrote:
>>>> @@ -113,7 +132,7 @@ static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
>>>>    		/* The packet size header must be there when sending the packet
>>>>    		 * to userspace, therefore we put it back
>>>>    		 */
>>>> -		skb_push(skb, 2);
>>>> +		*((__force __be16 *)__skb_push(skb, 2)) = cpu_to_be16(pkt_len);
>>>
>>> Why switch to __skb_push here? The skb_under_panic() would be nice in
>>> the (unexpected) case we have an invalid skb.
>>>
>>
>> We just created the skb, so it should be safe to assume that the skb is
>> valid, no?
> 
> I meant invalid in the sense of "not having the expected room to push
> the header".
> 
>>> Here we rely on the extra room that netdev_alloc_skb reserves
>>> (NET_SKB_PAD) in case we have to re-push the packet length, right?
>>
>> The comment at https://elixir.bootlin.com/linux/v6.19-rc5/source/include/linux/skbuff.h#L3282
>> about NET_SKB_PAD seems to suggest that this is actual headroom that
>> networking drivers can assume to have, no?
> 
> I guess so. But I'd feel a bit safer if we had something enforcing
> that at least this push can't fail, more as a guarantee against an
> accidental code change in ovpn. But maybe we'd have bigger problems
> through the rest of the stack if ovpn skbs get bridged/forwarded and
> we don't have all that headroom.

Exactly my thought. If this assumption doesn't hold anymore, more things 
are going to break.
(I expect also other mechanisms outside ovpn to be affected)

On top of that, we ovpn_tcp_send_skb() basically relying on the same 
assumption already.
Therefore, I'd just stick with the current version.

However, I have a little nitpick: Ralf, please use the same style as the 
assignment in ovpn_tcp_send_skb(), for consistency:

584         *(__be16 *)__skb_push(skb, sizeof(u16)) = htons(len);

Regards,

> 
>> And it should never be less than 32 bytes.
>>
>>
>> Regards,
>
Ralf Lici Feb. 5, 2026, 2:28 p.m. UTC | #6
On Wed, 2026-02-04 at 20:46 +0100, Sabrina Dubroca wrote:
> Sorry Ralf, December and January flew by and I didn't get a chance to
> dive deeper into your previous RFC patch.

No problem at all. We had the opportunity to investigate this further
and run some tests, as Antonio mentioned.

> 2026-02-04, 15:14:41 +0100, Ralf Lici wrote:
> > When processing TCP stream data in ovpn_tcp_recv, we receive large
> > cloned skbs from __strp_rcv that may contain multiple coalesced
> > packets.
> > The current implementation has two bugs:
> > 
> > 1. Header offset overflow: Using pskb_pull with large offsets on
> >    coalesced skbs causes skb->data - skb->head to exceed the u16
> > storage
> >    of skb->network_header. This causes skb_reset_network_header to
> > fail
> >    on the inner decapsulated packet, resulting in packet drops.
> > 
> > 2. Unaligned protocol headers: Extracting packets from arbitrary
> >    positions within the coalesced TCP stream provides no alignment
> >    guarantees for the packet data causing performance penalties on
> >    architectures without efficient unaligned access. Additionally,
> >    openvpn's 2-byte length prefix on TCP packets causes the
> > subsequent
> >    4-byte opcode and packet ID fields to be inherently misaligned.
> > 
> > Fix both issues by allocating a new skb for each openvpn packet and
> > using skb_copy_bits to extract only the packet content into the new
> > buffer, skipping the 2-byte length prefix. Also, check the length
> > before
> > invoking the function that performs the allocation to avoid creating
> > an
> > invalid skb.
> > 
> > If the packet has to be forwarded to userspace the 2-byte prefix can
> > be
> > pushed to the head safely, without misalignment.
> > 
> > As a side effect, this approach also avoids the expensive
> > linearization
> > that pskb_pull triggers on cloned skbs with page fragments. In
> > testing,
> 
> We don't really avoid the linearization (because
> netdev_alloc_skb+skb_copy_bits is pretty much doing that) but we're
> only linearizing pkt_len instead of offset + pkt_len?

Exactly, the main issue was the first pskb_pull in ovpn_tcp_rcv which
would linearize large amounts of data for every openvpn packet inside a
coalesced skb (strp keeps cloning the original one). In fact your
suggestion of using pskb_extract does resolve the
skb_reset_network_header issue and improves throughput but
netdev_alloc_skb performs better. I suppose that's because while
pskb_extract avoids initial data copying through frag manipulation,
skb_cow_data in ovpn_aead_decrypt still linearizes the packet for
crypto, resulting in two passes over the data (carve + linearize) versus
one direct copy with netdev_alloc_skb. Also pskb_extract requires atomic
refcount operations which are not needed with the alloc + copy approach.

> 
> > this resulted in TCP throughput improvements of up to 74%.
> 
> Wow.
> 
> [...]
> > +static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff
> > *skb)
> > +{
> > +	struct ovpn_peer *peer = container_of(strp, struct
> > ovpn_peer, tcp.strp);
> > +	struct strp_msg *msg = strp_msg(skb);
> > +	int pkt_len = msg->full_len - 2;
> > +	u8 opcode;
> > +
> > +	/* we need at least 4 bytes of data in the packet
> >  	 * to extract the opcode and the key ID later on
> >  	 */
> > -	if (!pskb_may_pull(skb, OVPN_OPCODE_SIZE)) {
> > +	if (unlikely(pkt_len < OVPN_OPCODE_SIZE)) {
> >  		net_warn_ratelimited("%s: packet too small to fetch
> > opcode for peer %u\n",
> >  				     netdev_name(peer->ovpn->dev),
> > peer->id);
> >  		goto err;
> >  	}
> >  
> > +	/* extract the packet into a new skb */
> > +	skb = ovpn_tcp_skb_packet(peer, skb, pkt_len, msg->offset +
> > 2);
> > +	if (unlikely(!skb))
> > +		goto err;
> 
> 
> err:
> 	/* take reference for deferred peer deletion. should never
> fail */
> 	if (WARN_ON(!ovpn_peer_hold(peer)))
> 		goto err_nopeer;
> 	schedule_work(&peer->tcp.defer_del_work);
> 	dev_dstats_rx_dropped(peer->ovpn->dev);
> 
> I'm thinking that we may not need to abort the connection if we failed
> to allocate the new skb or copy the message, since we have a message
> boundary so we can still parse the following messages out of the
> stream? But we won't be able to validate that boundary since we can't
> try to decrypt the packet, so maybe it's better to still abort.
> (if the "don't abort" idea is wanted, it can be done at a later time,
> doesn't have to be part of this patch)

I agree. While an allocation failure here doesn't strictly require
aborting the connection, subsequent operations are unlikely to succeed
anyway. I can address this in a follow-up patch to keep this fix
focused.

> > +
> >  	/* DATA_V2 packets are handled in kernel, the rest goes to
> > user space */
> >  	opcode = ovpn_opcode_from_skb(skb, 0);
> >  	if (unlikely(opcode != OVPN_DATA_V2)) {
> > @@ -113,7 +132,7 @@ static void ovpn_tcp_rcv(struct strparser *strp,
> > struct sk_buff *skb)
> >  		/* The packet size header must be there when
> > sending the packet
> >  		 * to userspace, therefore we put it back
> >  		 */
> > -		skb_push(skb, 2);
> > +		*((__force __be16 *)__skb_push(skb, 2)) =
> > cpu_to_be16(pkt_len);
> 
> Why switch to __skb_push here? The skb_under_panic() would be nice in
> the (unexpected) case we have an invalid skb.
> 
> Here we rely on the extra room that netdev_alloc_skb reserves
> (NET_SKB_PAD) in case we have to re-push the packet length, right?
> I'm a bit concerned about the silent assumptions that could lead to
> writing to arbitrary addresses in memory.
> 
> >  		ovpn_tcp_to_userspace(peer, strp->sk, skb);
> >  		return;
> >  	}
> > -- 
> > 2.52.0
> >
Ralf Lici Feb. 5, 2026, 2:29 p.m. UTC | #7
On Thu, 2026-02-05 at 14:42 +0100, Antonio Quartulli wrote:
> On 05/02/2026 11:42, Sabrina Dubroca wrote:
> > 2026-02-04, 23:42:31 +0100, Antonio Quartulli wrote:
> > > > > @@ -113,7 +132,7 @@ static void ovpn_tcp_rcv(struct strparser
> > > > > *strp, struct sk_buff *skb)
> > > > >    		/* The packet size header must be there when
> > > > > sending the packet
> > > > >    		 * to userspace, therefore we put it back
> > > > >    		 */
> > > > > -		skb_push(skb, 2);
> > > > > +		*((__force __be16 *)__skb_push(skb, 2)) =
> > > > > cpu_to_be16(pkt_len);
> > > > 
> > > > Why switch to __skb_push here? The skb_under_panic() would be
> > > > nice in
> > > > the (unexpected) case we have an invalid skb.
> > > > 
> > > 
> > > We just created the skb, so it should be safe to assume that the
> > > skb is
> > > valid, no?
> > 
> > I meant invalid in the sense of "not having the expected room to
> > push
> > the header".
> > 
> > > > Here we rely on the extra room that netdev_alloc_skb reserves
> > > > (NET_SKB_PAD) in case we have to re-push the packet length,
> > > > right?
> > > 
> > > The comment at
> > > https://elixir.bootlin.com/linux/v6.19-rc5/source/include/linux/skbuff.h#L3282
> > > about NET_SKB_PAD seems to suggest that this is actual headroom
> > > that
> > > networking drivers can assume to have, no?
> > 
> > I guess so. But I'd feel a bit safer if we had something enforcing
> > that at least this push can't fail, more as a guarantee against an
> > accidental code change in ovpn. But maybe we'd have bigger problems
> > through the rest of the stack if ovpn skbs get bridged/forwarded and
> > we don't have all that headroom.
> 
> Exactly my thought. If this assumption doesn't hold anymore, more
> things 
> are going to break.
> (I expect also other mechanisms outside ovpn to be affected)
> 
> On top of that, we ovpn_tcp_send_skb() basically relying on the same 
> assumption already.
> Therefore, I'd just stick with the current version.
> 
> However, I have a little nitpick: Ralf, please use the same style as
> the 
> assignment in ovpn_tcp_send_skb(), for consistency:
> 
> 584         *(__be16 *)__skb_push(skb, sizeof(u16)) = htons(len);

ACK.

> Regards,
> 
> > 
> > > And it should never be less than 32 bytes.
> > > 
> > > 
> > > Regards,
> >

Patch

diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c
index b7348da9b040..ffe8db69d76d 100644
--- a/drivers/net/ovpn/tcp.c
+++ b/drivers/net/ovpn/tcp.c
@@ -70,37 +70,56 @@  static void ovpn_tcp_to_userspace(struct ovpn_peer *peer, struct sock *sk,
 	peer->tcp.sk_cb.sk_data_ready(sk);
 }
 
-static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
+static struct sk_buff *ovpn_tcp_skb_packet(const struct ovpn_peer *peer,
+					   struct sk_buff *orig_skb,
+					   const int pkt_len, const int pkt_off)
 {
-	struct ovpn_peer *peer = container_of(strp, struct ovpn_peer, tcp.strp);
-	struct strp_msg *msg = strp_msg(skb);
-	size_t pkt_len = msg->full_len - 2;
-	size_t off = msg->offset + 2;
-	u8 opcode;
+	struct sk_buff *ovpn_skb;
+	int err;
 
-	/* ensure skb->data points to the beginning of the openvpn packet */
-	if (!pskb_pull(skb, off)) {
-		net_warn_ratelimited("%s: packet too small for peer %u\n",
-				     netdev_name(peer->ovpn->dev), peer->id);
+	/* create a new skb with only the content of the current packet */
+	ovpn_skb = netdev_alloc_skb(peer->ovpn->dev, pkt_len);
+	if (unlikely(!ovpn_skb))
 		goto err;
-	}
 
-	/* strparser does not trim the skb for us, therefore we do it now */
-	if (pskb_trim(skb, pkt_len) != 0) {
-		net_warn_ratelimited("%s: trimming skb failed for peer %u\n",
+	skb_copy_header(ovpn_skb, orig_skb);
+	err = skb_copy_bits(orig_skb, pkt_off, skb_put(ovpn_skb, pkt_len),
+			    pkt_len);
+	if (unlikely(err)) {
+		net_warn_ratelimited("%s: skb_copy_bits failed for peer %u\n",
 				     netdev_name(peer->ovpn->dev), peer->id);
+		kfree_skb(ovpn_skb);
 		goto err;
 	}
 
-	/* we need the first 4 bytes of data to be accessible
+	consume_skb(orig_skb);
+	return ovpn_skb;
+err:
+	kfree_skb(orig_skb);
+	return NULL;
+}
+
+static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
+{
+	struct ovpn_peer *peer = container_of(strp, struct ovpn_peer, tcp.strp);
+	struct strp_msg *msg = strp_msg(skb);
+	int pkt_len = msg->full_len - 2;
+	u8 opcode;
+
+	/* we need at least 4 bytes of data in the packet
 	 * to extract the opcode and the key ID later on
 	 */
-	if (!pskb_may_pull(skb, OVPN_OPCODE_SIZE)) {
+	if (unlikely(pkt_len < OVPN_OPCODE_SIZE)) {
 		net_warn_ratelimited("%s: packet too small to fetch opcode for peer %u\n",
 				     netdev_name(peer->ovpn->dev), peer->id);
 		goto err;
 	}
 
+	/* extract the packet into a new skb */
+	skb = ovpn_tcp_skb_packet(peer, skb, pkt_len, msg->offset + 2);
+	if (unlikely(!skb))
+		goto err;
+
 	/* DATA_V2 packets are handled in kernel, the rest goes to user space */
 	opcode = ovpn_opcode_from_skb(skb, 0);
 	if (unlikely(opcode != OVPN_DATA_V2)) {
@@ -113,7 +132,7 @@  static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
 		/* The packet size header must be there when sending the packet
 		 * to userspace, therefore we put it back
 		 */
-		skb_push(skb, 2);
+		*((__force __be16 *)__skb_push(skb, 2)) = cpu_to_be16(pkt_len);
 		ovpn_tcp_to_userspace(peer, strp->sk, skb);
 		return;
 	}