Skip to content

Commit

Permalink
mptcp: Add handling of outgoing MP_JOIN requests
Browse files Browse the repository at this point in the history
Subflow creation may be initiated by the path manager when
the primary connection is fully established and a remote
address has been received via ADD_ADDR.

Create an in-kernel sock and use kernel_connect() to
initiate connection.

Passive sockets can't acquire the mptcp socket lock at
subflow creation time, so an additional list protected by
a new spinlock is used to track the MPJ subflows.

Such list is spliced into conn_list tail every time the msk
socket lock is acquired, so that it will not interfere
with data flow on the original connection.

Data flow and connection failover not addressed by this commit.

Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Peter Krystad <peter.krystad@linux.intel.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Peter Krystad authored and davem330 committed Mar 30, 2020
1 parent f296234 commit ec3edaa
Show file tree
Hide file tree
Showing 5 changed files with 287 additions and 17 deletions.
2 changes: 2 additions & 0 deletions include/net/mptcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ struct mptcp_out_options {
u8 backup;
u32 nonce;
u64 thmac;
u32 token;
u8 hmac[20];
struct mptcp_ext ext_copy;
#endif
};
Expand Down
108 changes: 94 additions & 14 deletions net/mptcp/options.c
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,16 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
opts->sndr_key = subflow->local_key;
*size = TCPOLEN_MPTCP_MPC_SYN;
return true;
} else if (subflow->request_join) {
pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
subflow->local_nonce);
opts->suboptions = OPTION_MPTCP_MPJ_SYN;
opts->join_id = subflow->local_id;
opts->token = subflow->remote_token;
opts->nonce = subflow->local_nonce;
opts->backup = subflow->request_bkup;
*size = TCPOLEN_MPTCP_MPJ_SYN;
return true;
}
return false;
}
Expand All @@ -337,16 +347,55 @@ void mptcp_rcv_synsent(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct tcp_sock *tp = tcp_sk(sk);

pr_debug("subflow=%p", subflow);
if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
subflow->mp_capable = 1;
subflow->can_ack = 1;
subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
} else {
pr_debug("subflow=%p, remote_key=%llu", subflow,
subflow->remote_key);
} else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) {
subflow->mp_join = 1;
subflow->thmac = tp->rx_opt.mptcp.thmac;
subflow->remote_nonce = tp->rx_opt.mptcp.nonce;
pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
subflow->thmac, subflow->remote_nonce);
} else if (subflow->request_mptcp) {
tcp_sk(sk)->is_mptcp = 0;
}
}

/* MP_JOIN client subflow must wait for 4th ack before sending any data:
* TCP can't schedule delack timer before the subflow is fully established.
* MPTCP uses the delack timer to do 3rd ack retransmissions
*/
static void schedule_3rdack_retransmission(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned long timeout;

/* reschedule with a timeout above RTT, as we must look only for drop */
if (tp->srtt_us)
timeout = tp->srtt_us << 1;
else
timeout = TCP_TIMEOUT_INIT;

WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
icsk->icsk_ack.timeout = timeout;
sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}

static void clear_3rdack_retransmission(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);

sk_stop_timer(sk, &icsk->icsk_delack_timer);
icsk->icsk_ack.timeout = 0;
icsk->icsk_ack.ato = 0;
icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
}

static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
unsigned int *size,
unsigned int remaining,
Expand All @@ -356,17 +405,21 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
struct mptcp_ext *mpext;
unsigned int data_len;

pr_debug("subflow=%p fully established=%d seq=%x:%x remaining=%d",
subflow, subflow->fully_established, subflow->snd_isn,
skb ? TCP_SKB_CB(skb)->seq : 0, remaining);
/* When skb is not available, we better over-estimate the emitted
* options len. A full DSS option (28 bytes) is longer than
* TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
* tell the caller to defer the estimate to
* mptcp_established_options_dss(), which will reserve enough space.
*/
if (!skb)
return false;

if (subflow->mp_capable && !subflow->fully_established && skb &&
subflow->snd_isn == TCP_SKB_CB(skb)->seq) {
/* When skb is not available, we better over-estimate the
* emitted options len. A full DSS option is longer than
* TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit
* that.
*/
/* MPC/MPJ needed only on 3rd ack packet */
if (subflow->fully_established ||
subflow->snd_isn != TCP_SKB_CB(skb)->seq)
return false;

if (subflow->mp_capable) {
mpext = mptcp_get_ext(skb);
data_len = mpext ? mpext->data_len : 0;

Expand Down Expand Up @@ -394,6 +447,14 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
data_len);

return true;
} else if (subflow->mp_join) {
opts->suboptions = OPTION_MPTCP_MPJ_ACK;
memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
*size = TCPOLEN_MPTCP_MPJ_ACK;
pr_debug("subflow=%p", subflow);

schedule_3rdack_retransmission(sk);
return true;
}
return false;
}
Expand Down Expand Up @@ -674,10 +735,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
return true;

subflow->pm_notified = 1;
if (subflow->mp_join)
if (subflow->mp_join) {
clear_3rdack_retransmission(sk);
mptcp_pm_subflow_established(msk, subflow);
else
} else {
mptcp_pm_fully_established(msk);
}
return true;
}

Expand Down Expand Up @@ -860,6 +923,16 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
0, opts->rm_id);
}

if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
TCPOLEN_MPTCP_MPJ_SYN,
opts->backup, opts->join_id);
put_unaligned_be32(opts->token, ptr);
ptr += 1;
put_unaligned_be32(opts->nonce, ptr);
ptr += 1;
}

if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
TCPOLEN_MPTCP_MPJ_SYNACK,
Expand All @@ -870,6 +943,13 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
ptr += 1;
}

if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
ptr += 5;
}

if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
struct mptcp_ext *mpext = &opts->ext_copy;
u8 len = TCPOLEN_MPTCP_DSS_BASE;
Expand Down
31 changes: 30 additions & 1 deletion net/mptcp/protocol.c
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,16 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
sk->sk_data_ready(sk);
}

static void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
if (likely(list_empty(&msk->join_list)))
return;

spin_lock_bh(&msk->join_list_lock);
list_splice_tail_init(&msk->join_list, &msk->conn_list);
spin_unlock_bh(&msk->join_list_lock);
}

static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
if (!msk->cached_ext)
Expand Down Expand Up @@ -462,6 +472,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return ret >= 0 ? ret + copied : (copied ? copied : ret);
}

__mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk);
while (!sk_stream_memory_free(sk) || !ssk) {
ret = sk_stream_wait_memory(sk, &timeo);
Expand Down Expand Up @@ -603,6 +614,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,

len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
__mptcp_flush_join_list(msk);

while (len > (size_t)copied) {
int bytes_read;
Expand Down Expand Up @@ -718,6 +730,7 @@ static void mptcp_worker(struct work_struct *work)
struct sock *sk = &msk->sk.icsk_inet.sk;

lock_sock(sk);
__mptcp_flush_join_list(msk);
__mptcp_move_skbs(msk);
release_sock(sk);
sock_put(sk);
Expand All @@ -727,7 +740,10 @@ static int __mptcp_init_sock(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);

spin_lock_init(&msk->join_list_lock);

INIT_LIST_HEAD(&msk->conn_list);
INIT_LIST_HEAD(&msk->join_list);
__set_bit(MPTCP_SEND_SPACE, &msk->flags);
INIT_WORK(&msk->work, mptcp_worker);

Expand Down Expand Up @@ -800,6 +816,8 @@ static void mptcp_close(struct sock *sk, long timeout)
mptcp_token_destroy(msk->token);
inet_sk_state_store(sk, TCP_CLOSE);

__mptcp_flush_join_list(msk);

list_splice_init(&msk->conn_list, &conn_list);

data_fin_tx_seq = msk->write_seq;
Expand Down Expand Up @@ -1107,6 +1125,7 @@ bool mptcp_finish_join(struct sock *sk)
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct sock *parent = (void *)msk;
struct socket *parent_sock;
bool ret;

pr_debug("msk=%p, subflow=%p", msk, subflow);

Expand All @@ -1122,7 +1141,15 @@ bool mptcp_finish_join(struct sock *sk)
if (parent_sock && !sk->sk_socket)
mptcp_sock_graft(sk, parent_sock);

return mptcp_pm_allow_new_subflow(msk);
ret = mptcp_pm_allow_new_subflow(msk);
if (ret) {
/* active connections are already on conn_list */
spin_lock_bh(&msk->join_list_lock);
if (!WARN_ON_ONCE(!list_empty(&subflow->node)))
list_add_tail(&subflow->node, &msk->join_list);
spin_unlock_bh(&msk->join_list_lock);
}
return ret;
}

bool mptcp_sk_is_subflow(const struct sock *sk)
Expand Down Expand Up @@ -1311,6 +1338,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
*/
__mptcp_flush_join_list(msk);
list_for_each_entry(subflow, &msk->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

Expand Down Expand Up @@ -1392,6 +1420,7 @@ static int mptcp_shutdown(struct socket *sock, int how)
sock->state = SS_CONNECTED;
}

__mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

Expand Down
13 changes: 13 additions & 0 deletions net/mptcp/protocol.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,10 @@
#define TCPOLEN_MPTCP_PORT_LEN 2
#define TCPOLEN_MPTCP_RM_ADDR_BASE 4

/* MPTCP MP_JOIN flags */
#define MPTCPOPT_BACKUP BIT(0)
#define MPTCPOPT_HMAC_LEN 20
#define MPTCPOPT_THMAC_LEN 8

/* MPTCP MP_CAPABLE flags */
#define MPTCP_VERSION_MASK (0x0F)
Expand Down Expand Up @@ -148,8 +150,10 @@ struct mptcp_sock {
u32 token;
unsigned long flags;
bool can_ack;
spinlock_t join_list_lock;
struct work_struct work;
struct list_head conn_list;
struct list_head join_list;
struct skb_ext *cached_ext; /* for the next sendmsg */
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct sock *first;
Expand Down Expand Up @@ -202,6 +206,8 @@ struct mptcp_subflow_context {
u32 ssn_offset;
u32 map_data_len;
u32 request_mptcp : 1, /* send MP_CAPABLE */
request_join : 1, /* send MP_JOIN */
request_bkup : 1,
mp_capable : 1, /* remote is MPTCP capable */
mp_join : 1, /* remote is JOINing */
fully_established : 1, /* path validated */
Expand All @@ -218,6 +224,8 @@ struct mptcp_subflow_context {
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
u32 remote_token;
u8 hmac[MPTCPOPT_HMAC_LEN];
u8 local_id;
u8 remote_id;

Expand Down Expand Up @@ -263,6 +271,11 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
int mptcp_is_enabled(struct net *net);
bool mptcp_subflow_data_available(struct sock *sk);
void mptcp_subflow_init(void);

/* called with sk socket lock held */
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);

static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
Expand Down
Loading

0 comments on commit ec3edaa

Please sign in to comment.