/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim:set ts=2 sw=2 sts=2 et cindent: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #include "TCPFastOpenLayer.h" #include "nsSocketTransportService2.h" #include "prmem.h" #include "prio.h" namespace mozilla { namespace net { static PRDescIdentity sTCPFastOpenLayerIdentity; static PRIOMethods sTCPFastOpenLayerMethods; static PRIOMethods *sTCPFastOpenLayerMethodsPtr = nullptr; #define TFO_MAX_PACKET_SIZE_IPV4 1460 #define TFO_MAX_PACKET_SIZE_IPV6 1440 #define TFO_TLS_RECORD_HEADER_SIZE 22 /** * For the TCP Fast Open it is necessary to send all data that can fit into the * first packet on a single sendto function call. Consecutive calls will not * have an effect. Therefore TCPFastOpenLayer will collect some data before * calling sendto. Necko and nss will call PR_Write multiple times with small * amount of data. * TCPFastOpenLayer has 4 states: * WAITING_FOR_CONNECT: * This is before connect is call. A call of recv, send or getpeername will * return PR_NOT_CONNECTED_ERROR. After connect is call the state transfers * into COLLECT_DATA_FOR_FIRST_PACKET. * * COLLECT_DATA_FOR_FIRST_PACKET: * In this state all data received by send function calls will be stored in * a buffer. If transaction do not have any more data ready to be sent or * the buffer is full, TCPFastOpenFinish is call. TCPFastOpenFinish sends * the collected data using sendto function and the state transfers to * WAITING_FOR_CONNECTCONTINUE. If an error occurs during sendto, the error * is reported by the TCPFastOpenFinish return values. nsSocketTransfer is * the only caller of TCPFastOpenFinish; it knows how to interpreter these * errors. * WAITING_FOR_CONNECTCONTINUE: * connectcontinue transfers from this state to CONNECTED. Any other * function (e.g. send, recv) returns PR_WOULD_BLOCK_ERROR. * CONNECTED: * The size of mFirstPacketBuf is 1440/1460 (RFC7413 recomends that packet * does exceeds these sizes). SendTo does not have to consume all buffered * data and some data can be still in mFirstPacketBuf. Before sending any * new data we need to send the remaining buffered data. **/ class TCPFastOpenSecret { public: TCPFastOpenSecret() : mState(WAITING_FOR_CONNECT) , mFirstPacketBufLen(0) , mCondition(0) { this->mAddr.raw.family = 0; this->mAddr.inet.family = 0; this->mAddr.inet.port = 0; this->mAddr.inet.ip = 0; this->mAddr.ipv6.family = 0; this->mAddr.ipv6.port = 0; this->mAddr.ipv6.flowinfo = 0; this->mAddr.ipv6.scope_id = 0; this->mAddr.local.family = 0; } enum { CONNECTED, WAITING_FOR_CONNECTCONTINUE, COLLECT_DATA_FOR_FIRST_PACKET, WAITING_FOR_CONNECT, SOCKET_ERROR_STATE } mState; PRNetAddr mAddr; char mFirstPacketBuf[1460]; uint16_t mFirstPacketBufLen; PRErrorCode mCondition; }; static PRStatus TCPFastOpenConnect(PRFileDesc *fd, const PRNetAddr *addr, PRIntervalTime timeout) { MOZ_RELEASE_ASSERT(fd->identity == sTCPFastOpenLayerIdentity); MOZ_ASSERT(OnSocketThread(), "not on socket thread"); TCPFastOpenSecret *secret = reinterpret_cast(fd->secret); SOCKET_LOG(("TCPFastOpenConnect state=%d.\n", secret->mState)); if (secret->mState != TCPFastOpenSecret::WAITING_FOR_CONNECT) { PR_SetError(PR_IS_CONNECTED_ERROR, 0); return PR_FAILURE; } // Remember the address. It will be used for sendto or connect later. memcpy(&secret->mAddr, addr, sizeof(secret->mAddr)); secret->mState = TCPFastOpenSecret::COLLECT_DATA_FOR_FIRST_PACKET; return PR_SUCCESS; } static PRInt32 TCPFastOpenSend(PRFileDesc *fd, const void *buf, PRInt32 amount, PRIntn flags, PRIntervalTime timeout) { MOZ_RELEASE_ASSERT(fd->identity == sTCPFastOpenLayerIdentity); MOZ_ASSERT(OnSocketThread(), "not on socket thread"); TCPFastOpenSecret *secret = reinterpret_cast(fd->secret); SOCKET_LOG(("TCPFastOpenSend state=%d.\n", secret->mState)); switch(secret->mState) { case TCPFastOpenSecret::CONNECTED: // Before sending new data we need to drain the data collected during tfo. if (secret->mFirstPacketBufLen) { SOCKET_LOG(("TCPFastOpenSend - %d bytes to drain from " "mFirstPacketBufLen.\n", secret->mFirstPacketBufLen )); PRInt32 rv = (fd->lower->methods->send)(fd->lower, secret->mFirstPacketBuf, secret->mFirstPacketBufLen, 0, // flags PR_INTERVAL_NO_WAIT); if (rv <= 0) { return rv; } secret->mFirstPacketBufLen -= rv; if (secret->mFirstPacketBufLen) { memmove(secret->mFirstPacketBuf, secret->mFirstPacketBuf + rv, secret->mFirstPacketBufLen); PR_SetError(PR_WOULD_BLOCK_ERROR, 0); return PR_FAILURE; } // if we drained the buffer we can fall through this checks and call // send for the new data } SOCKET_LOG(("TCPFastOpenSend sending new data.\n")); return (fd->lower->methods->send)(fd->lower, buf, amount, flags, timeout); case TCPFastOpenSecret::WAITING_FOR_CONNECTCONTINUE: PR_SetError(PR_WOULD_BLOCK_ERROR, 0); return -1; case TCPFastOpenSecret::COLLECT_DATA_FOR_FIRST_PACKET: { int32_t toSend = (secret->mAddr.raw.family == PR_AF_INET) ? TFO_MAX_PACKET_SIZE_IPV4 : TFO_MAX_PACKET_SIZE_IPV6; MOZ_ASSERT(secret->mFirstPacketBufLen <= toSend); toSend -= secret->mFirstPacketBufLen; SOCKET_LOG(("TCPFastOpenSend: amount of data in the buffer=%d; the amount" " of additional data that can be stored=%d.\n", secret->mFirstPacketBufLen, toSend)); if (!toSend) { PR_SetError(PR_WOULD_BLOCK_ERROR, 0); return -1; } toSend = (toSend > amount) ? amount : toSend; memcpy(secret->mFirstPacketBuf + secret->mFirstPacketBufLen, buf, toSend); secret->mFirstPacketBufLen += toSend; return toSend; } case TCPFastOpenSecret::WAITING_FOR_CONNECT: PR_SetError(PR_NOT_CONNECTED_ERROR, 0); return -1; case TCPFastOpenSecret::SOCKET_ERROR_STATE: PR_SetError(secret->mCondition, 0); return -1; } PR_SetError(PR_WOULD_BLOCK_ERROR, 0); return PR_FAILURE; } static PRInt32 TCPFastOpenWrite(PRFileDesc *fd, const void *buf, PRInt32 amount) { return TCPFastOpenSend(fd, buf, amount, 0, PR_INTERVAL_NO_WAIT); } static PRInt32 TCPFastOpenRecv(PRFileDesc *fd, void *buf, PRInt32 amount, PRIntn flags, PRIntervalTime timeout) { MOZ_RELEASE_ASSERT(fd->identity == sTCPFastOpenLayerIdentity); TCPFastOpenSecret *secret = reinterpret_cast(fd->secret); PRInt32 rv = -1; switch(secret->mState) { case TCPFastOpenSecret::CONNECTED: if (secret->mFirstPacketBufLen) { // TLS will not call write before receiving data from a server, therefore // we need to force sending buffered data even during recv call. Otherwise // It can come to a deadlock (clients waits for response, but the request // is sitting in mFirstPacketBufLen). SOCKET_LOG(("TCPFastOpenRevc - %d bytes to drain from mFirstPacketBuf.\n", secret->mFirstPacketBufLen )); PRInt32 rv = (fd->lower->methods->send)(fd->lower, secret->mFirstPacketBuf, secret->mFirstPacketBufLen, 0, // flags PR_INTERVAL_NO_WAIT); if (rv <= 0) { return rv; } secret->mFirstPacketBufLen -= rv; if (secret->mFirstPacketBufLen) { memmove(secret->mFirstPacketBuf, secret->mFirstPacketBuf + rv, secret->mFirstPacketBufLen); } } rv = (fd->lower->methods->recv)(fd->lower, buf, amount, flags, timeout); break; case TCPFastOpenSecret::WAITING_FOR_CONNECTCONTINUE: case TCPFastOpenSecret::COLLECT_DATA_FOR_FIRST_PACKET: PR_SetError(PR_WOULD_BLOCK_ERROR, 0); break; case TCPFastOpenSecret::WAITING_FOR_CONNECT: PR_SetError(PR_NOT_CONNECTED_ERROR, 0); break; case TCPFastOpenSecret::SOCKET_ERROR_STATE: PR_SetError(secret->mCondition, 0); } return rv; } static PRInt32 TCPFastOpenRead(PRFileDesc *fd, void *buf, PRInt32 amount) { return TCPFastOpenRecv(fd, buf, amount , 0, PR_INTERVAL_NO_WAIT); } static PRStatus TCPFastOpenConnectContinue(PRFileDesc *fd, PRInt16 out_flags) { MOZ_RELEASE_ASSERT(fd->identity == sTCPFastOpenLayerIdentity); TCPFastOpenSecret *secret = reinterpret_cast(fd->secret); PRStatus rv = PR_FAILURE; switch(secret->mState) { case TCPFastOpenSecret::CONNECTED: rv = PR_SUCCESS; break; case TCPFastOpenSecret::WAITING_FOR_CONNECT: case TCPFastOpenSecret::COLLECT_DATA_FOR_FIRST_PACKET: PR_SetError(PR_NOT_CONNECTED_ERROR, 0); rv = PR_FAILURE; break; case TCPFastOpenSecret::WAITING_FOR_CONNECTCONTINUE: rv = (fd->lower->methods->connectcontinue)(fd->lower, out_flags); SOCKET_LOG(("TCPFastOpenConnectContinue result=%d.\n", rv)); secret->mState = TCPFastOpenSecret::CONNECTED; break; case TCPFastOpenSecret::SOCKET_ERROR_STATE: PR_SetError(secret->mCondition, 0); rv = PR_FAILURE; } return rv; } static PRStatus TCPFastOpenClose(PRFileDesc *fd) { if (!fd) { return PR_FAILURE; } PRFileDesc* layer = PR_PopIOLayer(fd, PR_TOP_IO_LAYER); MOZ_RELEASE_ASSERT(layer && layer->identity == sTCPFastOpenLayerIdentity, "TCP Fast Open Layer not on top of stack"); TCPFastOpenSecret *secret = reinterpret_cast(layer->secret); layer->secret = nullptr; layer->dtor(layer); delete secret; return fd->methods->close(fd); } static PRStatus TCPFastOpenGetpeername (PRFileDesc *fd, PRNetAddr *addr) { MOZ_RELEASE_ASSERT(fd); MOZ_RELEASE_ASSERT(addr); MOZ_RELEASE_ASSERT(fd->identity == sTCPFastOpenLayerIdentity); TCPFastOpenSecret *secret = reinterpret_cast(fd->secret); if (secret->mState == TCPFastOpenSecret::WAITING_FOR_CONNECT) { PR_SetError(PR_NOT_CONNECTED_ERROR, 0); return PR_FAILURE; } memcpy(addr, &secret->mAddr, sizeof(secret->mAddr)); return PR_SUCCESS; } static PRInt16 TCPFastOpenPoll(PRFileDesc *fd, PRInt16 how_flags, PRInt16 *p_out_flags) { MOZ_RELEASE_ASSERT(fd); MOZ_RELEASE_ASSERT(fd->identity == sTCPFastOpenLayerIdentity); TCPFastOpenSecret *secret = reinterpret_cast(fd->secret); if (secret->mFirstPacketBufLen) { how_flags |= PR_POLL_WRITE; } return fd->lower->methods->poll(fd->lower, how_flags, p_out_flags); } nsresult AttachTCPFastOpenIOLayer(PRFileDesc *fd) { MOZ_ASSERT(OnSocketThread(), "not on socket thread"); if (!sTCPFastOpenLayerMethodsPtr) { sTCPFastOpenLayerIdentity = PR_GetUniqueIdentity("TCPFastOpen Layer"); sTCPFastOpenLayerMethods = *PR_GetDefaultIOMethods(); sTCPFastOpenLayerMethods.connect = TCPFastOpenConnect; sTCPFastOpenLayerMethods.send = TCPFastOpenSend; sTCPFastOpenLayerMethods.write = TCPFastOpenWrite; sTCPFastOpenLayerMethods.recv = TCPFastOpenRecv; sTCPFastOpenLayerMethods.read = TCPFastOpenRead; sTCPFastOpenLayerMethods.connectcontinue = TCPFastOpenConnectContinue; sTCPFastOpenLayerMethods.close = TCPFastOpenClose; sTCPFastOpenLayerMethods.getpeername = TCPFastOpenGetpeername; sTCPFastOpenLayerMethods.poll = TCPFastOpenPoll; sTCPFastOpenLayerMethodsPtr = &sTCPFastOpenLayerMethods; } PRFileDesc *layer = PR_CreateIOLayerStub(sTCPFastOpenLayerIdentity, sTCPFastOpenLayerMethodsPtr); if (!layer) { return NS_ERROR_FAILURE; } TCPFastOpenSecret *secret = new TCPFastOpenSecret(); layer->secret = reinterpret_cast(secret); PRStatus status = PR_PushIOLayer(fd, PR_NSPR_IO_LAYER, layer); if (status == PR_FAILURE) { delete secret; PR_Free(layer); // PR_CreateIOLayerStub() uses PR_Malloc(). return NS_ERROR_FAILURE; } return NS_OK; } void TCPFastOpenFinish(PRFileDesc *fd, PRErrorCode &err, bool &fastOpenNotSupported, uint8_t &tfoStatus) { PRFileDesc *tfoFd = PR_GetIdentitiesLayer(fd, sTCPFastOpenLayerIdentity); MOZ_RELEASE_ASSERT(tfoFd); MOZ_ASSERT(OnSocketThread(), "not on socket thread"); TCPFastOpenSecret *secret = reinterpret_cast(tfoFd->secret); MOZ_ASSERT(secret->mState == TCPFastOpenSecret::COLLECT_DATA_FOR_FIRST_PACKET); fastOpenNotSupported = false; tfoStatus = TFO_NOT_TRIED; PRErrorCode result = 0; // If we do not have data to send with syn packet or nspr version does not // have sendto implemented we will call normal connect. // If sendto is not implemented it points to _PR_InvalidInt, therefore we // check if sendto != _PR_InvalidInt. _PR_InvalidInt is exposed so we use // reserved_fn_0 which also points to _PR_InvalidInt. if (!secret->mFirstPacketBufLen || (tfoFd->lower->methods->sendto == (PRSendtoFN)tfoFd->lower->methods->reserved_fn_0)) { // Because of the way our nsHttpTransaction dispatch work, it can happened // that data has not been written into the socket. // In this case we can just call connect. PRInt32 rv = (tfoFd->lower->methods->connect)(tfoFd->lower, &secret->mAddr, PR_INTERVAL_NO_WAIT); if (rv == PR_SUCCESS) { result = PR_IS_CONNECTED_ERROR; } else { result = PR_GetError(); } if (tfoFd->lower->methods->sendto == (PRSendtoFN)tfoFd->lower->methods->reserved_fn_0) { // sendto is not implemented, it is equal to _PR_InvalidInt! // We will disable Fast Open. SOCKET_LOG(("TCPFastOpenFinish - sendto not implemented.\n")); fastOpenNotSupported = true; tfoStatus = TFO_DISABLED; } } else { // We have some data ready in the buffer we will send it with the syn // packet. PRInt32 rv = (tfoFd->lower->methods->sendto)(tfoFd->lower, secret->mFirstPacketBuf, secret->mFirstPacketBufLen, 0, //flags &secret->mAddr, PR_INTERVAL_NO_WAIT); SOCKET_LOG(("TCPFastOpenFinish - sendto result=%d.\n", rv)); if (rv > 0) { result = PR_IN_PROGRESS_ERROR; secret->mFirstPacketBufLen -= rv; if (secret->mFirstPacketBufLen) { memmove(secret->mFirstPacketBuf, secret->mFirstPacketBuf + rv, secret->mFirstPacketBufLen); } tfoStatus = TFO_DATA_SENT; } else { result = PR_GetError(); SOCKET_LOG(("TCPFastOpenFinish - sendto error=%d.\n", result)); if (result == PR_NOT_TCP_SOCKET_ERROR) { // SendTo will return PR_NOT_TCP_SOCKET_ERROR if TCP Fast Open is turned off on Linux. // We can call connect again. fastOpenNotSupported = true; rv = (tfoFd->lower->methods->connect)(tfoFd->lower, &secret->mAddr, PR_INTERVAL_NO_WAIT); if (rv == PR_SUCCESS) { result = PR_IS_CONNECTED_ERROR; } else { result = PR_GetError(); } tfoStatus = TFO_DISABLED; } else { tfoStatus = TFO_TRIED; } } } if (result == PR_IN_PROGRESS_ERROR) { secret->mState = TCPFastOpenSecret::WAITING_FOR_CONNECTCONTINUE; } else { // If the error is not PR_IN_PROGRESS_ERROR, change the state to CONNECT so // that recv/send can perform recv/send on the next lower layer and pick up // the real error. This is really important! // The result can also be PR_IS_CONNECTED_ERROR, that should change the // state to CONNECT anyway. secret->mState = TCPFastOpenSecret::CONNECTED; } err = result; } /* This function returns the size of the remaining free space in the * first_packet_buffer. This will be used by transactions with a tls layer. For * other transactions it is not necessary. The tls transactions make a tls * record before writing to this layer and if the record is too big the part * that does not have place in the mFirstPacketBuf will be saved on the tls * layer. During TFO we cannot send more than TFO_MAX_PACKET_SIZE_IPV4/6 bytes, * so if we have a big tls record, this record is encrypted with 0RTT key, * tls-early-data can be rejected and than we still need to send the rest of the * record. */ int32_t TCPFastOpenGetBufferSizeLeft(PRFileDesc *fd) { PRFileDesc *tfoFd = PR_GetIdentitiesLayer(fd, sTCPFastOpenLayerIdentity); MOZ_RELEASE_ASSERT(tfoFd); MOZ_ASSERT(OnSocketThread(), "not on socket thread"); TCPFastOpenSecret *secret = reinterpret_cast(tfoFd->secret); if (secret->mState != TCPFastOpenSecret::COLLECT_DATA_FOR_FIRST_PACKET) { return 0; } int32_t sizeLeft = (secret->mAddr.raw.family == PR_AF_INET) ? TFO_MAX_PACKET_SIZE_IPV4 : TFO_MAX_PACKET_SIZE_IPV6; MOZ_ASSERT(secret->mFirstPacketBufLen <= sizeLeft); sizeLeft -= secret->mFirstPacketBufLen; SOCKET_LOG(("TCPFastOpenGetBufferSizeLeft=%d.\n", sizeLeft)); return (sizeLeft > TFO_TLS_RECORD_HEADER_SIZE) ? sizeLeft - TFO_TLS_RECORD_HEADER_SIZE : 0; } bool TCPFastOpenGetCurrentBufferSize(PRFileDesc *fd) { PRFileDesc *tfoFd = PR_GetIdentitiesLayer(fd, sTCPFastOpenLayerIdentity); MOZ_RELEASE_ASSERT(tfoFd); MOZ_ASSERT(OnSocketThread(), "not on socket thread"); TCPFastOpenSecret *secret = reinterpret_cast(tfoFd->secret); return secret->mFirstPacketBufLen; } bool TCPFastOpenFlushBuffer(PRFileDesc *fd) { PRFileDesc *tfoFd = PR_GetIdentitiesLayer(fd, sTCPFastOpenLayerIdentity); MOZ_RELEASE_ASSERT(tfoFd); MOZ_ASSERT(OnSocketThread(), "not on socket thread"); TCPFastOpenSecret *secret = reinterpret_cast(tfoFd->secret); MOZ_ASSERT(secret->mState == TCPFastOpenSecret::CONNECTED); if (secret->mFirstPacketBufLen) { SOCKET_LOG(("TCPFastOpenFlushBuffer - %d bytes to drain from " "mFirstPacketBufLen.\n", secret->mFirstPacketBufLen )); PRInt32 rv = (tfoFd->lower->methods->send)(tfoFd->lower, secret->mFirstPacketBuf, secret->mFirstPacketBufLen, 0, // flags PR_INTERVAL_NO_WAIT); if (rv <= 0) { PRErrorCode err = PR_GetError(); if (err == PR_WOULD_BLOCK_ERROR) { // We still need to send this data. return true; } // There is an error, let nsSocketTransport pick it up properly. secret->mCondition = err; secret->mState = TCPFastOpenSecret::SOCKET_ERROR_STATE; return false; } secret->mFirstPacketBufLen -= rv; if (secret->mFirstPacketBufLen) { memmove(secret->mFirstPacketBuf, secret->mFirstPacketBuf + rv, secret->mFirstPacketBufLen); } } return secret->mFirstPacketBufLen; } } }