Kernel: Avoid allocations when sending IP packets

Previously we'd allocate buffers when sending packets. This patch
avoids these allocations by using the NetworkAdapter's packet queue.

At the same time this also avoids copying partially constructed
packets in order to prepend Ethernet and/or IPv4 headers. It also
properly truncates UDP and raw IP packets.
This commit is contained in:
Gunnar Beutner 2021-05-26 05:35:05 +02:00 committed by Andreas Kling
parent f8310b7796
commit b436dd138b
7 changed files with 94 additions and 117 deletions

View file

@ -218,10 +218,19 @@ KResultOr<size_t> IPv4Socket::sendto(FileDescription&, const UserOrKernelBuffer&
dbgln_if(IPV4_SOCKET_DEBUG, "sendto: destination={}:{}", m_peer_address, m_peer_port);
if (type() == SOCK_RAW) {
auto result = routing_decision.adapter->send_ipv4(local_address(), routing_decision.next_hop,
m_peer_address, (IPv4Protocol)protocol(), data, data_length, m_ttl);
if (result.is_error())
return result;
auto ipv4_payload_offset = routing_decision.adapter->ipv4_payload_offset();
data_length = min(data_length, routing_decision.adapter->mtu() - ipv4_payload_offset);
auto packet = routing_decision.adapter->acquire_packet_buffer(ipv4_payload_offset + data_length);
if (!packet)
return ENOMEM;
routing_decision.adapter->fill_in_ipv4_header(*packet, local_address(), routing_decision.next_hop,
m_peer_address, (IPv4Protocol)protocol(), data_length, m_ttl);
if (!data.read(packet->buffer.data() + ipv4_payload_offset, data_length)) {
routing_decision.adapter->release_packet_buffer(*packet);
return EFAULT;
}
routing_decision.adapter->send_raw({ packet->buffer.data(), packet->buffer.size() });
routing_decision.adapter->release_packet_buffer(*packet);
return data_length;
}

View file

@ -10,7 +10,6 @@
#include <Kernel/Heap/kmalloc.h>
#include <Kernel/Lock.h>
#include <Kernel/Net/EtherType.h>
#include <Kernel/Net/EthernetFrameHeader.h>
#include <Kernel/Net/LoopbackAdapter.h>
#include <Kernel/Net/NetworkAdapter.h>
#include <Kernel/Process.h>
@ -76,15 +75,15 @@ void NetworkAdapter::send(const MACAddress& destination, const ARPPacket& packet
send_raw({ (const u8*)eth, size_in_bytes });
}
KResult NetworkAdapter::send_ipv4(const IPv4Address& source_ipv4, const MACAddress& destination_mac, const IPv4Address& destination_ipv4, IPv4Protocol protocol, const UserOrKernelBuffer& payload, size_t payload_size, u8 ttl)
void NetworkAdapter::fill_in_ipv4_header(PacketWithTimestamp& packet, IPv4Address const& source_ipv4, MACAddress const& destination_mac, IPv4Address const& destination_ipv4, IPv4Protocol protocol, size_t payload_size, u8 ttl)
{
size_t ipv4_packet_size = sizeof(IPv4Packet) + payload_size;
if (ipv4_packet_size > mtu())
return send_ipv4_fragmented(source_ipv4, destination_mac, destination_ipv4, protocol, payload, payload_size, ttl);
VERIFY(ipv4_packet_size <= mtu());
size_t ethernet_frame_size = sizeof(EthernetFrameHeader) + sizeof(IPv4Packet) + payload_size;
auto buffer = NetworkByteBuffer::create_zeroed(ethernet_frame_size);
auto& eth = *(EthernetFrameHeader*)buffer.data();
size_t ethernet_frame_size = ipv4_payload_offset() + payload_size;
VERIFY(packet.buffer.size() == ethernet_frame_size);
memset(packet.buffer.data(), 0, ipv4_payload_offset());
auto& eth = *(EthernetFrameHeader*)packet.buffer.data();
eth.set_source(mac_address());
eth.set_destination(destination_mac);
eth.set_ether_type(EtherType::IPv4);
@ -98,53 +97,6 @@ KResult NetworkAdapter::send_ipv4(const IPv4Address& source_ipv4, const MACAddre
ipv4.set_ident(1);
ipv4.set_ttl(ttl);
ipv4.set_checksum(ipv4.compute_checksum());
m_packets_out++;
m_bytes_out += ethernet_frame_size;
if (!payload.read(ipv4.payload(), payload_size))
return EFAULT;
send_raw({ (const u8*)&eth, ethernet_frame_size });
return KSuccess;
}
KResult NetworkAdapter::send_ipv4_fragmented(const IPv4Address& source_ipv4, const MACAddress& destination_mac, const IPv4Address& destination_ipv4, IPv4Protocol protocol, const UserOrKernelBuffer& payload, size_t payload_size, u8 ttl)
{
// packets must be split on the 64-bit boundary
auto packet_boundary_size = (mtu() - sizeof(IPv4Packet) - sizeof(EthernetFrameHeader)) & 0xfffffff8;
auto fragment_block_count = (payload_size + packet_boundary_size) / packet_boundary_size;
auto last_block_size = payload_size - packet_boundary_size * (fragment_block_count - 1);
auto number_of_blocks_in_fragment = packet_boundary_size / 8;
auto identification = get_good_random<u16>();
size_t ethernet_frame_size = mtu();
for (size_t packet_index = 0; packet_index < fragment_block_count; ++packet_index) {
auto is_last_block = packet_index + 1 == fragment_block_count;
auto packet_payload_size = is_last_block ? last_block_size : packet_boundary_size;
auto buffer = NetworkByteBuffer::create_zeroed(ethernet_frame_size);
auto& eth = *(EthernetFrameHeader*)buffer.data();
eth.set_source(mac_address());
eth.set_destination(destination_mac);
eth.set_ether_type(EtherType::IPv4);
auto& ipv4 = *(IPv4Packet*)eth.payload();
ipv4.set_version(4);
ipv4.set_internet_header_length(5);
ipv4.set_source(source_ipv4);
ipv4.set_destination(destination_ipv4);
ipv4.set_protocol((u8)protocol);
ipv4.set_length(sizeof(IPv4Packet) + packet_payload_size);
ipv4.set_has_more_fragments(!is_last_block);
ipv4.set_ident(identification);
ipv4.set_ttl(ttl);
ipv4.set_fragment_offset(packet_index * number_of_blocks_in_fragment);
ipv4.set_checksum(ipv4.compute_checksum());
m_packets_out++;
m_bytes_out += ethernet_frame_size;
if (!payload.read(ipv4.payload(), packet_index * packet_boundary_size, packet_payload_size))
return EFAULT;
send_raw({ (const u8*)&eth, ethernet_frame_size });
}
return KSuccess;
}
void NetworkAdapter::did_receive(ReadonlyBytes payload)
@ -195,6 +147,8 @@ RefPtr<PacketWithTimestamp> NetworkAdapter::acquire_packet_buffer(size_t size)
if (m_unused_packets.is_empty()) {
auto buffer = KBuffer::create_with_size(size, Region::Access::Read | Region::Access::Write, "Packet Buffer", AllocationStrategy::AllocateNow);
auto packet = adopt_ref_if_nonnull(new PacketWithTimestamp { move(buffer), kgettimeofday() });
if (!packet)
return nullptr;
packet->buffer.set_size(size);
return packet;
}
@ -208,6 +162,8 @@ RefPtr<PacketWithTimestamp> NetworkAdapter::acquire_packet_buffer(size_t size)
auto buffer = KBuffer::create_with_size(size, Region::Access::Read | Region::Access::Write, "Packet Buffer", AllocationStrategy::AllocateNow);
packet = adopt_ref_if_nonnull(new PacketWithTimestamp { move(buffer), kgettimeofday() });
if (!packet)
return nullptr;
packet->buffer.set_size(size);
return packet;
}

View file

@ -15,6 +15,7 @@
#include <AK/Weakable.h>
#include <Kernel/KBuffer.h>
#include <Kernel/Net/ARP.h>
#include <Kernel/Net/EthernetFrameHeader.h>
#include <Kernel/Net/ICMP.h>
#include <Kernel/Net/IPv4.h>
#include <Kernel/PCI/Definitions.h>
@ -38,7 +39,8 @@ struct PacketWithTimestamp : public RefCounted<PacketWithTimestamp> {
IntrusiveListNode<PacketWithTimestamp, RefPtr<PacketWithTimestamp>> packet_node;
};
class NetworkAdapter : public RefCounted<NetworkAdapter> {
class NetworkAdapter : public RefCounted<NetworkAdapter>
, public Weakable<NetworkAdapter> {
public:
template<typename Callback>
static inline void for_each(Callback callback)
@ -67,8 +69,7 @@ public:
void set_ipv4_gateway(const IPv4Address&);
void send(const MACAddress&, const ARPPacket&);
KResult send_ipv4(const IPv4Address& source_ipv4, const MACAddress&, const IPv4Address&, IPv4Protocol, const UserOrKernelBuffer& payload, size_t payload_size, u8 ttl);
KResult send_ipv4_fragmented(const IPv4Address& source_ipv4, const MACAddress&, const IPv4Address&, IPv4Protocol, const UserOrKernelBuffer& payload, size_t payload_size, u8 ttl);
void fill_in_ipv4_header(PacketWithTimestamp&, IPv4Address const&, MACAddress const&, IPv4Address const&, IPv4Protocol, size_t, u8);
size_t dequeue_packet(u8* buffer, size_t buffer_size, Time& packet_timestamp);
@ -85,13 +86,17 @@ public:
RefPtr<PacketWithTimestamp> acquire_packet_buffer(size_t);
void release_packet_buffer(PacketWithTimestamp&);
constexpr size_t layer3_payload_offset() const { return sizeof(EthernetFrameHeader); }
constexpr size_t ipv4_payload_offset() const { return layer3_payload_offset() + sizeof(IPv4Packet); }
virtual void send_raw(ReadonlyBytes) = 0;
Function<void()> on_receive;
protected:
NetworkAdapter();
void set_interface_name(const PCI::Address&);
void set_mac_address(const MACAddress& mac_address) { m_mac_address = mac_address; }
virtual void send_raw(ReadonlyBytes) = 0;
void did_receive(ReadonlyBytes);
void set_loopback_name();

View file

@ -246,8 +246,14 @@ void handle_icmp(const EthernetFrameHeader& eth, const IPv4Packet& ipv4_packet,
dbgln("handle_icmp: EchoRequest packet is too small, ignoring.");
return;
}
auto buffer = ByteBuffer::create_zeroed(icmp_packet_size);
auto& response = *(ICMPEchoPacket*)buffer.data();
auto ipv4_payload_offset = adapter->ipv4_payload_offset();
auto packet = adapter->acquire_packet_buffer(ipv4_payload_offset + icmp_packet_size);
if (!packet) {
dbgln("Could not allocate packet buffer while sending ICMP packet");
return;
}
adapter->fill_in_ipv4_header(*packet, adapter->ipv4_address(), eth.source(), ipv4_packet.source(), IPv4Protocol::ICMP, icmp_packet_size, 64);
auto& response = *(ICMPEchoPacket*)(packet->buffer.data() + ipv4_payload_offset);
response.header.set_type(ICMPType::EchoReply);
response.header.set_code(0);
response.identifier = request.identifier;
@ -256,9 +262,8 @@ void handle_icmp(const EthernetFrameHeader& eth, const IPv4Packet& ipv4_packet,
memcpy(response.payload(), request.payload(), icmp_payload_size);
response.header.set_checksum(internet_checksum(&response, icmp_packet_size));
// FIXME: What is the right TTL value here? Is 64 ok? Should we use the same TTL as the echo request?
auto response_buffer = UserOrKernelBuffer::for_kernel_buffer((u8*)&response);
[[maybe_unused]] auto result = adapter->send_ipv4(adapter->ipv4_address(), eth.source(),
ipv4_packet.source(), IPv4Protocol::ICMP, response_buffer, buffer.size(), 64);
adapter->send_raw({ packet->buffer.data(), packet->buffer.size() });
adapter->release_packet_buffer(*packet);
}
}

View file

@ -188,18 +188,30 @@ KResult TCPSocket::send_ack(bool allow_duplicate)
KResult TCPSocket::send_tcp_packet(u16 flags, const UserOrKernelBuffer* payload, size_t payload_size, RoutingDecision* user_routing_decision)
{
RoutingDecision routing_decision = user_routing_decision ? *user_routing_decision : route_to(peer_address(), local_address(), bound_interface());
if (routing_decision.is_zero())
return EHOSTUNREACH;
auto ipv4_payload_offset = routing_decision.adapter->ipv4_payload_offset();
const bool has_mss_option = flags == TCPFlags::SYN;
const size_t options_size = has_mss_option ? sizeof(TCPOptionMSS) : 0;
const size_t header_size = sizeof(TCPPacket) + options_size;
const size_t buffer_size = header_size + payload_size;
auto buffer = NetworkByteBuffer::create_zeroed(buffer_size);
auto& tcp_packet = *(TCPPacket*)(buffer.data());
const size_t tcp_header_size = sizeof(TCPPacket) + options_size;
const size_t buffer_size = ipv4_payload_offset + tcp_header_size + payload_size;
auto packet = routing_decision.adapter->acquire_packet_buffer(buffer_size);
if (!packet)
return ENOMEM;
routing_decision.adapter->fill_in_ipv4_header(*packet, local_address(),
routing_decision.next_hop, peer_address(), IPv4Protocol::TCP,
buffer_size - ipv4_payload_offset, ttl());
memset(packet->buffer.data() + ipv4_payload_offset, 0, sizeof(TCPPacket));
auto& tcp_packet = *(TCPPacket*)(packet->buffer.data() + ipv4_payload_offset);
VERIFY(local_port());
tcp_packet.set_source_port(local_port());
tcp_packet.set_destination_port(peer_port());
tcp_packet.set_window_size(NumericLimits<u16>::max());
tcp_packet.set_sequence_number(m_sequence_number);
tcp_packet.set_data_offset(header_size / sizeof(u32));
tcp_packet.set_data_offset(tcp_header_size / sizeof(u32));
tcp_packet.set_flags(flags);
if (flags & TCPFlags::ACK) {
@ -217,31 +229,22 @@ KResult TCPSocket::send_tcp_packet(u16 flags, const UserOrKernelBuffer* payload,
m_sequence_number += payload_size;
}
RoutingDecision routing_decision = user_routing_decision ? *user_routing_decision : route_to(peer_address(), local_address(), bound_interface());
if (routing_decision.is_zero())
return EHOSTUNREACH;
if (has_mss_option) {
u16 mss = routing_decision.adapter->mtu() - sizeof(IPv4Packet) - sizeof(TCPPacket);
TCPOptionMSS mss_option { mss };
VERIFY(buffer.size() >= sizeof(TCPPacket) + sizeof(mss_option));
memcpy(buffer.data() + sizeof(TCPPacket), &mss_option, sizeof(mss_option));
VERIFY(packet->buffer.size() >= ipv4_payload_offset + sizeof(TCPPacket) + sizeof(mss_option));
memcpy(packet->buffer.data() + ipv4_payload_offset + sizeof(TCPPacket), &mss_option, sizeof(mss_option));
}
tcp_packet.set_checksum(compute_tcp_checksum(local_address(), peer_address(), tcp_packet, payload_size));
auto packet_buffer = UserOrKernelBuffer::for_kernel_buffer(buffer.data());
auto result = routing_decision.adapter->send_ipv4(
local_address(), routing_decision.next_hop, peer_address(), IPv4Protocol::TCP,
packet_buffer, buffer_size, ttl());
if (result.is_error())
return result;
routing_decision.adapter->send_raw({ packet->buffer.data(), packet->buffer.size() });
m_packets_out++;
m_bytes_out += buffer_size;
if (tcp_packet.has_syn() || payload_size > 0) {
Locker locker(m_not_acked_lock);
m_not_acked.append({ m_sequence_number, move(buffer) });
m_not_acked.append({ m_sequence_number, move(packet), ipv4_payload_offset, *routing_decision.adapter });
enqueue_for_retransmit();
}
@ -263,6 +266,9 @@ void TCPSocket::receive_tcp_packet(const TCPPacket& packet, u16 size)
dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket: iterate: {}", packet.ack_number);
if (packet.ack_number <= ack_number) {
auto old_adapter = packet.adapter.strong_ref();
if (old_adapter)
old_adapter->release_packet_buffer(*packet.buffer);
m_not_acked.take_first();
removed++;
} else {
@ -531,7 +537,7 @@ void TCPSocket::retransmit_packets()
packet.tx_counter++;
if constexpr (TCP_SOCKET_DEBUG) {
auto& tcp_packet = *(const TCPPacket*)(packet.buffer.data());
auto& tcp_packet = *(const TCPPacket*)(packet.buffer->buffer.data() + packet.ipv4_payload_offset);
dbgln("Sending TCP packet from {}:{} to {}:{} with ({}{}{}{}) seq_no={}, ack_no={}, tx_counter={}",
local_address(), local_port(),
peer_address(), peer_port(),
@ -544,29 +550,19 @@ void TCPSocket::retransmit_packets()
packet.tx_counter);
}
auto packet_buffer = UserOrKernelBuffer::for_kernel_buffer(packet.buffer.data());
int err = routing_decision.adapter->send_ipv4(
local_address(), routing_decision.next_hop, peer_address(),
IPv4Protocol::TCP, packet_buffer, packet.buffer.size(), ttl());
if (err < 0) {
auto& tcp_packet = *(const TCPPacket*)(packet.buffer.data());
dmesgln("Error ({}) sending TCP packet from {}:{} to {}:{} with ({}{}{}{}) seq_no={}, ack_no={}, tx_counter={}",
err,
local_address(),
local_port(),
peer_address(),
peer_port(),
(tcp_packet.has_syn() ? "SYN " : ""),
(tcp_packet.has_ack() ? "ACK " : ""),
(tcp_packet.has_fin() ? "FIN " : ""),
(tcp_packet.has_rst() ? "RST " : ""),
tcp_packet.sequence_number(),
tcp_packet.ack_number(),
packet.tx_counter);
} else {
m_packets_out++;
m_bytes_out += packet.buffer.size();
size_t ipv4_payload_offset = routing_decision.adapter->ipv4_payload_offset();
if (ipv4_payload_offset != packet.ipv4_payload_offset) {
// FIXME: Add support for this. This can happen if after a route change
// we ended up on another adapter which doesn't have the same layer 2 type
// like the previous adapter.
VERIFY_NOT_REACHED();
}
routing_decision.adapter->fill_in_ipv4_header(*packet.buffer,
local_address(), routing_decision.next_hop, peer_address(),
IPv4Protocol::TCP, packet.buffer->buffer.size() - ipv4_payload_offset, ttl());
routing_decision.adapter->send_raw({ packet.buffer->buffer.data(), packet.buffer->buffer.size() });
m_packets_out++;
m_bytes_out += packet.buffer->buffer.size();
}
}

View file

@ -194,7 +194,9 @@ private:
struct OutgoingPacket {
u32 ack_number { 0 };
NetworkByteBuffer buffer;
RefPtr<PacketWithTimestamp> buffer;
size_t ipv4_payload_offset;
WeakPtr<NetworkAdapter> adapter;
int tx_counter { 0 };
};

View file

@ -78,19 +78,23 @@ KResultOr<size_t> UDPSocket::protocol_send(const UserOrKernelBuffer& data, size_
auto routing_decision = route_to(peer_address(), local_address(), bound_interface());
if (routing_decision.is_zero())
return EHOSTUNREACH;
const size_t buffer_size = sizeof(UDPPacket) + data_length;
auto buffer = ByteBuffer::create_zeroed(buffer_size);
auto& udp_packet = *reinterpret_cast<UDPPacket*>(buffer.data());
auto ipv4_payload_offset = routing_decision.adapter->ipv4_payload_offset();
data_length = min(data_length, routing_decision.adapter->mtu() - ipv4_payload_offset - sizeof(UDPPacket));
const size_t udp_buffer_size = sizeof(UDPPacket) + data_length;
auto packet = routing_decision.adapter->acquire_packet_buffer(ipv4_payload_offset + udp_buffer_size);
if (!packet)
return ENOMEM;
memset(packet->buffer.data() + ipv4_payload_offset, 0, sizeof(UDPPacket));
auto& udp_packet = *reinterpret_cast<UDPPacket*>(packet->buffer.data() + ipv4_payload_offset);
udp_packet.set_source_port(local_port());
udp_packet.set_destination_port(peer_port());
udp_packet.set_length(buffer_size);
udp_packet.set_length(udp_buffer_size);
if (!data.read(udp_packet.payload(), data_length))
return EFAULT;
auto result = routing_decision.adapter->send_ipv4(local_address(), routing_decision.next_hop,
peer_address(), IPv4Protocol::UDP, UserOrKernelBuffer::for_kernel_buffer(buffer.data()), buffer_size, ttl());
if (result.is_error())
return result;
routing_decision.adapter->fill_in_ipv4_header(*packet, local_address(), routing_decision.next_hop,
peer_address(), IPv4Protocol::UDP, udp_buffer_size, ttl());
routing_decision.adapter->send_raw({ packet->buffer.data(), packet->buffer.size() });
return data_length;
}