From a3a428a95cad416a0b2d94437bf284a70854e149 Mon Sep 17 00:00:00 2001 From: Victor Julien Date: Sun, 31 Oct 2021 22:13:19 +0100 Subject: [PATCH] af-packet: fix soft lockup issues The Suricata AF_PACKET code opens a socket per thread, then after some minor setup enters a loop where the socket is poll()'d with a timeout. When the poll() call returns a non zero positive value, the AF_PACKET ring will be processed. The ringbuffer processing logic has a pointer into the ring where we last checked the ring. From this position we will inspect each frame until we find a frame with tp_status == TP_STATUS_KERNEL (so essentially 0). This means the frame is currently owned by the kernel. There is a special case handling for starting the ring processing but finding a TP_STATUS_KERNEL immediately. This logic then skip to the next frame, rerun the check, etc until it either finds an initialized frame or the last frame of the ringbuffer. The problem was, however, that the initial uninitialized frame was possibly (likely?) still being initialized by the kernel. A data race between the notification through the socket (the poll()) and the updating of the `tp_status` field in the frame could lead to a valid frame getting skipped. Of note is that for example libpcap does not do frame scanning. Instead it simply exits it ring processing loop. Also interesting is that libpcap uses atomic loads and stores on the tp_status field. This skipping of frames had 2 bad side effects: 1. in most cases, the buffer would be full enough that the frame would be processed in the next pass of the ring, but now the frame would out of order. This might have lead to packets belong to the same flow getting processed in the wrong order. 2. more severe is the soft lockup case. The skipped frame sits at ring buffer index 0. The rest of the ring has been cleared, after the initial frame was skipped. As our pass of the ring stops at the end of the ring (ptv->frame_offset + 1 == ptv->req.v2.tp_frame_nr) the code exits the ring processing loop at goes back to poll(). However, poll() will not indicate that there is more data, as the stale frame in the ring blocks the kernel from populating more frames beyond it. This is now a dead lock, as the kernel waits for Suricata and Suricata never touches the ring until it hears from the kernel. The scan logic will scan the whole ring at most once, so it won't reconsider the stale frame either. This patch addresses the issues in several ways: 1. the startup "discard" logic was fixed to not skip over kernel frames. Doing so would get us in a bad state at start up. 2. Instead of scanning the ring, we now enter a busy wait loop when encountering a kernel frame where we didn't expect one. This means that if we got a > 0 poll() result, we'll busy wait until we get at least one frame. 3. Error handling is unified and cleaned up. Any frame error now returns the frame to the kernel and progresses the frame pointer. 4. If we find a frame that is owned by us (TP_STATUS_USER_BUSY) we yield to poll() immediately, as the next expected status of that frame is TP_STATUS_KERNEL. 5. the ring is no longer processed until the "end" of the ring (so highest index), but instead we process at most one full ring size per run. 6. Work with a copy of `tp_status` instead of accessing original touched also by the kernel. Bug: #4785. (cherry picked from commit a022648b9e6910ca05a0564c2ead9344a4c34312) --- src/source-af-packet.c | 331 +++++++++++++++++++++-------------------- 1 file changed, 172 insertions(+), 159 deletions(-) diff --git a/src/source-af-packet.c b/src/source-af-packet.c index aeaad81c5f1e..5e84a1b73064 100644 --- a/src/source-af-packet.c +++ b/src/source-af-packet.c @@ -873,6 +873,143 @@ static void AFPReleasePacket(Packet *p) PacketFreeOrRelease(p); } +/** \internal + * \brief recoverable error - release packet and + * return AFP_SURI_FAILURE + */ +static inline int AFPSuriFailure(AFPThreadVars *ptv, union thdr h) +{ + h.h2->tp_status = TP_STATUS_KERNEL; + if (++ptv->frame_offset >= ptv->req.v2.tp_frame_nr) { + ptv->frame_offset = 0; + } + SCReturnInt(AFP_SURI_FAILURE); +} + +static inline void AFPReadApplyBypass(const AFPThreadVars *ptv, Packet *p) +{ + if (ptv->flags & AFP_BYPASS) { + p->BypassPacketsFlow = AFPBypassCallback; +#ifdef HAVE_PACKET_EBPF + p->afp_v.v4_map_fd = ptv->v4_map_fd; + p->afp_v.v6_map_fd = ptv->v6_map_fd; + p->afp_v.nr_cpus = ptv->ebpf_t_config.cpus_count; +#endif + } + if (ptv->flags & AFP_XDPBYPASS) { + p->BypassPacketsFlow = AFPXDPBypassCallback; +#ifdef HAVE_PACKET_EBPF + p->afp_v.v4_map_fd = ptv->v4_map_fd; + p->afp_v.v6_map_fd = ptv->v6_map_fd; + p->afp_v.nr_cpus = ptv->ebpf_t_config.cpus_count; +#endif + } +} + +/** \internal + * \brief setup packet for AFPReadFromRing + */ +static bool AFPReadFromRingSetupPacket( + AFPThreadVars *ptv, union thdr h, const unsigned int tp_status, Packet *p) +{ + PKT_SET_SRC(p, PKT_SRC_WIRE); + + /* Suricata will treat packet so telling it is busy, this + * status will be reset to 0 (ie TP_STATUS_KERNEL) in the release + * function. */ + h.h2->tp_status |= TP_STATUS_USER_BUSY; + p->livedev = ptv->livedev; + p->datalink = ptv->datalink; + ptv->pkts++; + + AFPReadApplyBypass(ptv, p); + + if (h.h2->tp_len > h.h2->tp_snaplen) { + SCLogDebug("Packet length (%d) > snaplen (%d), truncating", h.h2->tp_len, h.h2->tp_snaplen); + } + + /* get vlan id from header */ + if ((ptv->flags & AFP_VLAN_IN_HEADER) && + (tp_status & TP_STATUS_VLAN_VALID || h.h2->tp_vlan_tci)) { + p->vlan_id[0] = h.h2->tp_vlan_tci & 0x0fff; + p->vlan_idx = 1; + } + + if (ptv->flags & AFP_ZERO_COPY) { + if (PacketSetData(p, (unsigned char *)h.raw + h.h2->tp_mac, h.h2->tp_snaplen) == -1) { + return false; + } + + p->afp_v.relptr = h.raw; + p->ReleasePacket = AFPReleasePacket; + p->afp_v.mpeer = ptv->mpeer; + AFPRefSocket(ptv->mpeer); + + p->afp_v.copy_mode = ptv->copy_mode; + if (p->afp_v.copy_mode != AFP_COPY_MODE_NONE) { + p->afp_v.peer = ptv->mpeer->peer; + } else { + p->afp_v.peer = NULL; + } + } else { + if (PacketCopyData(p, (unsigned char *)h.raw + h.h2->tp_mac, h.h2->tp_snaplen) == -1) { + return false; + } + } + /* Timestamp */ + p->ts.tv_sec = h.h2->tp_sec; + p->ts.tv_usec = h.h2->tp_nsec / 1000; + SCLogDebug("pktlen: %" PRIu32 " (pkt %p, pkt data %p)", GET_PKT_LEN(p), p, GET_PKT_DATA(p)); + + /* We only check for checksum disable */ + if (ptv->checksum_mode == CHECKSUM_VALIDATION_DISABLE) { + p->flags |= PKT_IGNORE_CHECKSUM; + } else if (ptv->checksum_mode == CHECKSUM_VALIDATION_AUTO) { + if (ChecksumAutoModeCheck(ptv->pkts, SC_ATOMIC_GET(ptv->livedev->pkts), + SC_ATOMIC_GET(ptv->livedev->invalid_checksums))) { + ptv->checksum_mode = CHECKSUM_VALIDATION_DISABLE; + p->flags |= PKT_IGNORE_CHECKSUM; + } + } else { + if (tp_status & TP_STATUS_CSUMNOTREADY) { + p->flags |= PKT_IGNORE_CHECKSUM; + } + } + return true; +} + +static inline int AFPReadFromRingWaitForPacket(AFPThreadVars *ptv) +{ + union thdr h; + struct timeval start_time; + gettimeofday(&start_time, NULL); + + /* busy wait loop until we have packets available */ + while (1) { + if (unlikely(suricata_ctl_flags != 0)) { + break; + } + h.raw = (((union thdr **)ptv->ring.v2)[ptv->frame_offset]); + if (unlikely(h.raw == NULL)) { + return AFP_READ_FAILURE; + } + const unsigned int tp_status = h.h2->tp_status; + if (tp_status == TP_STATUS_KERNEL) { + struct timeval cur_time; + memset(&cur_time, 0, sizeof(cur_time)); + uint64_t milliseconds = + ((cur_time.tv_sec - start_time.tv_sec) * 1000) + + (((1000000 + cur_time.tv_usec - start_time.tv_usec) / 1000) - 1000); + if (milliseconds > 1000) { + break; + } + continue; + } + break; + } + return AFP_READ_OK; +} + /** * \brief AF packet read function for ring * @@ -884,181 +1021,67 @@ static void AFPReleasePacket(Packet *p) */ static int AFPReadFromRing(AFPThreadVars *ptv) { - Packet *p = NULL; union thdr h; - uint8_t emergency_flush = 0; - int read_pkts = 0; - int loop_start = -1; + bool emergency_flush = false; + const unsigned int start_pos = ptv->frame_offset; + /* poll() told us there are frames, so lets wait for at least + * one frame to become available. */ + if (AFPReadFromRingWaitForPacket(ptv) != AFP_READ_OK) + return AFP_READ_FAILURE; - /* Loop till we have packets available */ + /* process the frames in the ring */ while (1) { if (unlikely(suricata_ctl_flags != 0)) { break; } - - /* Read packet from ring */ h.raw = (((union thdr **)ptv->ring.v2)[ptv->frame_offset]); if (unlikely(h.raw == NULL)) { - /* Impossible we reach this point in normal condition, so trigger - * a failure in reading */ - SCReturnInt(AFP_READ_FAILURE); + return AFP_READ_FAILURE; } - - if ((! h.h2->tp_status) || (h.h2->tp_status & TP_STATUS_USER_BUSY)) { - if (read_pkts == 0) { - if (loop_start == -1) { - loop_start = ptv->frame_offset; - } else if (unlikely(loop_start == (int)ptv->frame_offset)) { - SCReturnInt(AFP_READ_OK); - } - if (++ptv->frame_offset >= ptv->req.v2.tp_frame_nr) { - ptv->frame_offset = 0; - } - continue; - } - if ((emergency_flush) && (ptv->flags & AFP_EMERGENCY_MODE)) { - SCReturnInt(AFP_KERNEL_DROP); - } else { - SCReturnInt(AFP_READ_OK); - } + const unsigned int tp_status = h.h2->tp_status; + /* if we find a kernel frame we are done */ + if (unlikely(tp_status == TP_STATUS_KERNEL)) { + break; } - - read_pkts++; - loop_start = -1; - - /* Our packet is still used by suricata, we exit read loop to - * gain some time */ - if (h.h2->tp_status & TP_STATUS_USER_BUSY) { - SCReturnInt(AFP_READ_OK); + /* if in autofp mode the frame is still busy, return to poll */ + if (unlikely(tp_status & TP_STATUS_USER_BUSY)) { + break; } + emergency_flush |= ((tp_status & TP_STATUS_LOSING) != 0); - if ((ptv->flags & AFP_EMERGENCY_MODE) && (emergency_flush == 1)) { + if ((ptv->flags & AFP_EMERGENCY_MODE) && emergency_flush) { h.h2->tp_status = TP_STATUS_KERNEL; goto next_frame; } - p = PacketGetFromQueueOrAlloc(); + Packet *p = PacketGetFromQueueOrAlloc(); if (p == NULL) { - SCReturnInt(AFP_SURI_FAILURE); - } - PKT_SET_SRC(p, PKT_SRC_WIRE); - if (ptv->flags & AFP_BYPASS) { - p->BypassPacketsFlow = AFPBypassCallback; -#ifdef HAVE_PACKET_EBPF - p->afp_v.v4_map_fd = ptv->v4_map_fd; - p->afp_v.v6_map_fd = ptv->v6_map_fd; - p->afp_v.nr_cpus = ptv->ebpf_t_config.cpus_count; -#endif - } - if (ptv->flags & AFP_XDPBYPASS) { - p->BypassPacketsFlow = AFPXDPBypassCallback; -#ifdef HAVE_PACKET_EBPF - p->afp_v.v4_map_fd = ptv->v4_map_fd; - p->afp_v.v6_map_fd = ptv->v6_map_fd; - p->afp_v.nr_cpus = ptv->ebpf_t_config.cpus_count; -#endif - } - - /* Suricata will treat packet so telling it is busy, this - * status will be reset to 0 (ie TP_STATUS_KERNEL) in the release - * function. */ - h.h2->tp_status |= TP_STATUS_USER_BUSY; - - ptv->pkts++; - p->livedev = ptv->livedev; - p->datalink = ptv->datalink; - - if (h.h2->tp_len > h.h2->tp_snaplen) { - SCLogDebug("Packet length (%d) > snaplen (%d), truncating", - h.h2->tp_len, h.h2->tp_snaplen); - } - - /* get vlan id from header */ - if ((ptv->flags & AFP_VLAN_IN_HEADER) && - (h.h2->tp_status & TP_STATUS_VLAN_VALID || h.h2->tp_vlan_tci)) { - p->vlan_id[0] = h.h2->tp_vlan_tci & 0x0fff; - p->vlan_idx = 1; - } - - if (ptv->flags & AFP_ZERO_COPY) { - if (PacketSetData(p, (unsigned char*)h.raw + h.h2->tp_mac, h.h2->tp_snaplen) == -1) { - TmqhOutputPacketpool(ptv->tv, p); - SCReturnInt(AFP_SURI_FAILURE); - } else { - p->afp_v.relptr = h.raw; - p->ReleasePacket = AFPReleasePacket; - p->afp_v.mpeer = ptv->mpeer; - AFPRefSocket(ptv->mpeer); - - p->afp_v.copy_mode = ptv->copy_mode; - if (p->afp_v.copy_mode != AFP_COPY_MODE_NONE) { - p->afp_v.peer = ptv->mpeer->peer; - } else { - p->afp_v.peer = NULL; - } - } - } else { - if (PacketCopyData(p, (unsigned char*)h.raw + h.h2->tp_mac, h.h2->tp_snaplen) == -1) { - /* As we can possibly fail to copy the data due to invalid data, let's - * skip this packet and switch to the next one. - */ - h.h2->tp_status = TP_STATUS_KERNEL; - if (++ptv->frame_offset >= ptv->req.v2.tp_frame_nr) { - ptv->frame_offset = 0; - } - TmqhOutputPacketpool(ptv->tv, p); - SCReturnInt(AFP_SURI_FAILURE); - } - } - - /* Timestamp */ - p->ts.tv_sec = h.h2->tp_sec; - p->ts.tv_usec = h.h2->tp_nsec/1000; - SCLogDebug("pktlen: %" PRIu32 " (pkt %p, pkt data %p)", - GET_PKT_LEN(p), p, GET_PKT_DATA(p)); - - /* We only check for checksum disable */ - if (ptv->checksum_mode == CHECKSUM_VALIDATION_DISABLE) { - p->flags |= PKT_IGNORE_CHECKSUM; - } else if (ptv->checksum_mode == CHECKSUM_VALIDATION_AUTO) { - if (ChecksumAutoModeCheck(ptv->pkts, - SC_ATOMIC_GET(ptv->livedev->pkts), - SC_ATOMIC_GET(ptv->livedev->invalid_checksums))) { - ptv->checksum_mode = CHECKSUM_VALIDATION_DISABLE; - p->flags |= PKT_IGNORE_CHECKSUM; - } - } else { - if (h.h2->tp_status & TP_STATUS_CSUMNOTREADY) { - p->flags |= PKT_IGNORE_CHECKSUM; - } + return AFPSuriFailure(ptv, h); } - if (h.h2->tp_status & TP_STATUS_LOSING) { - emergency_flush = 1; - AFPDumpCounters(ptv); + if (AFPReadFromRingSetupPacket(ptv, h, tp_status, p) == false) { + TmqhOutputPacketpool(ptv->tv, p); + return AFPSuriFailure(ptv, h); } - /* release frame if not in zero copy mode */ - if (!(ptv->flags & AFP_ZERO_COPY)) { + if (!(ptv->flags & AFP_ZERO_COPY)) { h.h2->tp_status = TP_STATUS_KERNEL; } if (TmThreadsSlotProcessPkt(ptv->tv, ptv->slot, p) != TM_ECODE_OK) { - h.h2->tp_status = TP_STATUS_KERNEL; - if (++ptv->frame_offset >= ptv->req.v2.tp_frame_nr) { - ptv->frame_offset = 0; - } - SCReturnInt(AFP_SURI_FAILURE); + return AFPSuriFailure(ptv, h); } - next_frame: if (++ptv->frame_offset >= ptv->req.v2.tp_frame_nr) { ptv->frame_offset = 0; /* Get out of loop to be sure we will reach maintenance tasks */ - SCReturnInt(AFP_READ_OK); + if (ptv->frame_offset == start_pos) + break; } } - + if (emergency_flush) { + AFPDumpCounters(ptv); + } SCReturnInt(AFP_READ_OK); } @@ -1075,21 +1098,8 @@ static inline int AFPParsePacketV3(AFPThreadVars *ptv, struct tpacket_block_desc SCReturnInt(AFP_SURI_FAILURE); } PKT_SET_SRC(p, PKT_SRC_WIRE); - if (ptv->flags & AFP_BYPASS) { - p->BypassPacketsFlow = AFPBypassCallback; -#ifdef HAVE_PACKET_EBPF - p->afp_v.v4_map_fd = ptv->v4_map_fd; - p->afp_v.v6_map_fd = ptv->v6_map_fd; - p->afp_v.nr_cpus = ptv->ebpf_t_config.cpus_count; -#endif - } else if (ptv->flags & AFP_XDPBYPASS) { - p->BypassPacketsFlow = AFPXDPBypassCallback; -#ifdef HAVE_PACKET_EBPF - p->afp_v.v4_map_fd = ptv->v4_map_fd; - p->afp_v.v6_map_fd = ptv->v6_map_fd; - p->afp_v.nr_cpus = ptv->ebpf_t_config.cpus_count; -#endif - } + + AFPReadApplyBypass(ptv, p); ptv->pkts++; p->livedev = ptv->livedev; @@ -1379,13 +1389,16 @@ static int AFPReadAndDiscardFromRing(AFPThreadVars *ptv, struct timeval *synctv, if (h.raw == NULL) { return -1; } - (*discarded_pkts)++; + if (h.h2->tp_status == TP_STATUS_KERNEL) + return 0; + if (((time_t)h.h2->tp_sec > synctv->tv_sec) || ((time_t)h.h2->tp_sec == synctv->tv_sec && (suseconds_t) (h.h2->tp_nsec / 1000) > synctv->tv_usec)) { return 1; } + (*discarded_pkts)++; h.h2->tp_status = TP_STATUS_KERNEL; if (++ptv->frame_offset >= ptv->req.v2.tp_frame_nr) { ptv->frame_offset = 0;