• R/O
  • HTTP
  • SSH
  • HTTPS

linux-2.4.36: Commit

2.4.36-stable kernel tree


Commit MetaInfo

Revision: 0cf06e61768d8a77aa28cc1444e111496a6553e4 (tree)
Zeit: 2008-09-22 13:32:20
Autor: Gilles Espinasse <g.esp@free...>
Committer: Willy Tarreau

Log Message

tcp: Clear probes_out more aggressively in tcp_ack().

backport of 2.6 commit 4b53fb67e385b856a991d402096379dab462170a

Test conditions : 2.4.36 kernel using this iptables configuration
iptables -N SLOWLO
iptables -A SLOWLO -m limit --limit 2/sec --limit-burst 1 -j ACCEPT
iptables -A SLOWLO -j DROP
iptables -A OUTPUT -o lo -p tcp --dport 12000 -j SLOWLO

borrowed ss from iproute2-2.4.7-now-ss020116-try.tar.gz,
I had the same result on 2.4.36.7 as Eric Dumazet on 2.6.25 without the patch with his test program.


This is based upon an excellent bug report from Eric Dumazet.

tcp_ack() should clear ->icsk_probes_out even if there are packets
outstanding. Otherwise if we get a sequence of ACKs while we do have
packets outstanding over and over again, we'll never clear the
probes_out value and eventually think the connection is too sick and
we'll reset it.

This appears to be some "optimization" added to tcp_ack() in the 2.4.x
timeframe. In 2.2.x, probes_out is pretty much always cleared by
tcp_ack().

Here is Eric's original report:


Apparently, we can in some situations reset TCP connections in a couple of seconds when some frames are lost.

In order to reproduce the problem, please try the following program on linux-2.6.25.*

Setup some iptables rules to allow two frames per second sent on loopback interface to tcp destination port 12000
...

Then run the attached program and see the output :

./test_tcp-input
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,1)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,3)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,5)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,7)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,9)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,11)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,13)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,15)
write(): Connection timed out
wrote 880 bytes but was interrupted after 10 seconds
ESTAB 0 0 127.0.0.1:12000 127.0.0.1:32769
Exiting read() because no data available (4000 ms timeout).
read 860 bytes

While this tcp session makes progress (sending frames with 50 bytes of payload, every 500ms), linux tcp stack decides to reset it, when tcp_retries2 is reached (default value : 15)

...

Source of program :

/*
 * Small producer/consumer program.
 *  - Sets up a listener on 127.0.0.1:12000
 *  - Forks a child
 *  - Child connects to 127.0.0.1, and sends 10 bytes on this tcp socket every 100 ms
 *  - Father accepts the connection, and reads all data
 */

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/poll.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

/* TCP port on loopback that the listener binds to and the child connects to. */
int port = 12000;
/* Scratch buffer the parent hands to read() when draining the connection. */
char buffer[4096];
/*
 * Loopback producer/consumer used to reproduce the persist-timer reset.
 *
 * The parent listens on 127.0.0.1:port and forks.  The child connects,
 * writes 10 bytes every 100 ms and, after every 10th write, dumps the
 * socket state with `ss`.  The parent accepts the connection and reads
 * until EOF, a read/poll error, or 4000 ms without data.
 *
 * Returns 0 on normal completion, 1 when socket setup, fork or accept fails.
 */
int main(int argc, char *argv[])
{
	/* Zero-initialised so that padding (sin_zero) is not left indeterminate. */
	struct sockaddr_in socket_address = {0};
	socklen_t alen = sizeof(socket_address);
	unsigned long total = 0;
	time_t t0, t1;
	int on = 1;
	int lfd, sfd;
	ssize_t res;
	pid_t pid;

	(void)argc;
	(void)argv;

	time(&t0);
	socket_address.sin_family = AF_INET;
	socket_address.sin_port = htons(port);
	socket_address.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	lfd = socket(AF_INET, SOCK_STREAM, 0);
	if (lfd == -1) {
		perror("socket()");
		return 1;
	}
	/* Best effort: a failure here only matters if bind() then fails. */
	setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
	if (bind(lfd, (struct sockaddr *)&socket_address, sizeof(socket_address)) == -1) {
		perror("bind");
		close(lfd);
		return 1;
	}
	if (listen(lfd, 1) == -1) {
		perror("listen()");
		close(lfd);
		return 1;
	}

	pid = fork();
	if (pid == -1) {
		/* Without a child nobody will ever connect; do not block in accept(). */
		perror("fork()");
		close(lfd);
		return 1;
	}

	if (pid == 0) {
		/* Child: the producer. */
		int i = 0;
		int cfd = socket(AF_INET, SOCK_STREAM, 0);

		close(lfd);
		if (cfd == -1) {
			perror("socket()");
			return 1;
		}
		if (connect(cfd, (struct sockaddr *)&socket_address, sizeof(socket_address)) == -1) {
			perror("connect()");
			close(cfd);
			return 1;
		}
		for (;;) {
			res = write(cfd, "blablabla\n", 10);
			if (res <= 0) {
				if (res == -1)
					perror("write()");
				break;
			}
			total += (unsigned long)res;
			usleep(100000);
			/* Roughly once per second, show the socket timers. */
			if (++i == 10) {
				system("ss -on dst 127.0.0.1:12000");
				i = 0;
			}
		}
		time(&t1);
		fprintf(stderr, "wrote %lu bytes but was interrupted after %g seconds\n", total, difftime(t1, t0));
		system("ss -on | grep 127.0.0.1:12000");
		close(cfd);
		return 0;
	}

	/* Parent: the consumer. */
	sfd = accept(lfd, (struct sockaddr *)&socket_address, &alen);
	if (sfd == -1) {
		perror("accept");
		close(lfd);
		return 1;
	}
	close(lfd);

	for (;;) {
		struct pollfd pfd[1];
		int ready;

		pfd[0].fd = sfd;
		pfd[0].events = POLLIN;
		pfd[0].revents = 0;

		ready = poll(pfd, 1, 4000);
		if (ready == 0) {
			fprintf(stderr, "Exiting read() because no data available (4000 ms timeout).\n");
			break;
		}
		if (ready < 0) {
			perror("poll()");
			break;
		}

		res = read(sfd, buffer, sizeof(buffer));
		if (res > 0) {
			total += (unsigned long)res;
		} else if (res == 0) {
			break;
		} else {
			/* Stop on error instead of spinning on a broken descriptor. */
			perror("read()");
			break;
		}
	}
	fprintf(stderr, "read %lu bytes\n", total);
	close(sfd);
	return 0;
}


Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Gilles Espinasse <g.esp@free.fr>
Signed-off-by: Willy Tarreau <w@1wt.eu>

Ändern Zusammenfassung

Diff

--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2814,6 +2814,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	 * log. Something worked...
 	 */
 	sk->err_soft = 0;
+	tp->probes_out = 0;
 	tp->rcv_tstamp = tcp_time_stamp;
 	if ((prior_packets = tp->packets_out) == 0)
 		goto no_queue;
@@ -2845,8 +2846,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	return 1;
 
 no_queue:
-	tp->probes_out = 0;
-
 	/* If this ack opens up a zero window, clear backoff. It was
 	 * being used to time the probes, and is probably far higher than
 	 * it needs to be for normal retransmission.
Show on old repository browser