I sent the email below to a developer who is trying to fix the kernel bug mentioned here. The LAST_ACK issue does clear itself up after a few minutes, so as long as you don't make lots of connections in a short amount of time, you will be fine.
This is all based on what our commercial traffic generator product does in a more automated fashion.
Instructions and patches and scripts follow.
-------- Original Message --------
After starting/stopping a TCP connection several times, I see it get hung with one side in LAST_ACK state and the other in TIME_WAIT. I only see this when I am sending to myself with the attached send-to-self patch, so it's likely that either this patch is not completely right, or that the problem can only be triggered when sending to yourself for some reason.
You need a patched iperf that allows binding to a local device. The iperf patch is attached (adds support for the -E argument).
You need to patch the kernel with the sts patch. This allows send-to-self.
Then, run the sts_script.sh, which sets up source based routing on eth0 and eth1. (You may need to edit this script based on your own network devices, etc.)
eth0 should be connected to eth1 with a loopback cable.
After running the sts_script.sh, launch two instances of the modified iperf:
# Run server instance on eth0.
iperf -B 10.2.0.1 -E eth0 -s
# And client on eth1.
iperf -B 10.2.0.2 -E eth1 -c 10.2.0.1
It took me 4 tries of starting/stopping the client before I saw the LAST_ACK state in netstat. That is the symptom of the bug.
Thanks, Ben
-- Ben Greear <greearb --at-- candelatech.com> Candela Technologies Inc http://www.candelatech.com
Attachment:
sts_script.sh
Description: application/shellscript
diff -urN iperf-2.0.2/include/Settings.hpp iperf-2.0.2.sts/include/Settings.hpp
--- iperf-2.0.2/include/Settings.hpp 2005-05-02 13:09:26.000000000 -0700
+++ iperf-2.0.2.sts/include/Settings.hpp 2007-05-18 14:46:31.000000000 -0700
@@ -112,6 +112,7 @@
char* mHost; // -c
char* mLocalhost; // -B
char* mOutputFileName; // -o
+ char* mBindDev; // -E eth0, --bind_dev eth0
FILE* Extractor_file;
ReportHeader* reporthdr;
MultiHeader* multihdr;
diff -urN iperf-2.0.2/src/Client.cpp iperf-2.0.2.sts/src/Client.cpp
--- iperf-2.0.2/src/Client.cpp 2005-05-02 13:09:27.000000000 -0700
+++ iperf-2.0.2.sts/src/Client.cpp 2007-05-18 14:49:28.000000000 -0700
@@ -318,6 +318,16 @@
WARN_errno( rc == SOCKET_ERROR, "bind" );
}
+#ifndef WIN32
+    // Bind to a local device (-E); fails if not user root and may only work on Linux. rc takes
+    // the result so WARN_errno tests this call; optlen is the name's real size, not a fixed 16.
+    if ( mSettings->mBindDev != NULL ) {
+        rc = setsockopt( mSettings->mSock, SOL_SOCKET, SO_BINDTODEVICE,
+                         mSettings->mBindDev, strlen( mSettings->mBindDev ) + 1 );
+        WARN_errno( rc == SOCKET_ERROR, "setsockopt-SO_BINDTODEVICE" );
+    }
+#endif
+
// connect socket
rc = connect( mSettings->mSock, (sockaddr*) &mSettings->peer,
SockAddr_get_sizeof_sockaddr( &mSettings->peer ));
diff -urN iperf-2.0.2/src/Listener.cpp iperf-2.0.2.sts/src/Listener.cpp
--- iperf-2.0.2/src/Listener.cpp 2005-05-02 13:09:27.000000000 -0700
+++ iperf-2.0.2.sts/src/Listener.cpp 2007-05-18 14:49:30.000000000 -0700
@@ -323,6 +323,16 @@
Socklen_t len = sizeof(boolean);
setsockopt( mSettings->mSock, SOL_SOCKET, SO_REUSEADDR, (char*) &boolean, len );
+#ifndef WIN32
+    // Bind to a local device (-E); fails if not user root and may only work on Linux. rc takes
+    // the result so WARN_errno tests this call; optlen is the name's real size, not a fixed 16.
+    if ( mSettings->mBindDev != NULL ) {
+        rc = setsockopt( mSettings->mSock, SOL_SOCKET, SO_BINDTODEVICE,
+                         mSettings->mBindDev, strlen( mSettings->mBindDev ) + 1 );
+        WARN_errno( rc == SOCKET_ERROR, "setsockopt-SO_BINDTODEVICE" );
+    }
+#endif
+
// bind socket to server address
#ifdef WIN32
if ( SockAddr_isMulticast( &mSettings->local ) ) {
diff -urN iperf-2.0.2/src/Locale.c iperf-2.0.2.sts/src/Locale.c
--- iperf-2.0.2/src/Locale.c 2005-05-02 13:09:27.000000000 -0700
+++ iperf-2.0.2.sts/src/Locale.c 2007-05-18 14:52:29.000000000 -0700
@@ -139,6 +139,7 @@
-u, --udp use UDP rather than TCP\n\
-w, --window #[KM] TCP window size (socket buffer size)\n\
-B, --bind <host> bind to <host>, an interface or multicast address\n\
+ -E, --bind-dev <dev> bind to <device>, for example eth0 This only does the SO_BINDTODEVICE call.\n\
-C, --compatibility for use with older versions does not sent extra msgs\n\
-M, --mss # set TCP maximum segment size (MTU - 40 bytes)\n\
-N, --nodelay set TCP no delay, disabling Nagle's Algorithm\n\
diff -urN iperf-2.0.2/src/Settings.cpp iperf-2.0.2.sts/src/Settings.cpp
--- iperf-2.0.2/src/Settings.cpp 2005-05-02 13:09:27.000000000 -0700
+++ iperf-2.0.2.sts/src/Settings.cpp 2007-05-18 14:37:51.000000000 -0700
@@ -108,6 +108,7 @@
{"bind", required_argument, NULL, 'B'},
{"compatibility", no_argument, NULL, 'C'},
{"daemon", no_argument, NULL, 'D'},
+{"bind-dev", required_argument, NULL, 'E'},
{"file_input", required_argument, NULL, 'F'},
{"stdin_input", no_argument, NULL, 'I'},
{"mss", required_argument, NULL, 'M'},
@@ -151,6 +152,7 @@
{"IPERF_BIND", required_argument, NULL, 'B'},
{"IPERF_COMPAT", no_argument, NULL, 'C'},
{"IPERF_DAEMON", no_argument, NULL, 'D'},
+{"IPERF_BIND_DEV", required_argument, NULL, 'E'},
{"IPERF_FILE_INPUT", required_argument, NULL, 'F'},
{"IPERF_STDIN_INPUT", no_argument, NULL, 'I'},
{"IPERF_MSS", required_argument, NULL, 'M'},
@@ -167,7 +169,7 @@
#define SHORT_OPTIONS()
-const char short_options[] = "1b:c:df:hi:l:mn:o:p:rst:uvw:x:y:B:CDF:IL:M:NP:RS:T:UVW";
+const char short_options[] = "1b:c:df:hi:l:mn:o:p:rst:uvw:x:y:B:CDE:F:IL:M:NP:RS:T:UVW";
/* -------------------------------------------------------------------
* defaults
@@ -229,7 +231,7 @@
main->mTTL = 1; // -T, link-local TTL
//main->mDomain = kMode_IPv4; // -V,
//main->mSuggestWin = false; // -W, Suggest the window size.
-
+ //main->mBindDev = NULL // -E --bind-dev
} // end Settings
void Settings_Copy( thread_Settings *from, thread_Settings **into ) {
@@ -251,6 +253,10 @@
(*into)->mFileName = new char[ strlen(from->mFileName) + 1];
strcpy( (*into)->mFileName, from->mFileName );
}
+ if ( from->mBindDev != NULL ) {
+ (*into)->mBindDev = new char[ strlen(from->mBindDev) + 1];
+ strcpy( (*into)->mBindDev, from->mBindDev );
+ }
// Zero out certain entries
(*into)->mTID = thread_zeroid();
(*into)->runNext = NULL;
@@ -266,6 +272,7 @@
DELETE_ARRAY( mSettings->mLocalhost );
DELETE_ARRAY( mSettings->mFileName );
DELETE_ARRAY( mSettings->mOutputFileName );
+ DELETE_ARRAY( mSettings->mBindDev );
DELETE_PTR( mSettings );
} // end ~Settings
@@ -566,6 +573,11 @@
setDaemon( mExtSettings );
break;
+ case 'E' : // Bind to a particular device.
+ mExtSettings->mBindDev = new char[strlen(optarg)+1];
+ strcpy( mExtSettings->mBindDev, optarg);
+ break;
+
case 'F' : // Get the input for the data stream from a file
if ( mExtSettings->mThreadMode != kMode_Client ) {
fprintf( stderr, warn_invalid_server_option, option );
@@ -713,12 +725,17 @@
(*listener)->mHost = NULL;
(*listener)->mLocalhost = NULL;
(*listener)->mOutputFileName = NULL;
+ (*listener)->mBindDev = NULL;
(*listener)->mMode = kTest_Normal;
(*listener)->mThreadMode = kMode_Listener;
if ( client->mHost != NULL ) {
(*listener)->mHost = new char[strlen( client->mHost ) + 1];
strcpy( (*listener)->mHost, client->mHost );
}
+ if ( client->mBindDev != NULL ) {
+ (*listener)->mBindDev = new char[strlen( client->mBindDev ) + 1];
+ strcpy( (*listener)->mBindDev, client->mBindDev );
+ }
if ( client->mLocalhost != NULL ) {
(*listener)->mLocalhost = new char[strlen( client->mLocalhost ) + 1];
strcpy( (*listener)->mLocalhost, client->mLocalhost );
@@ -770,6 +787,7 @@
}
(*client)->mFileName = NULL;
(*client)->mHost = NULL;
+ (*client)->mBindDev = NULL;
(*client)->mLocalhost = NULL;
(*client)->mOutputFileName = NULL;
(*client)->mMode = ((flags & RUN_NOW) == 0 ?
@@ -779,6 +797,10 @@
(*client)->mLocalhost = new char[strlen( server->mLocalhost ) + 1];
strcpy( (*client)->mLocalhost, server->mLocalhost );
}
+ if ( server->mBindDev != NULL ) {
+ (*client)->mBindDev = new char[strlen( server->mBindDev ) + 1];
+ strcpy( (*client)->mBindDev, server->mBindDev );
+ }
(*client)->mHost = new char[REPORT_ADDRLEN];
if ( ((sockaddr*)&server->peer)->sa_family == AF_INET ) {
inet_ntop( AF_INET, &((sockaddr_in*)&server->peer)->sin_addr,
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index c0f7aec..88f78b6 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -31,6 +31,7 @@ struct ipv4_devconf
int no_policy;
int force_igmp_version;
int promote_secondaries;
+ int accept_sts;
void *sysctl;
};
@@ -84,6 +85,7 @@ struct in_device
#define IN_DEV_ARPFILTER(in_dev) (ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter)
#define IN_DEV_ARP_ANNOUNCE(in_dev) (max(ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce))
#define IN_DEV_ARP_IGNORE(in_dev) (max(ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore))
+#define IN_DEV_ACCEPT_STS(in_dev) (max(ipv4_devconf.accept_sts, (in_dev)->cnf.accept_sts))
struct in_ifaddr
{
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 47f1c53..6c00bf4 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -496,6 +496,7 @@ enum
NET_IPV4_CONF_ARP_IGNORE=19,
NET_IPV4_CONF_PROMOTE_SECONDARIES=20,
NET_IPV4_CONF_ARP_ACCEPT=21,
+ NET_IPV4_CONF_ACCEPT_STS=22,
__NET_IPV4_CONF_MAX
};
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 7110779..9866f1b 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -419,6 +419,26 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
return !inet_confirm_addr(dev, sip, tip, scope);
}
+/*
+ * Return 1 if @ip equals the ifa_address of any entry on @dev's IPv4 address
+ * list, 0 otherwise (including when in_dev_get() yields no in_device).
+ * @ip is __be32 to match ifa_address and the __be32 tip passed by arp_filter().
+ */
+static int is_ip_on_dev(struct net_device *dev, __be32 ip)
+{
+	struct in_device *in_dev = in_dev_get(dev);
+	struct in_ifaddr *ifa;
+	int found = 0;
+
+	if (!in_dev)
+		return 0;
+	rcu_read_lock();
+	for (ifa = in_dev->ifa_list; ifa && !found; ifa = ifa->ifa_next)
+		found = (ifa->ifa_address == ip);
+	rcu_read_unlock();
+	in_dev_put(in_dev);	/* pairs with in_dev_get() above */
+	return found;
+}
+
static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
{
struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
@@ -430,8 +450,38 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
if (ip_route_output_key(&rt, &fl) < 0)
return 1;
if (rt->u.dst.dev != dev) {
- NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
- flag = 1;
+ struct in_device *in_dev = in_dev_get(dev);
+ if (in_dev && IN_DEV_ACCEPT_STS(in_dev) &&
+ (rt->u.dst.dev == &loopback_dev)) {
+ /* Accept these IFF target-ip == dev's IP */
+ /* TODO: Need to force the ARP response back out the interface
+ * instead of letting it route locally.
+ */
+
+ if (is_ip_on_dev(dev, tip)) {
+ /* OK, we'll let this special case slide, so that we can
+ * arp from one local interface to another. This seems
+ * to work, but could use some review. --Ben
+ */
+ /*printk("arp_filter, sip: %x tip: %x dev: %s, STS override (ip on dev)\n",
+ sip, tip, dev->name);*/
+ }
+ else {
+ /*printk("arp_filter, sip: %x tip: %x dev: %s, IP is NOT on dev\n",
+ sip, tip, dev->name);*/
+ NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
+ flag = 1;
+ }
+ }
+ else {
+ /*printk("arp_filter, not lpbk sip: %x tip: %x dev: %s flgs: %hx dst.dev: %p lbk: %p\n",
+ sip, tip, dev->name, dev->priv_flags, rt->u.dst.dev, &loopback_dev);*/
+ NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
+ flag = 1;
+ }
+ if (in_dev) {
+ in_dev_put(in_dev);
+ }
}
ip_rt_put(rt);
return flag;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7f95e6e..33ac2ed 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1513,6 +1513,15 @@ static struct devinet_sysctl_table {
.proc_handler = &ipv4_doint_and_flush,
.strategy = &ipv4_doint_and_flush_strategy,
},
+ {
+ .ctl_name = NET_IPV4_CONF_ACCEPT_STS,
+ .procname = "accept_sts",
+ .data = &ipv4_devconf.accept_sts,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+
},
.devinet_dev = {
{
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 837f295..9b57bf5 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -206,8 +206,16 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
if (fib_lookup(&fl, &res))
goto last_resort;
- if (res.type != RTN_UNICAST)
- goto e_inval_res;
+ if (res.type != RTN_UNICAST) {
+ if ((res.type == RTN_LOCAL) &&
+ (IN_DEV_ACCEPT_STS(in_dev))) {
+ /* All is OK */
+ }
+ else {
+ goto e_inval_res;
+ }
+ }
+
*spec_dst = FIB_RES_PREFSRC(res);
fib_combine_itag(itag, &res);
#ifdef CONFIG_IP_ROUTE_MULTIPATH