net/rds/cong.c - nest-learning-thermostat/5.6.1/linux - Git at Google

 /*
  * Copyright (c) 2007 Oracle.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
  */
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/rbtree.h>

 #include <asm-generic/bitops/le.h>

 #include "rds.h"

 /*
  * This file implements the receive side of the unconventional congestion
  * management in RDS.
  *
  * Messages waiting in the receive queue on the receiving socket are accounted
  * against the sockets SO_RCVBUF option value.  Only the payload bytes in the
  * message are accounted for.  If the number of bytes queued equals or exceeds
  * rcvbuf then the socket is congested.  All sends attempted to this socket's
  * address should return block or return -EWOULDBLOCK.
  *
  * Applications are expected to be reasonably tuned such that this situation
  * very rarely occurs.  An application encountering this "back-pressure" is
  * considered a bug.
  *
  * This is implemented by having each node maintain bitmaps which indicate
  * which ports on bound addresses are congested.  As the bitmap changes it is
  * sent through all the connections which terminate in the local address of the
  * bitmap which changed.
  *
  * The bitmaps are allocated as connections are brought up.  This avoids
  * allocation in the interrupt handling path which queues messages on sockets.
  * The dense bitmaps let transports send the entire bitmap on any bitmap change
  * reasonably efficiently.  This is much easier to implement than some
  * finer-grained communication of per-port congestion.  The sender does a very
  * inexpensive bit test to test if the port it's about to send to is congested
  * or not.
  */

 /*
  * Interaction with poll is a tad tricky. We want all processes stuck in
  * poll to wake up and check whether a congested destination became uncongested.
  * The really sad thing is we have no idea which destinations the application
  * wants to send to - we don't even know which rds_connections are involved.
  * So until we implement a more flexible rds poll interface, we have to make
  * do with this:
  * We maintain a global counter that is incremented each time a congestion map
  * update is received. Each rds socket tracks this value, and if rds_poll
  * finds that the saved generation number is smaller than the global generation
  * number, it wakes up the process.
  */
 static atomic_t		rds_cong_generation = ATOMIC_INIT(0);

 /*
  * Congestion monitoring
  */
 static LIST_HEAD(rds_cong_monitor);
 static DEFINE_RWLOCK(rds_cong_monitor_lock);

 /*
  * Yes, a global lock.  It's used so infrequently that it's worth keeping it
  * global to simplify the locking.  It's only used in the following
  * circumstances:
  *
  *  - on connection buildup to associate a conn with its maps
  *  - on map changes to inform conns of a new map to send
  *
  *  It's sadly ordered under the socket callback lock and the connection lock.
  *  Receive paths can mark ports congested from interrupt context so the
  *  lock masks interrupts.
  */
 static DEFINE_SPINLOCK(rds_cong_lock);
 static struct rb_root rds_cong_tree = RB_ROOT;

 static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
 					       struct rds_cong_map *insert)
 {
 	struct rb_node **p = &rds_cong_tree.rb_node;
 	struct rb_node *parent = NULL;
 	struct rds_cong_map *map;

 	while (*p) {
 		parent = *p;
 		map = rb_entry(parent, struct rds_cong_map, m_rb_node);

 		if (addr < map->m_addr)
 			p = &(*p)->rb_left;
 		else if (addr > map->m_addr)
 			p = &(*p)->rb_right;
 		else
 			return map;
 	}

 	if (insert) {
 		rb_link_node(&insert->m_rb_node, parent, p);
 		rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
 	}
 	return NULL;
 }

 /*
  * There is only ever one bitmap for any address.  Connections try and allocate
  * these bitmaps in the process getting pointers to them.  The bitmaps are only
  * ever freed as the module is removed after all connections have been freed.
  */
 static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
 {
 	struct rds_cong_map *map;
 	struct rds_cong_map *ret = NULL;
 	unsigned long zp;
 	unsigned long i;
 	unsigned long flags;

 	map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
 	if (!map)
 		return NULL;

 	map->m_addr = addr;
 	init_waitqueue_head(&map->m_waitq);
 	INIT_LIST_HEAD(&map->m_conn_list);

 	for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
 		zp = get_zeroed_page(GFP_KERNEL);
 		if (zp == 0)
 			goto out;
 		map->m_page_addrs[i] = zp;
 	}

 	spin_lock_irqsave(&rds_cong_lock, flags);
 	ret = rds_cong_tree_walk(addr, map);
 	spin_unlock_irqrestore(&rds_cong_lock, flags);

 	if (!ret) {
 		ret = map;
 		map = NULL;
 	}

 out:
 	if (map) {
 		for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
 			free_page(map->m_page_addrs[i]);
 		kfree(map);
 	}

 	rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));

 	return ret;
 }

 /*
  * Put the conn on its local map's list.  This is called when the conn is
  * really added to the hash.  It's nested under the rds_conn_lock, sadly.
  */
 void rds_cong_add_conn(struct rds_connection *conn)
 {
 	unsigned long flags;

 	rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
 	spin_lock_irqsave(&rds_cong_lock, flags);
 	list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
 	spin_unlock_irqrestore(&rds_cong_lock, flags);
 }

 void rds_cong_remove_conn(struct rds_connection *conn)
 {
 	unsigned long flags;

 	rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
 	spin_lock_irqsave(&rds_cong_lock, flags);
 	list_del_init(&conn->c_map_item);
 	spin_unlock_irqrestore(&rds_cong_lock, flags);
 }

 int rds_cong_get_maps(struct rds_connection *conn)
 {
 	conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
 	conn->c_fcong = rds_cong_from_addr(conn->c_faddr);

 	if (!(conn->c_lcong && conn->c_fcong))
 		return -ENOMEM;

 	return 0;
 }

 void rds_cong_queue_updates(struct rds_cong_map *map)
 {
 	struct rds_connection *conn;
 	unsigned long flags;

 	spin_lock_irqsave(&rds_cong_lock, flags);

 	list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
 		if (!test_and_set_bit(0, &conn->c_map_queued)) {
 			rds_stats_inc(s_cong_update_queued);
 			rds_send_xmit(conn);
 		}
 	}

 	spin_unlock_irqrestore(&rds_cong_lock, flags);
 }

 void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
 {
 	rdsdebug("waking map %p for %pI4\n",
 	  map, &map->m_addr);
 	rds_stats_inc(s_cong_update_received);
 	atomic_inc(&rds_cong_generation);
 	if (waitqueue_active(&map->m_waitq))
 		wake_up(&map->m_waitq);
 	if (waitqueue_active(&rds_poll_waitq))
 		wake_up_all(&rds_poll_waitq);

 	if (portmask && !list_empty(&rds_cong_monitor)) {
 		unsigned long flags;
 		struct rds_sock *rs;

 		read_lock_irqsave(&rds_cong_monitor_lock, flags);
 		list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
 			spin_lock(&rs->rs_lock);
 			rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
 			rs->rs_cong_mask &= ~portmask;
 			spin_unlock(&rs->rs_lock);
 			if (rs->rs_cong_notify)
 				rds_wake_sk_sleep(rs);
 		}
 		read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
 	}
 }
 EXPORT_SYMBOL_GPL(rds_cong_map_updated);

 int rds_cong_updated_since(unsigned long *recent)
 {
 	unsigned long gen = atomic_read(&rds_cong_generation);

 	if (likely(*recent == gen))
 		return 0;
 	*recent = gen;
 	return 1;
 }

 /*
  * We're called under the locking that protects the sockets receive buffer
  * consumption.  This makes it a lot easier for the caller to only call us
  * when it knows that an existing set bit needs to be cleared, and vice versa.
  * We can't block and we need to deal with concurrent sockets working against
  * the same per-address map.
  */
 void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
 {
 	unsigned long i;
 	unsigned long off;

 	rdsdebug("setting congestion for %pI4:%u in map %p\n",
 	  &map->m_addr, ntohs(port), map);

 	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
 	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

 	generic___set_le_bit(off, (void *)map->m_page_addrs[i]);
 }

 void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
 {
 	unsigned long i;
 	unsigned long off;

 	rdsdebug("clearing congestion for %pI4:%u in map %p\n",
 	  &map->m_addr, ntohs(port), map);

 	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
 	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

 	generic___clear_le_bit(off, (void *)map->m_page_addrs[i]);
 }

 static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
 {
 	unsigned long i;
 	unsigned long off;

 	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
 	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

 	return generic_test_le_bit(off, (void *)map->m_page_addrs[i]);
 }

 void rds_cong_add_socket(struct rds_sock *rs)
 {
 	unsigned long flags;

 	write_lock_irqsave(&rds_cong_monitor_lock, flags);
 	if (list_empty(&rs->rs_cong_list))
 		list_add(&rs->rs_cong_list, &rds_cong_monitor);
 	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
 }

 void rds_cong_remove_socket(struct rds_sock *rs)
 {
 	unsigned long flags;
 	struct rds_cong_map *map;

 	write_lock_irqsave(&rds_cong_monitor_lock, flags);
 	list_del_init(&rs->rs_cong_list);
 	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);

 	/* update congestion map for now-closed port */
 	spin_lock_irqsave(&rds_cong_lock, flags);
 	map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
 	spin_unlock_irqrestore(&rds_cong_lock, flags);

 	if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
 		rds_cong_clear_bit(map, rs->rs_bound_port);
 		rds_cong_queue_updates(map);
 	}
 }

 int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
 		  struct rds_sock *rs)
 {
 	if (!rds_cong_test_bit(map, port))
 		return 0;
 	if (nonblock) {
 		if (rs && rs->rs_cong_monitor) {
 			unsigned long flags;

 			/* It would have been nice to have an atomic set_bit on
 			 * a uint64_t. */
 			spin_lock_irqsave(&rs->rs_lock, flags);
 			rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
 			spin_unlock_irqrestore(&rs->rs_lock, flags);

 			/* Test again - a congestion update may have arrived in
 			 * the meantime. */
 			if (!rds_cong_test_bit(map, port))
 				return 0;
 		}
 		rds_stats_inc(s_cong_send_error);
 		return -ENOBUFS;
 	}

 	rds_stats_inc(s_cong_send_blocked);
 	rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));

 	return wait_event_interruptible(map->m_waitq,
 					!rds_cong_test_bit(map, port));
 }

 void rds_cong_exit(void)
 {
 	struct rb_node *node;
 	struct rds_cong_map *map;
 	unsigned long i;

 	while ((node = rb_first(&rds_cong_tree))) {
 		map = rb_entry(node, struct rds_cong_map, m_rb_node);
 		rdsdebug("freeing map %p\n", map);
 		rb_erase(&map->m_rb_node, &rds_cong_tree);
 		for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
 			free_page(map->m_page_addrs[i]);
 		kfree(map);
 	}
 }

 /*
  * Allocate a RDS message containing a congestion update.
  */
 struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
 {
 	struct rds_cong_map *map = conn->c_lcong;
 	struct rds_message *rm;

 	rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
 	if (!IS_ERR(rm))
 		rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;

 	return rm;
 }
	/*
	* Copyright (c) 2007 Oracle. All rights reserved.
	*
	* This software is available to you under a choice of one of two
	* licenses. You may choose to be licensed under the terms of the GNU
	* General Public License (GPL) Version 2, available from the file
	* COPYING in the main directory of this source tree, or the
	* OpenIB.org BSD license below:
	*
	* Redistribution and use in source and binary forms, with or
	* without modification, are permitted provided that the following
	* conditions are met:
	*
	* - Redistributions of source code must retain the above
	* copyright notice, this list of conditions and the following
	* disclaimer.
	*
	* - Redistributions in binary form must reproduce the above
	* copyright notice, this list of conditions and the following
	* disclaimer in the documentation and/or other materials
	* provided with the distribution.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	* SOFTWARE.
	*
	*/
	#include <linux/slab.h>
	#include <linux/types.h>
	#include <linux/rbtree.h>

	#include <asm-generic/bitops/le.h>

	#include "rds.h"

	/*
	* This file implements the receive side of the unconventional congestion
	* management in RDS.
	*
	* Messages waiting in the receive queue on the receiving socket are accounted
	* against the sockets SO_RCVBUF option value. Only the payload bytes in the
	* message are accounted for. If the number of bytes queued equals or exceeds
	* rcvbuf then the socket is congested. All sends attempted to this socket's
	* address should return block or return -EWOULDBLOCK.
	*
	* Applications are expected to be reasonably tuned such that this situation
	* very rarely occurs. An application encountering this "back-pressure" is
	* considered a bug.
	*
	* This is implemented by having each node maintain bitmaps which indicate
	* which ports on bound addresses are congested. As the bitmap changes it is
	* sent through all the connections which terminate in the local address of the
	* bitmap which changed.
	*
	* The bitmaps are allocated as connections are brought up. This avoids
	* allocation in the interrupt handling path which queues messages on sockets.
	* The dense bitmaps let transports send the entire bitmap on any bitmap change
	* reasonably efficiently. This is much easier to implement than some
	* finer-grained communication of per-port congestion. The sender does a very
	* inexpensive bit test to test if the port it's about to send to is congested
	* or not.
	*/

	/*
	* Interaction with poll is a tad tricky. We want all processes stuck in
	* poll to wake up and check whether a congested destination became uncongested.
	* The really sad thing is we have no idea which destinations the application
	* wants to send to - we don't even know which rds_connections are involved.
	* So until we implement a more flexible rds poll interface, we have to make
	* do with this:
	* We maintain a global counter that is incremented each time a congestion map
	* update is received. Each rds socket tracks this value, and if rds_poll
	* finds that the saved generation number is smaller than the global generation
	* number, it wakes up the process.
	*/
	static atomic_t rds_cong_generation = ATOMIC_INIT(0);

	/*
	* Congestion monitoring
	*/
	static LIST_HEAD(rds_cong_monitor);
	static DEFINE_RWLOCK(rds_cong_monitor_lock);

	/*
	* Yes, a global lock. It's used so infrequently that it's worth keeping it
	* global to simplify the locking. It's only used in the following
	* circumstances:
	*
	* - on connection buildup to associate a conn with its maps
	* - on map changes to inform conns of a new map to send
	*
	* It's sadly ordered under the socket callback lock and the connection lock.
	* Receive paths can mark ports congested from interrupt context so the
	* lock masks interrupts.
	*/
	static DEFINE_SPINLOCK(rds_cong_lock);
	static struct rb_root rds_cong_tree = RB_ROOT;

	static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
	struct rds_cong_map *insert)
	{
	struct rb_node **p = &rds_cong_tree.rb_node;
	struct rb_node *parent = NULL;
	struct rds_cong_map *map;

	while (*p) {
	parent = *p;
	map = rb_entry(parent, struct rds_cong_map, m_rb_node);

	if (addr < map->m_addr)
	p = &(*p)->rb_left;
	else if (addr > map->m_addr)
	p = &(*p)->rb_right;
	else
	return map;
	}

	if (insert) {
	rb_link_node(&insert->m_rb_node, parent, p);
	rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
	}
	return NULL;
	}

	/*
	* There is only ever one bitmap for any address. Connections try and allocate
	* these bitmaps in the process getting pointers to them. The bitmaps are only
	* ever freed as the module is removed after all connections have been freed.
	*/
	static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
	{
	struct rds_cong_map *map;
	struct rds_cong_map *ret = NULL;
	unsigned long zp;
	unsigned long i;
	unsigned long flags;

	map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
	if (!map)
	return NULL;

	map->m_addr = addr;
	init_waitqueue_head(&map->m_waitq);
	INIT_LIST_HEAD(&map->m_conn_list);

	for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
	zp = get_zeroed_page(GFP_KERNEL);
	if (zp == 0)
	goto out;
	map->m_page_addrs[i] = zp;
	}

	spin_lock_irqsave(&rds_cong_lock, flags);
	ret = rds_cong_tree_walk(addr, map);
	spin_unlock_irqrestore(&rds_cong_lock, flags);

	if (!ret) {
	ret = map;
	map = NULL;
	}

	out:
	if (map) {
	for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
	free_page(map->m_page_addrs[i]);
	kfree(map);
	}

	rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));

	return ret;
	}

	/*
	* Put the conn on its local map's list. This is called when the conn is
	* really added to the hash. It's nested under the rds_conn_lock, sadly.
	*/
	void rds_cong_add_conn(struct rds_connection *conn)
	{
	unsigned long flags;

	rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
	spin_lock_irqsave(&rds_cong_lock, flags);
	list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
	spin_unlock_irqrestore(&rds_cong_lock, flags);
	}

	void rds_cong_remove_conn(struct rds_connection *conn)
	{
	unsigned long flags;

	rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
	spin_lock_irqsave(&rds_cong_lock, flags);
	list_del_init(&conn->c_map_item);
	spin_unlock_irqrestore(&rds_cong_lock, flags);
	}

	int rds_cong_get_maps(struct rds_connection *conn)
	{
	conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
	conn->c_fcong = rds_cong_from_addr(conn->c_faddr);

	if (!(conn->c_lcong && conn->c_fcong))
	return -ENOMEM;

	return 0;
	}

	void rds_cong_queue_updates(struct rds_cong_map *map)
	{
	struct rds_connection *conn;
	unsigned long flags;

	spin_lock_irqsave(&rds_cong_lock, flags);

	list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
	if (!test_and_set_bit(0, &conn->c_map_queued)) {
	rds_stats_inc(s_cong_update_queued);
	rds_send_xmit(conn);
	}
	}

	spin_unlock_irqrestore(&rds_cong_lock, flags);
	}

	void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
	{
	rdsdebug("waking map %p for %pI4\n",
	map, &map->m_addr);
	rds_stats_inc(s_cong_update_received);
	atomic_inc(&rds_cong_generation);
	if (waitqueue_active(&map->m_waitq))
	wake_up(&map->m_waitq);
	if (waitqueue_active(&rds_poll_waitq))
	wake_up_all(&rds_poll_waitq);

	if (portmask && !list_empty(&rds_cong_monitor)) {
	unsigned long flags;
	struct rds_sock *rs;

	read_lock_irqsave(&rds_cong_monitor_lock, flags);
	list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
	spin_lock(&rs->rs_lock);
	rs->rs_cong_notify \|= (rs->rs_cong_mask & portmask);
	rs->rs_cong_mask &= ~portmask;
	spin_unlock(&rs->rs_lock);
	if (rs->rs_cong_notify)
	rds_wake_sk_sleep(rs);
	}
	read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
	}
	}
	EXPORT_SYMBOL_GPL(rds_cong_map_updated);

	int rds_cong_updated_since(unsigned long *recent)
	{
	unsigned long gen = atomic_read(&rds_cong_generation);

	if (likely(*recent == gen))
	return 0;
	*recent = gen;
	return 1;
	}

	/*
	* We're called under the locking that protects the sockets receive buffer
	* consumption. This makes it a lot easier for the caller to only call us
	* when it knows that an existing set bit needs to be cleared, and vice versa.
	* We can't block and we need to deal with concurrent sockets working against
	* the same per-address map.
	*/
	void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
	{
	unsigned long i;
	unsigned long off;

	rdsdebug("setting congestion for %pI4:%u in map %p\n",
	&map->m_addr, ntohs(port), map);

	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

	generic___set_le_bit(off, (void *)map->m_page_addrs[i]);
	}

	void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
	{
	unsigned long i;
	unsigned long off;

	rdsdebug("clearing congestion for %pI4:%u in map %p\n",
	&map->m_addr, ntohs(port), map);

	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

	generic___clear_le_bit(off, (void *)map->m_page_addrs[i]);
	}

	static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
	{
	unsigned long i;
	unsigned long off;

	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

	return generic_test_le_bit(off, (void *)map->m_page_addrs[i]);
	}

	void rds_cong_add_socket(struct rds_sock *rs)
	{
	unsigned long flags;

	write_lock_irqsave(&rds_cong_monitor_lock, flags);
	if (list_empty(&rs->rs_cong_list))
	list_add(&rs->rs_cong_list, &rds_cong_monitor);
	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
	}

	void rds_cong_remove_socket(struct rds_sock *rs)
	{
	unsigned long flags;
	struct rds_cong_map *map;

	write_lock_irqsave(&rds_cong_monitor_lock, flags);
	list_del_init(&rs->rs_cong_list);
	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);

	/* update congestion map for now-closed port */
	spin_lock_irqsave(&rds_cong_lock, flags);
	map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
	spin_unlock_irqrestore(&rds_cong_lock, flags);

	if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
	rds_cong_clear_bit(map, rs->rs_bound_port);
	rds_cong_queue_updates(map);
	}
	}

	int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
	struct rds_sock *rs)
	{
	if (!rds_cong_test_bit(map, port))
	return 0;
	if (nonblock) {
	if (rs && rs->rs_cong_monitor) {
	unsigned long flags;

	/* It would have been nice to have an atomic set_bit on
	* a uint64_t. */
	spin_lock_irqsave(&rs->rs_lock, flags);
	rs->rs_cong_mask \|= RDS_CONG_MONITOR_MASK(ntohs(port));
	spin_unlock_irqrestore(&rs->rs_lock, flags);

	/* Test again - a congestion update may have arrived in
	* the meantime. */
	if (!rds_cong_test_bit(map, port))
	return 0;
	}
	rds_stats_inc(s_cong_send_error);
	return -ENOBUFS;
	}

	rds_stats_inc(s_cong_send_blocked);
	rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));

	return wait_event_interruptible(map->m_waitq,
	!rds_cong_test_bit(map, port));
	}

	void rds_cong_exit(void)
	{
	struct rb_node *node;
	struct rds_cong_map *map;
	unsigned long i;

	while ((node = rb_first(&rds_cong_tree))) {
	map = rb_entry(node, struct rds_cong_map, m_rb_node);
	rdsdebug("freeing map %p\n", map);
	rb_erase(&map->m_rb_node, &rds_cong_tree);
	for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
	free_page(map->m_page_addrs[i]);
	kfree(map);
	}
	}

	/*
	* Allocate a RDS message containing a congestion update.
	*/
	struct rds_message rds_cong_update_alloc(struct rds_connection conn)
	{
	struct rds_cong_map *map = conn->c_lcong;
	struct rds_message *rm;

	rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
	if (!IS_ERR(rm))
	rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;

	return rm;
	}