Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
commit 5432ebb5f6

Documentation/networking/fib_trie.txt | 145 (new file)
@ -0,0 +1,145 @@
LC-trie implementation notes.

Node types
----------
leaf
    An end node with data. This has a copy of the relevant key, along
    with an 'hlist' of routing table entries sorted by prefix length.
    See struct leaf and struct leaf_info.

trie node or tnode
    An internal node, holding an array of child (leaf or tnode) pointers,
    indexed through a subset of the key. See Level Compression.

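The corresponding data structures look roughly like this (a simplified
sketch, not the exact definitions in net/ipv4/fib_trie.c; t_key here is
just a 32-bit key type):

    typedef u32 t_key;

    struct leaf {
        t_key key;                  /* copy of the key, needed to verify
                                     * skipped bits during lookup */
        struct hlist_head list;     /* leaf_info entries, sorted by
                                     * prefix length */
    };

    struct tnode {
        t_key key;
        unsigned char pos;          /* where the indexing key segment starts */
        unsigned char bits;         /* width of that segment; the child
                                     * array has 1 << bits slots */
        unsigned short full_children;
        unsigned short empty_children;
        struct node *child[0];      /* 1 << bits child pointers */
    };
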
A few concepts explained
------------------------
Bits (tnode)
    The number of bits in the key segment used for indexing into the
    child array - the "child index". See Level Compression.

Pos (tnode)
    The position (in the key) of the key segment used for indexing into
    the child array. See Path Compression.

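As a sketch of how the child index is computed (modelled on
tkey_extract_bits() in fib_trie.c; the bounds checks here are only
illustrative):

    /* Extract "bits" bits of "key", starting "pos" bits from the MSB.
     * The result is the index into the tnode's child array.
     */
    static inline t_key tkey_extract_bits(t_key key, int pos, int bits)
    {
        if (bits == 0 || pos >= 32)
            return 0;
        return (key << pos) >> (32 - bits);
    }

    /* so one lookup step is:
     *   n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
     */
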
Path Compression / skipped bits
    Any given tnode is linked to from the child array of its parent, using
    a segment of the key specified by the parent's "pos" and "bits".
    In certain cases, this tnode's own "pos" will not be immediately
    adjacent to the parent's (pos+bits); some bits in the key are skipped
    over because they represent a single path with no deviations. These
    "skipped bits" constitute Path Compression.
    Note that the search algorithm simply skips over these bits, which
    makes it necessary to save the keys in the leaves so that we can
    verify that they actually match the key we are searching for.

Level Compression / child arrays
    The trie is kept level balanced by moving, under certain conditions,
    the children of a full child (see "full_children") up one level, so
    that instead of a pure binary tree each internal node ("tnode") may
    contain an arbitrarily large array of links to several children.
    Conversely, a tnode with a mostly empty child array (see
    empty_children) may be "halved", having some of its children moved
    downwards one level, in order to avoid ever-increasing child arrays.

empty_children
    The number of positions in the child array of a given tnode that are
    NULL.

full_children
    The number of children of a given tnode that aren't path compressed
    (in other words, they aren't NULL or leaves and their "pos" is equal
    to this tnode's "pos" + "bits").

    (The word "full" here is used more in the sense of "complete" than
    as the opposite of "empty", which might be a tad confusing.)

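A child is "full" when no bits are skipped between it and its parent; a
sketch of the test (the real code keeps the full_children/empty_children
counters up to date incrementally, see tnode_put_child_reorg()):

    static inline int tnode_full(const struct tnode *tn, const struct node *n)
    {
        if (n == NULL || IS_LEAF(n))
            return 0;
        /* full iff the child's key segment starts exactly where the
         * parent's ends, i.e. nothing is path compressed away */
        return ((const struct tnode *) n)->pos == tn->pos + tn->bits;
    }
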
Comments
--------

We have tried to keep the structure of the code as close to fib_hash as
possible, to allow verification and to help with reviewing.

fib_find_node()
    A good start for understanding this code. This function implements a
    straightforward trie lookup.

fib_insert_node()
    Inserts a new leaf node in the trie. This is a bit more complicated
    than fib_find_node(). Inserting a new node means we might have to run
    the level compression algorithm on part of the trie.

trie_leaf_remove()
    Looks up a key, deletes it and runs the level compression algorithm.

trie_rebalance()
    The key function for the dynamic trie; after any change in the trie
    it is run to optimize and reorganize. It will walk the trie upwards
    towards the root from a given tnode, doing a resize() at each step
    to implement level compression.

resize()
    Analyzes a tnode and optimizes the child array size by either
    inflating or shrinking it repeatedly until it fulfills the criteria
    for optimal level compression. This part follows the original paper
    pretty closely and there may be some room for experimentation here.

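Concretely, the criteria boil down to the two loops below (condensed from
resize() as changed by this commit; error handling and statistics are
omitted, and inflate_threshold/halve_threshold are the tunables):

    /* Inflate (double the child array) while the node is "full enough": */
    while (tn->full_children > 0 &&
           50 * (tn->full_children + tnode_child_length(tn)
                 - tn->empty_children) >=
           inflate_threshold * tnode_child_length(tn))
        tn = inflate(t, tn, &err);

    /* Halve while too many child slots are empty: */
    while (tn->bits > 1 &&
           100 * (tnode_child_length(tn) - tn->empty_children) <
           halve_threshold * tnode_child_length(tn))
        tn = halve(t, tn, &err);
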
inflate()
    Doubles the size of the child array within a tnode. Used by resize().

halve()
    Halves the size of the child array within a tnode - the inverse of
    inflate(). Used by resize().

fn_trie_insert(), fn_trie_delete(), fn_trie_select_default()
    The route manipulation functions. Should conform pretty closely to the
    corresponding functions in fib_hash.

fn_trie_flush()
    This walks the full trie (using nextleaf()) and searches for empty
    leaves, which have to be removed.

fn_trie_dump()
    Dumps the routing table ordered by prefix length. This is somewhat
    slower than the corresponding fib_hash function, as we have to walk
    the entire trie for each prefix length. In comparison, fib_hash is
    organized as one "zone"/hash per prefix length.

Locking
-------

fib_lock is used as an RW-lock in the same way as in fib_hash. However,
the functions are somewhat separated to allow for other possible locking
scenarios. It might conceivably be possible to run trie_rebalance via RCU
and so avoid the read_lock in fn_trie_lookup().

Main lookup mechanism
---------------------
fn_trie_lookup() is the main lookup function.

The lookup is in its simplest form just like fib_find_node(). We descend
the trie, key segment by key segment, until we find a leaf. check_leaf()
then does the fib_semantic_match in the leaf's sorted prefix hlist.

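The descent itself is only a few lines; roughly (a sketch that ignores
statistics and the backtracking described below):

    struct node *n = t->trie;

    while (n != NULL && IS_TNODE(n)) {
        struct tnode *tn = (struct tnode *) n;

        n = tnode_get_child(tn,
                            tkey_extract_bits(key, tn->pos, tn->bits));
    }
    /* n is now a leaf (or NULL); check_leaf() walks its prefix hlist,
     * longest prefix first, calling fib_semantic_match() on each entry. */
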
If we find a match, we are done.

If we don't find a match, we enter prefix matching mode. The prefix
length, starting out the same as the key length, is reduced one step at a
time, and we backtrack upwards through the trie trying to find the longest
matching prefix. The goal is always to reach a leaf and get a positive
result from the fib_semantic_match mechanism.

Inside each tnode, the search for the longest matching prefix consists of
searching through the child array, chopping off (zeroing) the least
significant "1" of the child index until we find a match or the child
index consists of nothing but zeros.

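Zeroing the least significant "1" is the classic bit trick; as a sketch:

    cindex &= cindex - 1;    /* e.g. 0b0110 -> 0b0100 -> 0b0000 */

    /* when cindex reaches zero we back up one tnode and continue
     * chopping bits off the key there */
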
At this point we backtrack (t->stats.backtrack++) up the trie, continuing
to chop off part of the key in order to find the longest matching prefix.

From there we will repeatedly descend subtries to look for a match, and
there are some optimizations available that can provide us with
"shortcuts" to avoid descending into dead ends. Look for the "HL_OPTIMIZE"
sections in the code.

To alleviate any doubts about the correctness of the route selection
process, a new netlink operation has been added. Look for
NETLINK_FIB_LOOKUP, which gives userland access to fib_lookup().
@ -135,10 +135,8 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev)
|
||||
{
|
||||
struct shaper *shaper = dev->priv;
|
||||
struct sk_buff *ptr;
|
||||
|
||||
if (down_trylock(&shaper->sem))
|
||||
return -1;
|
||||
|
||||
|
||||
spin_lock(&shaper->lock);
|
||||
ptr=shaper->sendq.prev;
|
||||
|
||||
/*
|
||||
@ -232,7 +230,7 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev)
|
||||
shaper->stats.collisions++;
|
||||
}
|
||||
shaper_kick(shaper);
|
||||
up(&shaper->sem);
|
||||
spin_unlock(&shaper->lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -271,11 +269,9 @@ static void shaper_timer(unsigned long data)
|
||||
{
|
||||
struct shaper *shaper = (struct shaper *)data;
|
||||
|
||||
if (!down_trylock(&shaper->sem)) {
|
||||
shaper_kick(shaper);
|
||||
up(&shaper->sem);
|
||||
} else
|
||||
mod_timer(&shaper->timer, jiffies);
|
||||
spin_lock(&shaper->lock);
|
||||
shaper_kick(shaper);
|
||||
spin_unlock(&shaper->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -331,21 +327,6 @@ static void shaper_kick(struct shaper *shaper)
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Flush the shaper queues on a closedown
|
||||
*/
|
||||
|
||||
static void shaper_flush(struct shaper *shaper)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
|
||||
down(&shaper->sem);
|
||||
while((skb=skb_dequeue(&shaper->sendq))!=NULL)
|
||||
dev_kfree_skb(skb);
|
||||
shaper_kick(shaper);
|
||||
up(&shaper->sem);
|
||||
}
|
||||
|
||||
/*
|
||||
* Bring the interface up. We just disallow this until a
|
||||
* bind.
|
||||
@ -375,7 +356,15 @@ static int shaper_open(struct net_device *dev)
|
||||
static int shaper_close(struct net_device *dev)
|
||||
{
|
||||
struct shaper *shaper=dev->priv;
|
||||
shaper_flush(shaper);
|
||||
struct sk_buff *skb;
|
||||
|
||||
while ((skb = skb_dequeue(&shaper->sendq)) != NULL)
|
||||
dev_kfree_skb(skb);
|
||||
|
||||
spin_lock_bh(&shaper->lock);
|
||||
shaper_kick(shaper);
|
||||
spin_unlock_bh(&shaper->lock);
|
||||
|
||||
del_timer_sync(&shaper->timer);
|
||||
return 0;
|
||||
}
|
||||
@ -576,6 +565,7 @@ static void shaper_init_priv(struct net_device *dev)
|
||||
init_timer(&sh->timer);
|
||||
sh->timer.function=shaper_timer;
|
||||
sh->timer.data=(unsigned long)sh;
|
||||
spin_lock_init(&sh->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -7,6 +7,7 @@
|
||||
/* PCI config registers */
|
||||
#define PCI_DEV_REG1 0x40
|
||||
#define PCI_DEV_REG2 0x44
|
||||
#define PCI_REV_DESC 0x4
|
||||
|
||||
#define PCI_STATUS_ERROR_BITS (PCI_STATUS_DETECTED_PARITY | \
|
||||
PCI_STATUS_SIG_SYSTEM_ERROR | \
|
||||
|
@ -66,8 +66,8 @@
|
||||
|
||||
#define DRV_MODULE_NAME "tg3"
|
||||
#define PFX DRV_MODULE_NAME ": "
|
||||
#define DRV_MODULE_VERSION "3.32"
|
||||
#define DRV_MODULE_RELDATE "June 24, 2005"
|
||||
#define DRV_MODULE_VERSION "3.33"
|
||||
#define DRV_MODULE_RELDATE "July 5, 2005"
|
||||
|
||||
#define TG3_DEF_MAC_MODE 0
|
||||
#define TG3_DEF_RX_MODE 0
|
||||
@ -5117,7 +5117,7 @@ static void tg3_set_bdinfo(struct tg3 *tp, u32 bdinfo_addr,
|
||||
}
|
||||
|
||||
static void __tg3_set_rx_mode(struct net_device *);
|
||||
static void tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec)
|
||||
static void __tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec)
|
||||
{
|
||||
tw32(HOSTCC_RXCOL_TICKS, ec->rx_coalesce_usecs);
|
||||
tw32(HOSTCC_TXCOL_TICKS, ec->tx_coalesce_usecs);
|
||||
@ -5460,7 +5460,7 @@ static int tg3_reset_hw(struct tg3 *tp)
|
||||
udelay(10);
|
||||
}
|
||||
|
||||
tg3_set_coalesce(tp, &tp->coal);
|
||||
__tg3_set_coalesce(tp, &tp->coal);
|
||||
|
||||
/* set status block DMA address */
|
||||
tw32(HOSTCC_STATUS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH,
|
||||
@ -7821,6 +7821,60 @@ static int tg3_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int tg3_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
|
||||
{
|
||||
struct tg3 *tp = netdev_priv(dev);
|
||||
u32 max_rxcoal_tick_int = 0, max_txcoal_tick_int = 0;
|
||||
u32 max_stat_coal_ticks = 0, min_stat_coal_ticks = 0;
|
||||
|
||||
if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
|
||||
max_rxcoal_tick_int = MAX_RXCOAL_TICK_INT;
|
||||
max_txcoal_tick_int = MAX_TXCOAL_TICK_INT;
|
||||
max_stat_coal_ticks = MAX_STAT_COAL_TICKS;
|
||||
min_stat_coal_ticks = MIN_STAT_COAL_TICKS;
|
||||
}
|
||||
|
||||
if ((ec->rx_coalesce_usecs > MAX_RXCOL_TICKS) ||
|
||||
(ec->tx_coalesce_usecs > MAX_TXCOL_TICKS) ||
|
||||
(ec->rx_max_coalesced_frames > MAX_RXMAX_FRAMES) ||
|
||||
(ec->tx_max_coalesced_frames > MAX_TXMAX_FRAMES) ||
|
||||
(ec->rx_coalesce_usecs_irq > max_rxcoal_tick_int) ||
|
||||
(ec->tx_coalesce_usecs_irq > max_txcoal_tick_int) ||
|
||||
(ec->rx_max_coalesced_frames_irq > MAX_RXCOAL_MAXF_INT) ||
|
||||
(ec->tx_max_coalesced_frames_irq > MAX_TXCOAL_MAXF_INT) ||
|
||||
(ec->stats_block_coalesce_usecs > max_stat_coal_ticks) ||
|
||||
(ec->stats_block_coalesce_usecs < min_stat_coal_ticks))
|
||||
return -EINVAL;
|
||||
|
||||
/* No rx interrupts will be generated if both are zero */
|
||||
if ((ec->rx_coalesce_usecs == 0) &&
|
||||
(ec->rx_max_coalesced_frames == 0))
|
||||
return -EINVAL;
|
||||
|
||||
/* No tx interrupts will be generated if both are zero */
|
||||
if ((ec->tx_coalesce_usecs == 0) &&
|
||||
(ec->tx_max_coalesced_frames == 0))
|
||||
return -EINVAL;
|
||||
|
||||
/* Only copy relevant parameters, ignore all others. */
|
||||
tp->coal.rx_coalesce_usecs = ec->rx_coalesce_usecs;
|
||||
tp->coal.tx_coalesce_usecs = ec->tx_coalesce_usecs;
|
||||
tp->coal.rx_max_coalesced_frames = ec->rx_max_coalesced_frames;
|
||||
tp->coal.tx_max_coalesced_frames = ec->tx_max_coalesced_frames;
|
||||
tp->coal.rx_coalesce_usecs_irq = ec->rx_coalesce_usecs_irq;
|
||||
tp->coal.tx_coalesce_usecs_irq = ec->tx_coalesce_usecs_irq;
|
||||
tp->coal.rx_max_coalesced_frames_irq = ec->rx_max_coalesced_frames_irq;
|
||||
tp->coal.tx_max_coalesced_frames_irq = ec->tx_max_coalesced_frames_irq;
|
||||
tp->coal.stats_block_coalesce_usecs = ec->stats_block_coalesce_usecs;
|
||||
|
||||
if (netif_running(dev)) {
|
||||
tg3_full_lock(tp, 0);
|
||||
__tg3_set_coalesce(tp, &tp->coal);
|
||||
tg3_full_unlock(tp);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct ethtool_ops tg3_ethtool_ops = {
|
||||
.get_settings = tg3_get_settings,
|
||||
.set_settings = tg3_set_settings,
|
||||
@ -7856,6 +7910,7 @@ static struct ethtool_ops tg3_ethtool_ops = {
|
||||
.get_stats_count = tg3_get_stats_count,
|
||||
.get_ethtool_stats = tg3_get_ethtool_stats,
|
||||
.get_coalesce = tg3_get_coalesce,
|
||||
.set_coalesce = tg3_set_coalesce,
|
||||
};
|
||||
|
||||
static void __devinit tg3_get_eeprom_size(struct tg3 *tp)
|
||||
@ -9800,6 +9855,12 @@ static void __devinit tg3_init_coal(struct tg3 *tp)
|
||||
ec->tx_coalesce_usecs = LOW_TXCOL_TICKS_CLRTCKS;
|
||||
ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT_CLRTCKS;
|
||||
}
|
||||
|
||||
if (tp->tg3_flags2 & TG3_FLG2_5705_PLUS) {
|
||||
ec->rx_coalesce_usecs_irq = 0;
|
||||
ec->tx_coalesce_usecs_irq = 0;
|
||||
ec->stats_block_coalesce_usecs = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static int __devinit tg3_init_one(struct pci_dev *pdev,
|
||||
|
@ -879,31 +879,41 @@
|
||||
#define LOW_RXCOL_TICKS_CLRTCKS 0x00000014
|
||||
#define DEFAULT_RXCOL_TICKS 0x00000048
|
||||
#define HIGH_RXCOL_TICKS 0x00000096
|
||||
#define MAX_RXCOL_TICKS 0x000003ff
|
||||
#define HOSTCC_TXCOL_TICKS 0x00003c0c
|
||||
#define LOW_TXCOL_TICKS 0x00000096
|
||||
#define LOW_TXCOL_TICKS_CLRTCKS 0x00000048
|
||||
#define DEFAULT_TXCOL_TICKS 0x0000012c
|
||||
#define HIGH_TXCOL_TICKS 0x00000145
|
||||
#define MAX_TXCOL_TICKS 0x000003ff
|
||||
#define HOSTCC_RXMAX_FRAMES 0x00003c10
|
||||
#define LOW_RXMAX_FRAMES 0x00000005
|
||||
#define DEFAULT_RXMAX_FRAMES 0x00000008
|
||||
#define HIGH_RXMAX_FRAMES 0x00000012
|
||||
#define MAX_RXMAX_FRAMES 0x000000ff
|
||||
#define HOSTCC_TXMAX_FRAMES 0x00003c14
|
||||
#define LOW_TXMAX_FRAMES 0x00000035
|
||||
#define DEFAULT_TXMAX_FRAMES 0x0000004b
|
||||
#define HIGH_TXMAX_FRAMES 0x00000052
|
||||
#define MAX_TXMAX_FRAMES 0x000000ff
|
||||
#define HOSTCC_RXCOAL_TICK_INT 0x00003c18
|
||||
#define DEFAULT_RXCOAL_TICK_INT 0x00000019
|
||||
#define DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014
|
||||
#define MAX_RXCOAL_TICK_INT 0x000003ff
|
||||
#define HOSTCC_TXCOAL_TICK_INT 0x00003c1c
|
||||
#define DEFAULT_TXCOAL_TICK_INT 0x00000019
|
||||
#define DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014
|
||||
#define MAX_TXCOAL_TICK_INT 0x000003ff
|
||||
#define HOSTCC_RXCOAL_MAXF_INT 0x00003c20
|
||||
#define DEFAULT_RXCOAL_MAXF_INT 0x00000005
|
||||
#define MAX_RXCOAL_MAXF_INT 0x000000ff
|
||||
#define HOSTCC_TXCOAL_MAXF_INT 0x00003c24
|
||||
#define DEFAULT_TXCOAL_MAXF_INT 0x00000005
|
||||
#define MAX_TXCOAL_MAXF_INT 0x000000ff
|
||||
#define HOSTCC_STAT_COAL_TICKS 0x00003c28
|
||||
#define DEFAULT_STAT_COAL_TICKS 0x000f4240
|
||||
#define MAX_STAT_COAL_TICKS 0xd693d400
|
||||
#define MIN_STAT_COAL_TICKS 0x00000064
|
||||
/* 0x3c2c --> 0x3c30 unused */
|
||||
#define HOSTCC_STATS_BLK_HOST_ADDR 0x00003c30 /* 64-bit */
|
||||
#define HOSTCC_STATUS_BLK_HOST_ADDR 0x00003c38 /* 64-bit */
|
||||
|
@ -23,7 +23,7 @@ struct shaper
|
||||
__u32 shapeclock;
|
||||
unsigned long recovery; /* Time we can next clock a packet out on
|
||||
an empty queue */
|
||||
struct semaphore sem;
|
||||
spinlock_t lock;
|
||||
struct net_device_stats stats;
|
||||
struct net_device *dev;
|
||||
int (*hard_start_xmit) (struct sk_buff *skb,
|
||||
|
@ -183,7 +183,6 @@ struct skb_shared_info {
|
||||
* @priority: Packet queueing priority
|
||||
* @users: User count - see {datagram,tcp}.c
|
||||
* @protocol: Packet protocol from driver
|
||||
* @security: Security level of packet
|
||||
* @truesize: Buffer size
|
||||
* @head: Head of buffer
|
||||
* @data: Data head pointer
|
||||
@ -249,18 +248,18 @@ struct sk_buff {
|
||||
data_len,
|
||||
mac_len,
|
||||
csum;
|
||||
unsigned char local_df,
|
||||
cloned:1,
|
||||
nohdr:1,
|
||||
pkt_type,
|
||||
ip_summed;
|
||||
__u32 priority;
|
||||
unsigned short protocol,
|
||||
security;
|
||||
__u8 local_df:1,
|
||||
cloned:1,
|
||||
ip_summed:2,
|
||||
nohdr:1;
|
||||
/* 3 bits spare */
|
||||
__u8 pkt_type;
|
||||
__u16 protocol;
|
||||
|
||||
void (*destructor)(struct sk_buff *skb);
|
||||
#ifdef CONFIG_NETFILTER
|
||||
unsigned long nfmark;
|
||||
unsigned long nfmark;
|
||||
__u32 nfcache;
|
||||
__u32 nfctinfo;
|
||||
struct nf_conntrack *nfct;
|
||||
@ -1211,7 +1210,7 @@ static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
|
||||
{
|
||||
int hlen = skb_headlen(skb);
|
||||
|
||||
if (offset + len <= hlen)
|
||||
if (hlen - offset >= len)
|
||||
return skb->data + offset;
|
||||
|
||||
if (skb_copy_bits(skb, offset, buffer, len) < 0)
|
||||
|
@ -45,7 +45,7 @@ enum
|
||||
TCF_META_ID_REALDEV,
|
||||
TCF_META_ID_PRIORITY,
|
||||
TCF_META_ID_PROTOCOL,
|
||||
TCF_META_ID_SECURITY,
|
||||
TCF_META_ID_SECURITY, /* obsolete */
|
||||
TCF_META_ID_PKTTYPE,
|
||||
TCF_META_ID_PKTLEN,
|
||||
TCF_META_ID_DATALEN,
|
||||
|
@ -286,7 +286,7 @@ struct tcp_sock {
|
||||
__u32 max_window; /* Maximal window ever seen from peer */
|
||||
__u32 pmtu_cookie; /* Last pmtu seen by socket */
|
||||
__u32 mss_cache; /* Cached effective mss, not including SACKS */
|
||||
__u16 mss_cache_std; /* Like mss_cache, but without TSO */
|
||||
__u16 xmit_size_goal; /* Goal for segmenting output packets */
|
||||
__u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */
|
||||
__u8 ca_state; /* State of fast-retransmit machine */
|
||||
__u8 retransmits; /* Number of unrecovered RTO timeouts. */
|
||||
|
@ -13,13 +13,12 @@ struct qdisc_walker
|
||||
|
||||
extern rwlock_t qdisc_tree_lock;
|
||||
|
||||
#define QDISC_ALIGN 32
|
||||
#define QDISC_ALIGN_CONST (QDISC_ALIGN - 1)
|
||||
#define QDISC_ALIGNTO 32
|
||||
#define QDISC_ALIGN(len) (((len) + QDISC_ALIGNTO-1) & ~(QDISC_ALIGNTO-1))
|
||||
|
||||
static inline void *qdisc_priv(struct Qdisc *q)
|
||||
{
|
||||
return (char *)q + ((sizeof(struct Qdisc) + QDISC_ALIGN_CONST)
|
||||
& ~QDISC_ALIGN_CONST);
|
||||
return (char *) q + QDISC_ALIGN(sizeof(struct Qdisc));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -207,8 +206,6 @@ psched_tod_diff(int delta_sec, int bound)
|
||||
|
||||
#endif /* !CONFIG_NET_SCH_CLK_GETTIMEOFDAY */
|
||||
|
||||
extern struct Qdisc noop_qdisc;
|
||||
extern struct Qdisc_ops noop_qdisc_ops;
|
||||
extern struct Qdisc_ops pfifo_qdisc_ops;
|
||||
extern struct Qdisc_ops bfifo_qdisc_ops;
|
||||
|
||||
@ -216,14 +213,6 @@ extern int register_qdisc(struct Qdisc_ops *qops);
|
||||
extern int unregister_qdisc(struct Qdisc_ops *qops);
|
||||
extern struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle);
|
||||
extern struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle);
|
||||
extern void dev_init_scheduler(struct net_device *dev);
|
||||
extern void dev_shutdown(struct net_device *dev);
|
||||
extern void dev_activate(struct net_device *dev);
|
||||
extern void dev_deactivate(struct net_device *dev);
|
||||
extern void qdisc_reset(struct Qdisc *qdisc);
|
||||
extern void qdisc_destroy(struct Qdisc *qdisc);
|
||||
extern struct Qdisc * qdisc_create_dflt(struct net_device *dev,
|
||||
struct Qdisc_ops *ops);
|
||||
extern struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
|
||||
struct rtattr *tab);
|
||||
extern void qdisc_put_rtab(struct qdisc_rate_table *tab);
|
||||
|
@ -164,6 +164,19 @@ extern void qdisc_unlock_tree(struct net_device *dev);
|
||||
#define tcf_tree_lock(tp) qdisc_lock_tree((tp)->q->dev)
|
||||
#define tcf_tree_unlock(tp) qdisc_unlock_tree((tp)->q->dev)
|
||||
|
||||
extern struct Qdisc noop_qdisc;
|
||||
extern struct Qdisc_ops noop_qdisc_ops;
|
||||
|
||||
extern void dev_init_scheduler(struct net_device *dev);
|
||||
extern void dev_shutdown(struct net_device *dev);
|
||||
extern void dev_activate(struct net_device *dev);
|
||||
extern void dev_deactivate(struct net_device *dev);
|
||||
extern void qdisc_reset(struct Qdisc *qdisc);
|
||||
extern void qdisc_destroy(struct Qdisc *qdisc);
|
||||
extern struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops);
|
||||
extern struct Qdisc *qdisc_create_dflt(struct net_device *dev,
|
||||
struct Qdisc_ops *ops);
|
||||
|
||||
static inline void
|
||||
tcf_destroy(struct tcf_proto *tp)
|
||||
{
|
||||
|
@ -170,19 +170,14 @@ struct slcompress {
|
||||
};
|
||||
#define NULLSLCOMPR (struct slcompress *)0
|
||||
|
||||
#define __ARGS(x) x
|
||||
|
||||
/* In slhc.c: */
|
||||
struct slcompress *slhc_init __ARGS((int rslots, int tslots));
|
||||
void slhc_free __ARGS((struct slcompress *comp));
|
||||
struct slcompress *slhc_init(int rslots, int tslots);
|
||||
void slhc_free(struct slcompress *comp);
|
||||
|
||||
int slhc_compress __ARGS((struct slcompress *comp, unsigned char *icp,
|
||||
int isize, unsigned char *ocp, unsigned char **cpp,
|
||||
int compress_cid));
|
||||
int slhc_uncompress __ARGS((struct slcompress *comp, unsigned char *icp,
|
||||
int isize));
|
||||
int slhc_remember __ARGS((struct slcompress *comp, unsigned char *icp,
|
||||
int isize));
|
||||
int slhc_toss __ARGS((struct slcompress *comp));
|
||||
int slhc_compress(struct slcompress *comp, unsigned char *icp, int isize,
|
||||
unsigned char *ocp, unsigned char **cpp, int compress_cid);
|
||||
int slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize);
|
||||
int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize);
|
||||
int slhc_toss(struct slcompress *comp);
|
||||
|
||||
#endif /* _SLHC_H */
|
||||
|
@ -1134,13 +1134,16 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
|
||||
static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk,
|
||||
int size, int mem, int gfp)
|
||||
{
|
||||
struct sk_buff *skb = alloc_skb(size + sk->sk_prot->max_header, gfp);
|
||||
struct sk_buff *skb;
|
||||
int hdr_len;
|
||||
|
||||
hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header);
|
||||
skb = alloc_skb(size + hdr_len, gfp);
|
||||
if (skb) {
|
||||
skb->truesize += mem;
|
||||
if (sk->sk_forward_alloc >= (int)skb->truesize ||
|
||||
sk_stream_mem_schedule(sk, skb->truesize, 0)) {
|
||||
skb_reserve(skb, sk->sk_prot->max_header);
|
||||
skb_reserve(skb, hdr_len);
|
||||
return skb;
|
||||
}
|
||||
__kfree_skb(skb);
|
||||
|
@ -721,11 +721,16 @@ static inline int tcp_ack_scheduled(struct tcp_sock *tp)
|
||||
return tp->ack.pending&TCP_ACK_SCHED;
|
||||
}
|
||||
|
||||
static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp)
|
||||
static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp, unsigned int pkts)
|
||||
{
|
||||
if (tp->ack.quick && --tp->ack.quick == 0) {
|
||||
/* Leaving quickack mode we deflate ATO. */
|
||||
tp->ack.ato = TCP_ATO_MIN;
|
||||
if (tp->ack.quick) {
|
||||
if (pkts >= tp->ack.quick) {
|
||||
tp->ack.quick = 0;
|
||||
|
||||
/* Leaving quickack mode we deflate ATO. */
|
||||
tp->ack.ato = TCP_ATO_MIN;
|
||||
} else
|
||||
tp->ack.quick -= pkts;
|
||||
}
|
||||
}
|
||||
|
||||
@ -843,7 +848,9 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb,
|
||||
|
||||
/* tcp_output.c */
|
||||
|
||||
extern int tcp_write_xmit(struct sock *, int nonagle);
|
||||
extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
|
||||
unsigned int cur_mss, int nonagle);
|
||||
extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp);
|
||||
extern int tcp_retransmit_skb(struct sock *, struct sk_buff *);
|
||||
extern void tcp_xmit_retransmit_queue(struct sock *);
|
||||
extern void tcp_simple_retransmit(struct sock *);
|
||||
@ -855,10 +862,13 @@ extern int tcp_write_wakeup(struct sock *);
|
||||
extern void tcp_send_fin(struct sock *sk);
|
||||
extern void tcp_send_active_reset(struct sock *sk, int priority);
|
||||
extern int tcp_send_synack(struct sock *);
|
||||
extern void tcp_push_one(struct sock *, unsigned mss_now);
|
||||
extern void tcp_push_one(struct sock *, unsigned int mss_now);
|
||||
extern void tcp_send_ack(struct sock *sk);
|
||||
extern void tcp_send_delayed_ack(struct sock *sk);
|
||||
|
||||
/* tcp_input.c */
|
||||
extern void tcp_cwnd_application_limited(struct sock *sk);
|
||||
|
||||
/* tcp_timer.c */
|
||||
extern void tcp_init_xmit_timers(struct sock *);
|
||||
extern void tcp_clear_xmit_timers(struct sock *);
|
||||
@ -958,7 +968,7 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long
|
||||
static inline void tcp_initialize_rcv_mss(struct sock *sk)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
unsigned int hint = min(tp->advmss, tp->mss_cache_std);
|
||||
unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
|
||||
|
||||
hint = min(hint, tp->rcv_wnd/2);
|
||||
hint = min(hint, TCP_MIN_RCVMSS);
|
||||
@ -1225,28 +1235,6 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp)
|
||||
tp->left_out = tp->sacked_out + tp->lost_out;
|
||||
}
|
||||
|
||||
extern void tcp_cwnd_application_limited(struct sock *sk);
|
||||
|
||||
/* Congestion window validation. (RFC2861) */
|
||||
|
||||
static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
|
||||
{
|
||||
__u32 packets_out = tp->packets_out;
|
||||
|
||||
if (packets_out >= tp->snd_cwnd) {
|
||||
/* Network is feed fully. */
|
||||
tp->snd_cwnd_used = 0;
|
||||
tp->snd_cwnd_stamp = tcp_time_stamp;
|
||||
} else {
|
||||
/* Network starves. */
|
||||
if (tp->packets_out > tp->snd_cwnd_used)
|
||||
tp->snd_cwnd_used = tp->packets_out;
|
||||
|
||||
if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
|
||||
tcp_cwnd_application_limited(sk);
|
||||
}
|
||||
}
|
||||
|
||||
/* Set slow start threshould and cwnd not falling to slow start */
|
||||
static inline void __tcp_enter_cwr(struct tcp_sock *tp)
|
||||
{
|
||||
@ -1279,12 +1267,6 @@ static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp)
|
||||
return 3;
|
||||
}
|
||||
|
||||
static __inline__ int tcp_minshall_check(const struct tcp_sock *tp)
|
||||
{
|
||||
return after(tp->snd_sml,tp->snd_una) &&
|
||||
!after(tp->snd_sml, tp->snd_nxt);
|
||||
}
|
||||
|
||||
static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss,
|
||||
const struct sk_buff *skb)
|
||||
{
|
||||
@ -1292,122 +1274,18 @@ static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss,
|
||||
tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
|
||||
}
|
||||
|
||||
/* Return 0, if packet can be sent now without violation Nagle's rules:
|
||||
1. It is full sized.
|
||||
2. Or it contains FIN.
|
||||
3. Or TCP_NODELAY was set.
|
||||
4. Or TCP_CORK is not set, and all sent packets are ACKed.
|
||||
With Minshall's modification: all sent small packets are ACKed.
|
||||
*/
|
||||
|
||||
static __inline__ int
|
||||
tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb,
|
||||
unsigned mss_now, int nonagle)
|
||||
{
|
||||
return (skb->len < mss_now &&
|
||||
!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
|
||||
((nonagle&TCP_NAGLE_CORK) ||
|
||||
(!nonagle &&
|
||||
tp->packets_out &&
|
||||
tcp_minshall_check(tp))));
|
||||
}
|
||||
|
||||
extern void tcp_set_skb_tso_segs(struct sock *, struct sk_buff *);
|
||||
|
||||
/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
|
||||
* should be put on the wire right now.
|
||||
*/
|
||||
static __inline__ int tcp_snd_test(struct sock *sk,
|
||||
struct sk_buff *skb,
|
||||
unsigned cur_mss, int nonagle)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
int pkts = tcp_skb_pcount(skb);
|
||||
|
||||
if (!pkts) {
|
||||
tcp_set_skb_tso_segs(sk, skb);
|
||||
pkts = tcp_skb_pcount(skb);
|
||||
}
|
||||
|
||||
/* RFC 1122 - section 4.2.3.4
|
||||
*
|
||||
* We must queue if
|
||||
*
|
||||
* a) The right edge of this frame exceeds the window
|
||||
* b) There are packets in flight and we have a small segment
|
||||
* [SWS avoidance and Nagle algorithm]
|
||||
* (part of SWS is done on packetization)
|
||||
* Minshall version sounds: there are no _small_
|
||||
* segments in flight. (tcp_nagle_check)
|
||||
* c) We have too many packets 'in flight'
|
||||
*
|
||||
* Don't use the nagle rule for urgent data (or
|
||||
* for the final FIN -DaveM).
|
||||
*
|
||||
* Also, Nagle rule does not apply to frames, which
|
||||
* sit in the middle of queue (they have no chances
|
||||
* to get new data) and if room at tail of skb is
|
||||
* not enough to save something seriously (<32 for now).
|
||||
*/
|
||||
|
||||
/* Don't be strict about the congestion window for the
|
||||
* final FIN frame. -DaveM
|
||||
*/
|
||||
return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode
|
||||
|| !tcp_nagle_check(tp, skb, cur_mss, nonagle)) &&
|
||||
(((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) ||
|
||||
(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
|
||||
!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd));
|
||||
}
|
||||
|
||||
static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp)
|
||||
{
|
||||
if (!tp->packets_out && !tp->pending)
|
||||
tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto);
|
||||
}
|
||||
|
||||
static __inline__ int tcp_skb_is_last(const struct sock *sk,
|
||||
const struct sk_buff *skb)
|
||||
{
|
||||
return skb->next == (struct sk_buff *)&sk->sk_write_queue;
|
||||
}
|
||||
|
||||
/* Push out any pending frames which were held back due to
|
||||
* TCP_CORK or attempt at coalescing tiny packets.
|
||||
* The socket must be locked by the caller.
|
||||
*/
|
||||
static __inline__ void __tcp_push_pending_frames(struct sock *sk,
|
||||
struct tcp_sock *tp,
|
||||
unsigned cur_mss,
|
||||
int nonagle)
|
||||
{
|
||||
struct sk_buff *skb = sk->sk_send_head;
|
||||
|
||||
if (skb) {
|
||||
if (!tcp_skb_is_last(sk, skb))
|
||||
nonagle = TCP_NAGLE_PUSH;
|
||||
if (!tcp_snd_test(sk, skb, cur_mss, nonagle) ||
|
||||
tcp_write_xmit(sk, nonagle))
|
||||
tcp_check_probe_timer(sk, tp);
|
||||
}
|
||||
tcp_cwnd_validate(sk, tp);
|
||||
}
|
||||
|
||||
static __inline__ void tcp_push_pending_frames(struct sock *sk,
|
||||
struct tcp_sock *tp)
|
||||
{
|
||||
__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle);
|
||||
}
|
||||
|
||||
static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
|
||||
{
|
||||
struct sk_buff *skb = sk->sk_send_head;
|
||||
|
||||
return (skb &&
|
||||
tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
|
||||
tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle));
|
||||
}
|
||||
|
||||
static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq)
|
||||
{
|
||||
tp->snd_wl1 = seq;
|
||||
|
@ -2089,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
|
||||
{
|
||||
unsigned short old_flags = dev->flags;
|
||||
|
||||
dev->flags |= IFF_PROMISC;
|
||||
if ((dev->promiscuity += inc) == 0)
|
||||
dev->flags &= ~IFF_PROMISC;
|
||||
if (dev->flags ^ old_flags) {
|
||||
else
|
||||
dev->flags |= IFF_PROMISC;
|
||||
if (dev->flags != old_flags) {
|
||||
dev_mc_upload(dev);
|
||||
printk(KERN_INFO "device %s %s promiscuous mode\n",
|
||||
dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
|
||||
|
@ -36,7 +36,7 @@
|
||||
#include <linux/filter.h>
|
||||
|
||||
/* No hurry in this branch */
|
||||
static u8 *load_pointer(struct sk_buff *skb, int k)
|
||||
static void *__load_pointer(struct sk_buff *skb, int k)
|
||||
{
|
||||
u8 *ptr = NULL;
|
||||
|
||||
@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void *load_pointer(struct sk_buff *skb, int k,
|
||||
unsigned int size, void *buffer)
|
||||
{
|
||||
if (k >= 0)
|
||||
return skb_header_pointer(skb, k, size, buffer);
|
||||
else {
|
||||
if (k >= SKF_AD_OFF)
|
||||
return NULL;
|
||||
return __load_pointer(skb, k);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* sk_run_filter - run a filter on a socket
|
||||
* @skb: buffer to run the filter on
|
||||
@ -64,15 +76,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
|
||||
|
||||
int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
|
||||
{
|
||||
unsigned char *data = skb->data;
|
||||
/* len is UNSIGNED. Byte wide insns relies only on implicit
|
||||
type casts to prevent reading arbitrary memory locations.
|
||||
*/
|
||||
unsigned int len = skb->len-skb->data_len;
|
||||
struct sock_filter *fentry; /* We walk down these */
|
||||
void *ptr;
|
||||
u32 A = 0; /* Accumulator */
|
||||
u32 X = 0; /* Index Register */
|
||||
u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
|
||||
u32 tmp;
|
||||
int k;
|
||||
int pc;
|
||||
|
||||
@ -168,86 +177,35 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
|
||||
case BPF_LD|BPF_W|BPF_ABS:
|
||||
k = fentry->k;
|
||||
load_w:
|
||||
if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) {
|
||||
A = ntohl(*(u32*)&data[k]);
|
||||
ptr = load_pointer(skb, k, 4, &tmp);
|
||||
if (ptr != NULL) {
|
||||
A = ntohl(*(u32 *)ptr);
|
||||
continue;
|
||||
}
|
||||
if (k < 0) {
|
||||
u8 *ptr;
|
||||
|
||||
if (k >= SKF_AD_OFF)
|
||||
break;
|
||||
ptr = load_pointer(skb, k);
|
||||
if (ptr) {
|
||||
A = ntohl(*(u32*)ptr);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
u32 _tmp, *p;
|
||||
p = skb_header_pointer(skb, k, 4, &_tmp);
|
||||
if (p != NULL) {
|
||||
A = ntohl(*p);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
case BPF_LD|BPF_H|BPF_ABS:
|
||||
k = fentry->k;
|
||||
load_h:
|
||||
if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) {
|
||||
A = ntohs(*(u16*)&data[k]);
|
||||
ptr = load_pointer(skb, k, 2, &tmp);
|
||||
if (ptr != NULL) {
|
||||
A = ntohs(*(u16 *)ptr);
|
||||
continue;
|
||||
}
|
||||
if (k < 0) {
|
||||
u8 *ptr;
|
||||
|
||||
if (k >= SKF_AD_OFF)
|
||||
break;
|
||||
ptr = load_pointer(skb, k);
|
||||
if (ptr) {
|
||||
A = ntohs(*(u16*)ptr);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
u16 _tmp, *p;
|
||||
p = skb_header_pointer(skb, k, 2, &_tmp);
|
||||
if (p != NULL) {
|
||||
A = ntohs(*p);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
case BPF_LD|BPF_B|BPF_ABS:
|
||||
k = fentry->k;
|
||||
load_b:
|
||||
if (k >= 0 && (unsigned int)k < len) {
|
||||
A = data[k];
|
||||
ptr = load_pointer(skb, k, 1, &tmp);
|
||||
if (ptr != NULL) {
|
||||
A = *(u8 *)ptr;
|
||||
continue;
|
||||
}
|
||||
if (k < 0) {
|
||||
u8 *ptr;
|
||||
|
||||
if (k >= SKF_AD_OFF)
|
||||
break;
|
||||
ptr = load_pointer(skb, k);
|
||||
if (ptr) {
|
||||
A = *ptr;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
u8 _tmp, *p;
|
||||
p = skb_header_pointer(skb, k, 1, &_tmp);
|
||||
if (p != NULL) {
|
||||
A = *p;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
case BPF_LD|BPF_W|BPF_LEN:
|
||||
A = len;
|
||||
A = skb->len;
|
||||
continue;
|
||||
case BPF_LDX|BPF_W|BPF_LEN:
|
||||
X = len;
|
||||
X = skb->len;
|
||||
continue;
|
||||
case BPF_LD|BPF_W|BPF_IND:
|
||||
k = X + fentry->k;
|
||||
@ -259,10 +217,12 @@ load_b:
|
||||
k = X + fentry->k;
|
||||
goto load_b;
|
||||
case BPF_LDX|BPF_B|BPF_MSH:
|
||||
if (fentry->k >= len)
|
||||
return 0;
|
||||
X = (data[fentry->k] & 0xf) << 2;
|
||||
continue;
|
||||
ptr = load_pointer(skb, fentry->k, 1, &tmp);
|
||||
if (ptr != NULL) {
|
||||
X = (*(u8 *)ptr & 0xf) << 2;
|
||||
continue;
|
||||
}
|
||||
return 0;
|
||||
case BPF_LD|BPF_IMM:
|
||||
A = fentry->k;
|
||||
continue;
|
||||
|
@ -357,7 +357,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
|
||||
C(ip_summed);
|
||||
C(priority);
|
||||
C(protocol);
|
||||
C(security);
|
||||
n->destructor = NULL;
|
||||
#ifdef CONFIG_NETFILTER
|
||||
C(nfmark);
|
||||
@ -422,7 +421,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
|
||||
new->pkt_type = old->pkt_type;
|
||||
new->stamp = old->stamp;
|
||||
new->destructor = NULL;
|
||||
new->security = old->security;
|
||||
#ifdef CONFIG_NETFILTER
|
||||
new->nfmark = old->nfmark;
|
||||
new->nfcache = old->nfcache;
|
||||
|
@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
|
||||
if (t < s_t)
|
||||
continue;
|
||||
if (t > s_t)
|
||||
memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
|
||||
memset(&cb->args[1], 0,
|
||||
sizeof(cb->args) - sizeof(cb->args[0]));
|
||||
tb = dn_fib_get_table(t, 0);
|
||||
if (tb == NULL)
|
||||
continue;
|
||||
|
@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void)
|
||||
static int ipv4_proc_init(void);
|
||||
extern void ipfrag_init(void);
|
||||
|
||||
/*
|
||||
* IP protocol layer initialiser
|
||||
*/
|
||||
|
||||
static struct packet_type ip_packet_type = {
|
||||
.type = __constant_htons(ETH_P_IP),
|
||||
.func = ip_rcv,
|
||||
};
|
||||
|
||||
static int __init inet_init(void)
|
||||
{
|
||||
struct sk_buff *dummy_skb;
|
||||
@ -1102,6 +1111,8 @@ static int __init inet_init(void)
|
||||
|
||||
ipfrag_init();
|
||||
|
||||
dev_add_pack(&ip_packet_type);
|
||||
|
||||
rc = 0;
|
||||
out:
|
||||
return rc;
|
||||
|
@ -43,7 +43,7 @@
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#define VERSION "0.324"
|
||||
#define VERSION "0.325"
|
||||
|
||||
#include <linux/config.h>
|
||||
#include <asm/uaccess.h>
|
||||
@ -136,6 +136,7 @@ struct trie_use_stats {
|
||||
unsigned int semantic_match_passed;
|
||||
unsigned int semantic_match_miss;
|
||||
unsigned int null_node_hit;
|
||||
unsigned int resize_node_skipped;
|
||||
};
|
||||
#endif
|
||||
|
||||
@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
|
||||
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
|
||||
static int tnode_child_length(struct tnode *tn);
|
||||
static struct node *resize(struct trie *t, struct tnode *tn);
|
||||
static struct tnode *inflate(struct trie *t, struct tnode *tn);
|
||||
static struct tnode *halve(struct trie *t, struct tnode *tn);
|
||||
static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
|
||||
static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
|
||||
static void tnode_free(struct tnode *tn);
|
||||
static void trie_dump_seq(struct seq_file *seq, struct trie *t);
|
||||
extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
|
||||
@ -358,11 +359,32 @@ static inline void free_leaf_info(struct leaf_info *li)
|
||||
kfree(li);
|
||||
}
|
||||
|
||||
static struct tnode *tnode_alloc(unsigned int size)
|
||||
{
|
||||
if (size <= PAGE_SIZE) {
|
||||
return kmalloc(size, GFP_KERNEL);
|
||||
} else {
|
||||
return (struct tnode *)
|
||||
__get_free_pages(GFP_KERNEL, get_order(size));
|
||||
}
|
||||
}
|
||||
|
||||
static void __tnode_free(struct tnode *tn)
|
||||
{
|
||||
unsigned int size = sizeof(struct tnode) +
|
||||
(1<<tn->bits) * sizeof(struct node *);
|
||||
|
||||
if (size <= PAGE_SIZE)
|
||||
kfree(tn);
|
||||
else
|
||||
free_pages((unsigned long)tn, get_order(size));
|
||||
}
|
||||
|
||||
static struct tnode* tnode_new(t_key key, int pos, int bits)
|
||||
{
|
||||
int nchildren = 1<<bits;
|
||||
int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
|
||||
struct tnode *tn = kmalloc(sz, GFP_KERNEL);
|
||||
struct tnode *tn = tnode_alloc(sz);
|
||||
|
||||
if(tn) {
|
||||
memset(tn, 0, sz);
|
||||
@ -390,7 +412,7 @@ static void tnode_free(struct tnode *tn)
|
||||
printk("FL %p \n", tn);
|
||||
}
|
||||
else if(IS_TNODE(tn)) {
|
||||
kfree(tn);
|
||||
__tnode_free(tn);
|
||||
if(trie_debug > 0 )
|
||||
printk("FT %p \n", tn);
|
||||
}
|
||||
@ -460,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
|
||||
static struct node *resize(struct trie *t, struct tnode *tn)
|
||||
{
|
||||
int i;
|
||||
int err = 0;
|
||||
|
||||
if (!tn)
|
||||
return NULL;
|
||||
@ -556,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn)
|
||||
*/
|
||||
|
||||
check_tnode(tn);
|
||||
|
||||
|
||||
err = 0;
|
||||
while ((tn->full_children > 0 &&
|
||||
50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
|
||||
inflate_threshold * tnode_child_length(tn))) {
|
||||
|
||||
tn = inflate(t, tn);
|
||||
tn = inflate(t, tn, &err);
|
||||
|
||||
if(err) {
|
||||
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
||||
t->stats.resize_node_skipped++;
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
check_tnode(tn);
|
||||
@ -570,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn)
|
||||
* Halve as long as the number of empty children in this
|
||||
* node is above threshold.
|
||||
*/
|
||||
|
||||
err = 0;
|
||||
while (tn->bits > 1 &&
|
||||
100 * (tnode_child_length(tn) - tn->empty_children) <
|
||||
halve_threshold * tnode_child_length(tn))
|
||||
halve_threshold * tnode_child_length(tn)) {
|
||||
|
||||
tn = halve(t, tn, &err);
|
||||
|
||||
if(err) {
|
||||
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
||||
t->stats.resize_node_skipped++;
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
tn = halve(t, tn);
|
||||
|
||||
/* Only one child remains */
|
||||
|
||||
@ -599,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
|
||||
return (struct node *) tn;
|
||||
}
|
||||
|
||||
static struct tnode *inflate(struct trie *t, struct tnode *tn)
|
||||
static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
|
||||
{
|
||||
struct tnode *inode;
|
||||
struct tnode *oldtnode = tn;
|
||||
@ -611,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
|
||||
|
||||
tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
|
||||
|
||||
if (!tn)
|
||||
trie_bug("tnode_new failed");
|
||||
if (!tn) {
|
||||
*err = -ENOMEM;
|
||||
return oldtnode;
|
||||
}
|
||||
|
||||
/*
|
||||
* Preallocate and store tnodes before the actual work so we
|
||||
* don't get into an inconsistent state if memory allocation
|
||||
* fails. In case of failure we return the oldnode and inflate
|
||||
* of tnode is ignored.
|
||||
*/
|
||||
|
||||
for(i = 0; i < olen; i++) {
|
||||
struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
|
||||
|
||||
if (inode &&
|
||||
IS_TNODE(inode) &&
|
||||
inode->pos == oldtnode->pos + oldtnode->bits &&
|
||||
inode->bits > 1) {
|
||||
struct tnode *left, *right;
|
||||
|
||||
t_key m = TKEY_GET_MASK(inode->pos, 1);
|
||||
|
||||
left = tnode_new(inode->key&(~m), inode->pos + 1,
|
||||
inode->bits - 1);
|
||||
|
||||
if(!left) {
|
||||
*err = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
|
||||
right = tnode_new(inode->key|m, inode->pos + 1,
|
||||
inode->bits - 1);
|
||||
|
||||
if(!right) {
|
||||
*err = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
|
||||
put_child(t, tn, 2*i, (struct node *) left);
|
||||
put_child(t, tn, 2*i+1, (struct node *) right);
|
||||
}
|
||||
}
|
||||
|
||||
if(*err) {
|
||||
int size = tnode_child_length(tn);
|
||||
int j;
|
||||
|
||||
for(j = 0; j < size; j++)
|
||||
if( tn->child[j])
|
||||
tnode_free((struct tnode *)tn->child[j]);
|
||||
|
||||
tnode_free(tn);
|
||||
|
||||
*err = -ENOMEM;
|
||||
return oldtnode;
|
||||
}
|
||||
|
||||
for(i = 0; i < olen; i++) {
|
||||
struct node *node = tnode_get_child(oldtnode, i);
|
||||
@ -625,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
|
||||
|
||||
if(IS_LEAF(node) || ((struct tnode *) node)->pos >
|
||||
tn->pos + tn->bits - 1) {
|
||||
if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
|
||||
if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
|
||||
1) == 0)
|
||||
put_child(t, tn, 2*i, node);
|
||||
else
|
||||
@ -665,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
|
||||
* the position (inode->pos)
|
||||
*/
|
||||
|
||||
t_key m = TKEY_GET_MASK(inode->pos, 1);
|
||||
|
||||
/* Use the old key, but set the new significant
|
||||
* bit to zero.
|
||||
*/
|
||||
left = tnode_new(inode->key&(~m), inode->pos + 1,
|
||||
inode->bits - 1);
|
||||
|
||||
if(!left)
|
||||
trie_bug("tnode_new failed");
|
||||
|
||||
|
||||
/* Use the old key, but set the new significant
|
||||
* bit to one.
|
||||
*/
|
||||
right = tnode_new(inode->key|m, inode->pos + 1,
|
||||
inode->bits - 1);
|
||||
left = (struct tnode *) tnode_get_child(tn, 2*i);
|
||||
put_child(t, tn, 2*i, NULL);
|
||||
|
||||
if(!left)
|
||||
BUG();
|
||||
|
||||
right = (struct tnode *) tnode_get_child(tn, 2*i+1);
|
||||
put_child(t, tn, 2*i+1, NULL);
|
||||
|
||||
if(!right)
|
||||
BUG();
|
||||
|
||||
if(!right)
|
||||
trie_bug("tnode_new failed");
|
||||
|
||||
size = tnode_child_length(left);
|
||||
for(j = 0; j < size; j++) {
|
||||
put_child(t, left, j, inode->child[j]);
|
||||
@ -701,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
|
||||
return tn;
|
||||
}
|
||||
|
||||
static struct tnode *halve(struct trie *t, struct tnode *tn)
|
||||
static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
|
||||
{
|
||||
struct tnode *oldtnode = tn;
|
||||
struct node *left, *right;
|
||||
@ -712,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
|
||||
|
||||
tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
|
||||
|
||||
if(!tn)
|
||||
trie_bug("tnode_new failed");
|
||||
if (!tn) {
|
||||
*err = -ENOMEM;
|
||||
return oldtnode;
|
||||
}
|
||||
|
||||
/*
|
||||
* Preallocate and store tnodes before the actual work so we
|
||||
* don't get into an inconsistent state if memory allocation
|
||||
* fails. In case of failure we return the oldnode and halve
|
||||
* of tnode is ignored.
|
||||
*/
|
||||
|
||||
for(i = 0; i < olen; i += 2) {
|
||||
left = tnode_get_child(oldtnode, i);
|
||||
right = tnode_get_child(oldtnode, i+1);
|
||||
|
||||
/* Two nonempty children */
|
||||
if( left && right) {
|
||||
struct tnode *newBinNode =
|
||||
tnode_new(left->key, tn->pos + tn->bits, 1);
|
||||
|
||||
if(!newBinNode) {
|
||||
*err = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
put_child(t, tn, i/2, (struct node *)newBinNode);
|
||||
}
|
||||
}
|
||||
|
||||
if(*err) {
|
||||
int size = tnode_child_length(tn);
|
||||
int j;
|
||||
|
||||
for(j = 0; j < size; j++)
|
||||
if( tn->child[j])
|
||||
tnode_free((struct tnode *)tn->child[j]);
|
||||
|
||||
tnode_free(tn);
|
||||
|
||||
*err = -ENOMEM;
|
||||
return oldtnode;
|
||||
}
|
||||
|
||||
for(i = 0; i < olen; i += 2) {
|
||||
left = tnode_get_child(oldtnode, i);
|
||||
@ -730,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
|
||||
/* Two nonempty children */
|
||||
else {
|
||||
struct tnode *newBinNode =
|
||||
tnode_new(left->key, tn->pos + tn->bits, 1);
|
||||
(struct tnode *) tnode_get_child(tn, i/2);
|
||||
put_child(t, tn, i/2, NULL);
|
||||
|
||||
if(!newBinNode)
|
||||
trie_bug("tnode_new failed");
|
||||
BUG();
|
||||
|
||||
put_child(t, newBinNode, 0, left);
|
||||
put_child(t, newBinNode, 1, right);
|
||||
@ -2301,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
|
||||
seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
|
||||
seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
|
||||
seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
|
||||
seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
|
||||
#ifdef CLEAR_STATS
|
||||
memset(&(t->stats), 0, sizeof(t->stats));
|
||||
#endif
|
||||
|
@ -389,7 +389,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
|
||||
to->pkt_type = from->pkt_type;
|
||||
to->priority = from->priority;
|
||||
to->protocol = from->protocol;
|
||||
to->security = from->security;
|
||||
dst_release(to->dst);
|
||||
to->dst = dst_clone(from->dst);
|
||||
to->dev = from->dev;
|
||||
@ -1329,23 +1328,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
|
||||
ip_rt_put(rt);
|
||||
}
|
||||
|
||||
/*
|
||||
* IP protocol layer initialiser
|
||||
*/
|
||||
|
||||
static struct packet_type ip_packet_type = {
|
||||
.type = __constant_htons(ETH_P_IP),
|
||||
.func = ip_rcv,
|
||||
};
|
||||
|
||||
/*
|
||||
* IP registers the packet type and then calls the subprotocol initialisers
|
||||
*/
|
||||
|
||||
void __init ip_init(void)
|
||||
{
|
||||
dev_add_pack(&ip_packet_type);
|
||||
|
||||
ip_rt_init();
|
||||
inet_initpeers();
|
||||
|
||||
|
net/ipv4/route.c | 124
@ -54,6 +54,7 @@
|
||||
* Marc Boucher : routing by fwmark
|
||||
* Robert Olsson : Added rt_cache statistics
|
||||
* Arnaldo C. Melo : Convert proc stuff to seq_file
|
||||
* Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
@ -70,6 +71,7 @@
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/sockios.h>
|
||||
@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
|
||||
|
||||
struct rt_hash_bucket {
|
||||
struct rtable *chain;
|
||||
spinlock_t lock;
|
||||
} __attribute__((__aligned__(8)));
|
||||
};
|
||||
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
|
||||
/*
|
||||
* Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
|
||||
* The size of this table is a power of two and depends on the number of CPUS.
|
||||
*/
|
||||
#if NR_CPUS >= 32
|
||||
#define RT_HASH_LOCK_SZ 4096
|
||||
#elif NR_CPUS >= 16
|
||||
#define RT_HASH_LOCK_SZ 2048
|
||||
#elif NR_CPUS >= 8
|
||||
#define RT_HASH_LOCK_SZ 1024
|
||||
#elif NR_CPUS >= 4
|
||||
#define RT_HASH_LOCK_SZ 512
|
||||
#else
|
||||
#define RT_HASH_LOCK_SZ 256
|
||||
#endif
|
||||
|
||||
static spinlock_t *rt_hash_locks;
|
||||
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
|
||||
# define rt_hash_lock_init() { \
|
||||
int i; \
|
||||
rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
|
||||
if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
|
||||
for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
|
||||
spin_lock_init(&rt_hash_locks[i]); \
|
||||
}
|
||||
#else
|
||||
# define rt_hash_lock_addr(slot) NULL
|
||||
# define rt_hash_lock_init()
|
||||
#endif
|
||||
|
||||
static struct rt_hash_bucket *rt_hash_table;
|
||||
static unsigned rt_hash_mask;
|
||||
@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
|
||||
/* This runs via a timer and thus is always in BH context. */
|
||||
static void rt_check_expire(unsigned long dummy)
|
||||
{
|
||||
static int rover;
|
||||
int i = rover, t;
|
||||
static unsigned int rover;
|
||||
unsigned int i = rover, goal;
|
||||
struct rtable *rth, **rthp;
|
||||
unsigned long now = jiffies;
|
||||
u64 mult;
|
||||
|
||||
for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
|
||||
t -= ip_rt_gc_timeout) {
|
||||
mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
|
||||
if (ip_rt_gc_timeout > 1)
|
||||
do_div(mult, ip_rt_gc_timeout);
|
||||
goal = (unsigned int)mult;
|
||||
if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
|
||||
for (; goal > 0; goal--) {
|
||||
unsigned long tmo = ip_rt_gc_timeout;
|
||||
|
||||
i = (i + 1) & rt_hash_mask;
|
||||
rthp = &rt_hash_table[i].chain;
|
||||
|
||||
spin_lock(&rt_hash_table[i].lock);
|
||||
if (*rthp == 0)
|
||||
continue;
|
||||
spin_lock(rt_hash_lock_addr(i));
|
||||
while ((rth = *rthp) != NULL) {
|
||||
if (rth->u.dst.expires) {
|
||||
/* Entry is expired even if it is in use */
|
||||
@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
|
||||
rt_free(rth);
|
||||
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
|
||||
}
|
||||
spin_unlock(&rt_hash_table[i].lock);
|
||||
spin_unlock(rt_hash_lock_addr(i));
|
||||
|
||||
/* Fallback loop breaker. */
|
||||
if (time_after(jiffies, now))
|
||||
break;
|
||||
}
|
||||
rover = i;
|
||||
mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
|
||||
mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
|
||||
}
|
||||
|
||||
/* This can run from both BH and non-BH contexts, the latter
|
||||
@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
|
||||
get_random_bytes(&rt_hash_rnd, 4);
|
||||
|
||||
for (i = rt_hash_mask; i >= 0; i--) {
|
||||
spin_lock_bh(&rt_hash_table[i].lock);
|
||||
spin_lock_bh(rt_hash_lock_addr(i));
|
||||
rth = rt_hash_table[i].chain;
|
||||
if (rth)
|
||||
rt_hash_table[i].chain = NULL;
|
||||
spin_unlock_bh(&rt_hash_table[i].lock);
|
||||
spin_unlock_bh(rt_hash_lock_addr(i));
|
||||
|
||||
for (; rth; rth = next) {
|
||||
next = rth->u.rt_next;
|
||||
@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
|
||||
|
||||
k = (k + 1) & rt_hash_mask;
|
||||
rthp = &rt_hash_table[k].chain;
|
||||
spin_lock_bh(&rt_hash_table[k].lock);
|
||||
spin_lock_bh(rt_hash_lock_addr(k));
|
||||
while ((rth = *rthp) != NULL) {
|
||||
if (!rt_may_expire(rth, tmo, expire)) {
|
||||
tmo >>= 1;
|
||||
@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
|
||||
goal--;
|
||||
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
|
||||
}
|
||||
spin_unlock_bh(&rt_hash_table[k].lock);
|
||||
spin_unlock_bh(rt_hash_lock_addr(k));
|
||||
if (goal <= 0)
|
||||
break;
|
||||
}
|
||||
@ -882,7 +920,7 @@ restart:
|
||||
|
||||
rthp = &rt_hash_table[hash].chain;
|
||||
|
||||
spin_lock_bh(&rt_hash_table[hash].lock);
|
||||
spin_lock_bh(rt_hash_lock_addr(hash));
|
||||
while ((rth = *rthp) != NULL) {
|
||||
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
|
||||
if (!(rth->u.dst.flags & DST_BALANCED) &&
|
||||
@ -908,7 +946,7 @@ restart:
|
||||
rth->u.dst.__use++;
|
||||
dst_hold(&rth->u.dst);
|
||||
rth->u.dst.lastuse = now;
|
||||
spin_unlock_bh(&rt_hash_table[hash].lock);
|
||||
spin_unlock_bh(rt_hash_lock_addr(hash));
|
||||
|
||||
rt_drop(rt);
|
||||
*rp = rth;
|
||||
@ -949,7 +987,7 @@ restart:
|
||||
if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
|
||||
int err = arp_bind_neighbour(&rt->u.dst);
|
||||
if (err) {
|
||||
spin_unlock_bh(&rt_hash_table[hash].lock);
|
||||
spin_unlock_bh(rt_hash_lock_addr(hash));
|
||||
|
||||
if (err != -ENOBUFS) {
|
||||
rt_drop(rt);
|
||||
@ -990,7 +1028,7 @@ restart:
|
||||
}
|
||||
#endif
|
||||
rt_hash_table[hash].chain = rt;
|
||||
spin_unlock_bh(&rt_hash_table[hash].lock);
|
||||
spin_unlock_bh(rt_hash_lock_addr(hash));
|
||||
*rp = rt;
|
||||
return 0;
|
||||
}
|
||||
@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
|
||||
{
|
||||
struct rtable **rthp;
|
||||
|
||||
spin_lock_bh(&rt_hash_table[hash].lock);
|
||||
spin_lock_bh(rt_hash_lock_addr(hash));
|
||||
ip_rt_put(rt);
|
||||
for (rthp = &rt_hash_table[hash].chain; *rthp;
|
||||
rthp = &(*rthp)->u.rt_next)
|
||||
@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
|
||||
rt_free(rt);
|
||||
break;
|
||||
}
|
||||
spin_unlock_bh(&rt_hash_table[hash].lock);
|
||||
spin_unlock_bh(rt_hash_lock_addr(hash));
|
||||
}
|
||||
|
||||
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
|
||||
@ -3073,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
|
||||
|
||||
int __init ip_rt_init(void)
|
||||
{
|
||||
int i, order, goal, rc = 0;
|
||||
int rc = 0;
|
||||
|
||||
rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
|
||||
(jiffies ^ (jiffies >> 7)));
|
||||
|
||||
#ifdef CONFIG_NET_CLS_ROUTE
|
||||
{
|
||||
int order;
|
||||
for (order = 0;
|
||||
(PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
|
||||
/* NOTHING */;
|
||||
@ -3086,6 +3126,7 @@ int __init ip_rt_init(void)
|
||||
if (!ip_rt_acct)
|
||||
panic("IP: failed to allocate ip_rt_acct\n");
|
||||
memset(ip_rt_acct, 0, PAGE_SIZE << order);
|
||||
}
|
||||
#endif
|
||||
|
||||
ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
|
||||
@ -3096,36 +3137,19 @@ int __init ip_rt_init(void)
|
||||
if (!ipv4_dst_ops.kmem_cachep)
|
||||
panic("IP: failed to allocate ip_dst_cache\n");
|
||||
|
||||
goal = num_physpages >> (26 - PAGE_SHIFT);
|
||||
if (rhash_entries)
|
||||
goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
|
||||
for (order = 0; (1UL << order) < goal; order++)
|
||||
/* NOTHING */;
|
||||
|
||||
do {
|
||||
rt_hash_mask = (1UL << order) * PAGE_SIZE /
|
||||
sizeof(struct rt_hash_bucket);
|
||||
while (rt_hash_mask & (rt_hash_mask - 1))
|
||||
rt_hash_mask--;
|
||||
rt_hash_table = (struct rt_hash_bucket *)
|
||||
__get_free_pages(GFP_ATOMIC, order);
|
||||
} while (rt_hash_table == NULL && --order > 0);
|
||||
|
||||
if (!rt_hash_table)
|
||||
panic("Failed to allocate IP route cache hash table\n");
|
||||
|
||||
printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
|
||||
rt_hash_mask,
|
||||
(long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
|
||||
|
||||
for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
|
||||
/* NOTHING */;
|
||||
|
||||
rt_hash_mask--;
|
||||
for (i = 0; i <= rt_hash_mask; i++) {
|
||||
spin_lock_init(&rt_hash_table[i].lock);
|
||||
rt_hash_table[i].chain = NULL;
|
||||
}
|
||||
rt_hash_table = (struct rt_hash_bucket *)
|
||||
alloc_large_system_hash("IP route cache",
|
||||
sizeof(struct rt_hash_bucket),
|
||||
rhash_entries,
|
||||
(num_physpages >= 128 * 1024) ?
|
||||
(27 - PAGE_SHIFT) :
|
||||
(29 - PAGE_SHIFT),
|
||||
HASH_HIGHMEM,
|
||||
&rt_hash_log,
|
||||
&rt_hash_mask,
|
||||
0);
|
||||
memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
|
||||
rt_hash_lock_init();
|
||||
|
||||
ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
|
||||
ip_rt_max_size = (rt_hash_mask + 1) * 16;
|
||||
|
@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
|
||||
size_t psize, int flags)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
int mss_now;
|
||||
int mss_now, size_goal;
|
||||
int err;
|
||||
ssize_t copied;
|
||||
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
|
||||
@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
|
||||
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
|
||||
|
||||
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
|
||||
size_goal = tp->xmit_size_goal;
|
||||
copied = 0;
|
||||
|
||||
err = -EPIPE;
|
||||
@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
|
||||
int offset = poffset % PAGE_SIZE;
|
||||
int size = min_t(size_t, psize, PAGE_SIZE - offset);
|
||||
|
||||
if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
|
||||
if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
|
||||
new_segment:
|
||||
if (!sk_stream_memory_free(sk))
|
||||
goto wait_for_sndbuf;
|
||||
@ -652,7 +653,7 @@ new_segment:
|
||||
goto wait_for_memory;
|
||||
|
||||
skb_entail(sk, tp, skb);
|
||||
copy = mss_now;
|
||||
copy = size_goal;
|
||||
}
|
||||
|
||||
if (copy > size)
|
||||
@ -693,7 +694,7 @@ new_segment:
|
||||
if (!(psize -= copy))
|
||||
goto out;
|
||||
|
||||
if (skb->len != mss_now || (flags & MSG_OOB))
|
||||
if (skb->len < mss_now || (flags & MSG_OOB))
|
||||
continue;
|
||||
|
||||
if (forced_push(tp)) {
|
||||
@ -713,6 +714,7 @@ wait_for_memory:
|
||||
goto do_error;
|
||||
|
||||
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
|
||||
size_goal = tp->xmit_size_goal;
|
||||
}
|
||||
|
||||
out:
|
||||
@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
|
||||
|
||||
static inline int select_size(struct sock *sk, struct tcp_sock *tp)
|
||||
{
|
||||
int tmp = tp->mss_cache_std;
|
||||
int tmp = tp->mss_cache;
|
||||
|
||||
if (sk->sk_route_caps & NETIF_F_SG) {
|
||||
int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
|
||||
if (sk->sk_route_caps & NETIF_F_TSO)
|
||||
tmp = 0;
|
||||
else {
|
||||
int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
|
||||
|
||||
if (tmp >= pgbreak &&
|
||||
tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
|
||||
tmp = pgbreak;
|
||||
if (tmp >= pgbreak &&
|
||||
tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
|
||||
tmp = pgbreak;
|
||||
}
|
||||
}
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct sk_buff *skb;
|
||||
int iovlen, flags;
|
||||
int mss_now;
|
||||
int mss_now, size_goal;
|
||||
int err, copied;
|
||||
long timeo;
|
||||
|
||||
@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
|
||||
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
|
||||
|
||||
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
|
||||
size_goal = tp->xmit_size_goal;
|
||||
|
||||
/* Ok commence sending. */
|
||||
iovlen = msg->msg_iovlen;
|
||||
@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
|
||||
skb = sk->sk_write_queue.prev;
|
||||
|
||||
if (!sk->sk_send_head ||
|
||||
(copy = mss_now - skb->len) <= 0) {
|
||||
(copy = size_goal - skb->len) <= 0) {
|
||||
|
||||
new_segment:
|
||||
/* Allocate new segment. If the interface is SG,
|
||||
@ -837,7 +845,7 @@ new_segment:
|
||||
skb->ip_summed = CHECKSUM_HW;
|
||||
|
||||
skb_entail(sk, tp, skb);
|
||||
copy = mss_now;
|
||||
copy = size_goal;
|
||||
}
|
||||
|
||||
/* Try to append data to the end of skb. */
|
||||
@ -872,11 +880,6 @@ new_segment:
|
||||
tcp_mark_push(tp, skb);
|
||||
goto new_segment;
|
||||
} else if (page) {
|
||||
/* If page is cached, align
|
||||
* offset to L1 cache boundary
|
||||
*/
|
||||
off = (off + L1_CACHE_BYTES - 1) &
|
||||
~(L1_CACHE_BYTES - 1);
|
||||
if (off == PAGE_SIZE) {
|
||||
put_page(page);
|
||||
TCP_PAGE(sk) = page = NULL;
|
||||
@ -937,7 +940,7 @@ new_segment:
|
||||
if ((seglen -= copy) == 0 && iovlen == 0)
|
||||
goto out;
|
||||
|
||||
if (skb->len != mss_now || (flags & MSG_OOB))
|
||||
if (skb->len < mss_now || (flags & MSG_OOB))
|
||||
continue;
|
||||
|
||||
if (forced_push(tp)) {
|
||||
@ -957,6 +960,7 @@ wait_for_memory:
|
||||
goto do_error;
|
||||
|
||||
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
|
||||
size_goal = tp->xmit_size_goal;
|
||||
}
|
||||
}
|
||||
|
||||
@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
|
||||
|
||||
info->tcpi_rto = jiffies_to_usecs(tp->rto);
|
||||
info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
|
||||
info->tcpi_snd_mss = tp->mss_cache_std;
|
||||
info->tcpi_snd_mss = tp->mss_cache;
|
||||
info->tcpi_rcv_mss = tp->ack.rcv_mss;
|
||||
|
||||
info->tcpi_unacked = tp->packets_out;
|
||||
@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
|
||||
|
||||
switch (optname) {
|
||||
case TCP_MAXSEG:
|
||||
val = tp->mss_cache_std;
|
||||
val = tp->mss_cache;
|
||||
if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
|
||||
val = tp->rx_opt.user_mss;
|
||||
break;
|
||||
|
@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
|
||||
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
|
||||
|
||||
if (!cwnd) {
|
||||
if (tp->mss_cache_std > 1460)
|
||||
if (tp->mss_cache > 1460)
|
||||
cwnd = 2;
|
||||
else
|
||||
cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
|
||||
cwnd = (tp->mss_cache > 1095) ? 3 : 4;
|
||||
}
|
||||
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
|
||||
}
|
||||
@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
|
||||
if (sk->sk_route_caps & NETIF_F_TSO) {
|
||||
sk->sk_route_caps &= ~NETIF_F_TSO;
|
||||
sock_set_flag(sk, SOCK_NO_LARGESEND);
|
||||
tp->mss_cache = tp->mss_cache_std;
|
||||
tp->mss_cache = tp->mss_cache;
|
||||
}
|
||||
|
||||
if (!tp->sacked_out)
|
||||
@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
|
||||
(IsFack(tp) ||
|
||||
!before(lost_retrans,
|
||||
TCP_SKB_CB(skb)->ack_seq + tp->reordering *
|
||||
tp->mss_cache_std))) {
|
||||
tp->mss_cache))) {
|
||||
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
|
||||
tp->retrans_out -= tcp_skb_pcount(skb);
|
||||
|
||||
@ -1957,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
|
||||
}
|
||||
}
|
||||
|
||||
/* There is one downside to this scheme. Although we keep the
|
||||
* ACK clock ticking, adjusting packet counters and advancing
|
||||
* congestion window, we do not liberate socket send buffer
|
||||
* space.
|
||||
*
|
||||
* Mucking with skb->truesize and sk->sk_wmem_alloc et al.
|
||||
* then making a write space wakeup callback is a possible
|
||||
* future enhancement. WARNING: it is not trivial to make.
|
||||
*/
|
||||
static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
|
||||
__u32 now, __s32 *seq_rtt)
|
||||
{
|
||||
@ -2047,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
|
||||
* the other end.
|
||||
*/
|
||||
if (after(scb->end_seq, tp->snd_una)) {
|
||||
if (tcp_skb_pcount(skb) > 1)
|
||||
if (tcp_skb_pcount(skb) > 1 &&
|
||||
after(tp->snd_una, scb->seq))
|
||||
acked |= tcp_tso_acked(sk, skb,
|
||||
now, &seq_rtt);
|
||||
break;
|
||||
@ -3308,6 +3300,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
|
||||
tp->snd_cwnd_stamp = tcp_time_stamp;
|
||||
}
|
||||
|
||||
static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
|
||||
{
|
||||
/* If the user specified a specific send buffer setting, do
|
||||
* not modify it.
|
||||
*/
|
||||
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
|
||||
return 0;
|
||||
|
||||
/* If we are under global TCP memory pressure, do not expand. */
|
||||
if (tcp_memory_pressure)
|
||||
return 0;
|
||||
|
||||
/* If we are under soft global TCP memory pressure, do not expand. */
|
||||
if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
|
||||
return 0;
|
||||
|
||||
/* If we filled the congestion window, do not expand. */
|
||||
if (tp->packets_out >= tp->snd_cwnd)
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* When incoming ACK allowed to free some skb from write_queue,
|
||||
* we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
|
||||
@ -3319,11 +3333,8 @@ static void tcp_new_space(struct sock *sk)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
|
||||
if (tp->packets_out < tp->snd_cwnd &&
|
||||
!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
|
||||
!tcp_memory_pressure &&
|
||||
atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
|
||||
int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
|
||||
if (tcp_should_expand_sndbuf(sk, tp)) {
|
||||
int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
|
||||
MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
|
||||
demanded = max_t(unsigned int, tp->snd_cwnd,
|
||||
tp->reordering + 1);
|
||||
@ -3346,22 +3357,9 @@ static inline void tcp_check_space(struct sock *sk)
|
||||
}
|
||||
}
|
||||
|
||||
static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
|
||||
static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
|
||||
if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
|
||||
tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
|
||||
tcp_write_xmit(sk, tp->nonagle))
|
||||
tcp_check_probe_timer(sk, tp);
|
||||
}
|
||||
|
||||
static __inline__ void tcp_data_snd_check(struct sock *sk)
|
||||
{
|
||||
struct sk_buff *skb = sk->sk_send_head;
|
||||
|
||||
if (skb != NULL)
|
||||
__tcp_data_snd_check(sk, skb);
|
||||
tcp_push_pending_frames(sk, tp);
|
||||
tcp_check_space(sk);
|
||||
}
|
||||
|
||||
@ -3655,7 +3653,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
|
||||
*/
|
||||
tcp_ack(sk, skb, 0);
|
||||
__kfree_skb(skb);
|
||||
tcp_data_snd_check(sk);
|
||||
tcp_data_snd_check(sk, tp);
|
||||
return 0;
|
||||
} else { /* Header too small */
|
||||
TCP_INC_STATS_BH(TCP_MIB_INERRS);
|
||||
@ -3721,7 +3719,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
|
||||
if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
|
||||
/* Well, only one small jumplet in fast path... */
|
||||
tcp_ack(sk, skb, FLAG_DATA);
|
||||
tcp_data_snd_check(sk);
|
||||
tcp_data_snd_check(sk, tp);
|
||||
if (!tcp_ack_scheduled(tp))
|
||||
goto no_ack;
|
||||
}
|
||||
@ -3799,7 +3797,7 @@ step5:
|
||||
/* step 7: process the segment text */
|
||||
tcp_data_queue(sk, skb);
|
||||
|
||||
tcp_data_snd_check(sk);
|
||||
tcp_data_snd_check(sk, tp);
|
||||
tcp_ack_snd_check(sk);
|
||||
return 0;
|
||||
|
||||
@ -4109,7 +4107,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
|
||||
/* Do step6 onward by hand. */
|
||||
tcp_urg(sk, skb, th);
|
||||
__kfree_skb(skb);
|
||||
tcp_data_snd_check(sk);
|
||||
tcp_data_snd_check(sk, tp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -4300,7 +4298,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
|
||||
|
||||
/* tcp_data could move socket to TIME-WAIT */
|
||||
if (sk->sk_state != TCP_CLOSE) {
|
||||
tcp_data_snd_check(sk);
|
||||
tcp_data_snd_check(sk, tp);
|
||||
tcp_ack_snd_check(sk);
|
||||
}
|
||||
|
||||
|
@@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
tp->snd_cwnd_clamp = ~0;
tp->mss_cache_std = tp->mss_cache = 536;
tp->mss_cache = 536;

tp->reordering = sysctl_tcp_reordering;
tp->ca_ops = &tcp_init_congestion_ops;
@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
|
||||
* will allow a single TSO frame to consume. Building TSO frames
|
||||
* which are too large can cause TCP streams to be bursty.
|
||||
*/
|
||||
int sysctl_tcp_tso_win_divisor = 8;
|
||||
int sysctl_tcp_tso_win_divisor = 3;
|
||||
|
||||
static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
|
||||
struct sk_buff *skb)
|
||||
@ -140,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
|
||||
tp->ack.pingpong = 1;
|
||||
}
|
||||
|
||||
static __inline__ void tcp_event_ack_sent(struct sock *sk)
|
||||
static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
|
||||
tcp_dec_quickack_mode(tp);
|
||||
tcp_dec_quickack_mode(tp, pkts);
|
||||
tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
|
||||
}
|
||||
|
||||
@ -355,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
|
||||
tp->af_specific->send_check(sk, th, skb->len, skb);
|
||||
|
||||
if (tcb->flags & TCPCB_FLAG_ACK)
|
||||
tcp_event_ack_sent(sk);
|
||||
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
|
||||
|
||||
if (skb->len != tcp_header_size)
|
||||
tcp_event_data_sent(tp, skb, sk);
|
||||
@ -403,42 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
|
||||
sk->sk_send_head = skb;
|
||||
}
|
||||
|
||||
static inline void tcp_tso_set_push(struct sk_buff *skb)
|
||||
{
|
||||
/* Force push to be on for any TSO frames to workaround
|
||||
* problems with busted implementations like Mac OS-X that
|
||||
* hold off socket receive wakeups until push is seen.
|
||||
*/
|
||||
if (tcp_skb_pcount(skb) > 1)
|
||||
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
|
||||
}
|
||||
|
||||
/* Send _single_ skb sitting at the send head. This function requires
|
||||
* true push pending frames to setup probe timer etc.
|
||||
*/
|
||||
void tcp_push_one(struct sock *sk, unsigned cur_mss)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct sk_buff *skb = sk->sk_send_head;
|
||||
|
||||
if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
|
||||
/* Send it out now. */
|
||||
TCP_SKB_CB(skb)->when = tcp_time_stamp;
|
||||
tcp_tso_set_push(skb);
|
||||
if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
|
||||
sk->sk_send_head = NULL;
|
||||
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
|
||||
tcp_packets_out_inc(sk, tp, skb);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
|
||||
static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
|
||||
if (skb->len <= tp->mss_cache_std ||
|
||||
if (skb->len <= tp->mss_cache ||
|
||||
!(sk->sk_route_caps & NETIF_F_TSO)) {
|
||||
/* Avoid the costly divide in the normal
|
||||
* non-TSO case.
|
||||
@ -448,10 +417,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
|
||||
} else {
|
||||
unsigned int factor;
|
||||
|
||||
factor = skb->len + (tp->mss_cache_std - 1);
|
||||
factor /= tp->mss_cache_std;
|
||||
factor = skb->len + (tp->mss_cache - 1);
|
||||
factor /= tp->mss_cache;
|
||||
skb_shinfo(skb)->tso_segs = factor;
|
||||
skb_shinfo(skb)->tso_size = tp->mss_cache_std;
|
||||
skb_shinfo(skb)->tso_size = tp->mss_cache;
|
||||
}
|
||||
}
|
||||
|
||||
@ -537,6 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
|
||||
}
|
||||
|
||||
/* Link BUFF into the send queue. */
|
||||
skb_header_release(buff);
|
||||
__skb_append(skb, buff);
|
||||
|
||||
return 0;
|
||||
@ -657,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
|
||||
|
||||
/* And store cached results */
|
||||
tp->pmtu_cookie = pmtu;
|
||||
tp->mss_cache = tp->mss_cache_std = mss_now;
|
||||
tp->mss_cache = mss_now;
|
||||
|
||||
return mss_now;
|
||||
}
|
||||
@ -669,59 +639,318 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
|
||||
* cannot be large. However, taking into account rare use of URG, this
|
||||
* is not a big flaw.
|
||||
*/
|
||||
|
||||
unsigned int tcp_current_mss(struct sock *sk, int large)
|
||||
unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct dst_entry *dst = __sk_dst_get(sk);
|
||||
unsigned int do_large, mss_now;
|
||||
u32 mss_now;
|
||||
u16 xmit_size_goal;
|
||||
int doing_tso = 0;
|
||||
|
||||
mss_now = tp->mss_cache;
|
||||
|
||||
if (large_allowed &&
|
||||
(sk->sk_route_caps & NETIF_F_TSO) &&
|
||||
!tp->urg_mode)
|
||||
doing_tso = 1;
|
||||
|
||||
mss_now = tp->mss_cache_std;
|
||||
if (dst) {
|
||||
u32 mtu = dst_mtu(dst);
|
||||
if (mtu != tp->pmtu_cookie)
|
||||
mss_now = tcp_sync_mss(sk, mtu);
|
||||
}
|
||||
|
||||
do_large = (large &&
|
||||
(sk->sk_route_caps & NETIF_F_TSO) &&
|
||||
!tp->urg_mode);
|
||||
|
||||
if (do_large) {
|
||||
unsigned int large_mss, factor, limit;
|
||||
|
||||
large_mss = 65535 - tp->af_specific->net_header_len -
|
||||
tp->ext_header_len - tp->tcp_header_len;
|
||||
|
||||
if (tp->max_window && large_mss > (tp->max_window>>1))
|
||||
large_mss = max((tp->max_window>>1),
|
||||
68U - tp->tcp_header_len);
|
||||
|
||||
factor = large_mss / mss_now;
|
||||
|
||||
/* Always keep large mss multiple of real mss, but
|
||||
* do not exceed 1/tso_win_divisor of the congestion window
|
||||
* so we can keep the ACK clock ticking and minimize
|
||||
* bursting.
|
||||
*/
|
||||
limit = tp->snd_cwnd;
|
||||
if (sysctl_tcp_tso_win_divisor)
|
||||
limit /= sysctl_tcp_tso_win_divisor;
|
||||
limit = max(1U, limit);
|
||||
if (factor > limit)
|
||||
factor = limit;
|
||||
|
||||
tp->mss_cache = mss_now * factor;
|
||||
|
||||
mss_now = tp->mss_cache;
|
||||
}
|
||||
|
||||
if (tp->rx_opt.eff_sacks)
|
||||
mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
|
||||
(tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
|
||||
|
||||
xmit_size_goal = mss_now;
|
||||
|
||||
if (doing_tso) {
|
||||
xmit_size_goal = 65535 -
|
||||
tp->af_specific->net_header_len -
|
||||
tp->ext_header_len - tp->tcp_header_len;
|
||||
|
||||
if (tp->max_window &&
|
||||
(xmit_size_goal > (tp->max_window >> 1)))
|
||||
xmit_size_goal = max((tp->max_window >> 1),
|
||||
68U - tp->tcp_header_len);
|
||||
|
||||
xmit_size_goal -= (xmit_size_goal % mss_now);
|
||||
}
|
||||
tp->xmit_size_goal = xmit_size_goal;
|
||||
|
||||
return mss_now;
|
||||
}
|
||||
|
||||
/* Congestion window validation. (RFC2861) */
|
||||
|
||||
static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
|
||||
{
|
||||
__u32 packets_out = tp->packets_out;
|
||||
|
||||
if (packets_out >= tp->snd_cwnd) {
|
||||
/* Network is feed fully. */
|
||||
tp->snd_cwnd_used = 0;
|
||||
tp->snd_cwnd_stamp = tcp_time_stamp;
|
||||
} else {
|
||||
/* Network starves. */
|
||||
if (tp->packets_out > tp->snd_cwnd_used)
|
||||
tp->snd_cwnd_used = tp->packets_out;
|
||||
|
||||
if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
|
||||
tcp_cwnd_application_limited(sk);
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
|
||||
{
|
||||
u32 window, cwnd_len;
|
||||
|
||||
window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
|
||||
cwnd_len = mss_now * cwnd;
|
||||
return min(window, cwnd_len);
|
||||
}
|
||||
|
||||
/* Can at least one segment of SKB be sent right now, according to the
|
||||
* congestion window rules? If so, return how many segments are allowed.
|
||||
*/
|
||||
static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
|
||||
{
|
||||
u32 in_flight, cwnd;
|
||||
|
||||
/* Don't be strict about the congestion window for the final FIN. */
|
||||
if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
|
||||
return 1;
|
||||
|
||||
in_flight = tcp_packets_in_flight(tp);
|
||||
cwnd = tp->snd_cwnd;
|
||||
if (in_flight < cwnd)
|
||||
return (cwnd - in_flight);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* This must be invoked the first time we consider transmitting
|
||||
* SKB onto the wire.
|
||||
*/
|
||||
static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
int tso_segs = tcp_skb_pcount(skb);
|
||||
|
||||
if (!tso_segs) {
|
||||
tcp_set_skb_tso_segs(sk, skb);
|
||||
tso_segs = tcp_skb_pcount(skb);
|
||||
}
|
||||
return tso_segs;
|
||||
}
|
||||
|
||||
static inline int tcp_minshall_check(const struct tcp_sock *tp)
|
||||
{
|
||||
return after(tp->snd_sml,tp->snd_una) &&
|
||||
!after(tp->snd_sml, tp->snd_nxt);
|
||||
}
|
||||
|
||||
/* Return 0, if packet can be sent now without violation Nagle's rules:
|
||||
* 1. It is full sized.
|
||||
* 2. Or it contains FIN. (already checked by caller)
|
||||
* 3. Or TCP_NODELAY was set.
|
||||
* 4. Or TCP_CORK is not set, and all sent packets are ACKed.
|
||||
* With Minshall's modification: all sent small packets are ACKed.
|
||||
*/
|
||||
|
||||
static inline int tcp_nagle_check(const struct tcp_sock *tp,
|
||||
const struct sk_buff *skb,
|
||||
unsigned mss_now, int nonagle)
|
||||
{
|
||||
return (skb->len < mss_now &&
|
||||
((nonagle&TCP_NAGLE_CORK) ||
|
||||
(!nonagle &&
|
||||
tp->packets_out &&
|
||||
tcp_minshall_check(tp))));
|
||||
}
|
||||
|
||||
/* Return non-zero if the Nagle test allows this packet to be
|
||||
* sent now.
|
||||
*/
|
||||
static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
|
||||
unsigned int cur_mss, int nonagle)
|
||||
{
|
||||
/* Nagle rule does not apply to frames, which sit in the middle of the
|
||||
* write_queue (they have no chances to get new data).
|
||||
*
|
||||
* This is implemented in the callers, where they modify the 'nonagle'
|
||||
* argument based upon the location of SKB in the send queue.
|
||||
*/
|
||||
if (nonagle & TCP_NAGLE_PUSH)
|
||||
return 1;
|
||||
|
||||
/* Don't use the nagle rule for urgent data (or for the final FIN). */
|
||||
if (tp->urg_mode ||
|
||||
(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
|
||||
return 1;
|
||||
|
||||
if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Does at least the first segment of SKB fit into the send window? */
|
||||
static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
|
||||
{
|
||||
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
|
||||
|
||||
if (skb->len > cur_mss)
|
||||
end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
|
||||
|
||||
return !after(end_seq, tp->snd_una + tp->snd_wnd);
|
||||
}
|
||||
|
||||
/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
|
||||
* should be put on the wire right now. If so, it returns the number of
|
||||
* packets allowed by the congestion window.
|
||||
*/
|
||||
static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
|
||||
unsigned int cur_mss, int nonagle)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
unsigned int cwnd_quota;
|
||||
|
||||
tcp_init_tso_segs(sk, skb);
|
||||
|
||||
if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
|
||||
return 0;
|
||||
|
||||
cwnd_quota = tcp_cwnd_test(tp, skb);
|
||||
if (cwnd_quota &&
|
||||
!tcp_snd_wnd_test(tp, skb, cur_mss))
|
||||
cwnd_quota = 0;
|
||||
|
||||
return cwnd_quota;
|
||||
}
|
||||
|
||||
static inline int tcp_skb_is_last(const struct sock *sk,
|
||||
const struct sk_buff *skb)
|
||||
{
|
||||
return skb->next == (struct sk_buff *)&sk->sk_write_queue;
|
||||
}
|
||||
|
||||
int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
|
||||
{
|
||||
struct sk_buff *skb = sk->sk_send_head;
|
||||
|
||||
return (skb &&
|
||||
tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
|
||||
(tcp_skb_is_last(sk, skb) ?
|
||||
TCP_NAGLE_PUSH :
|
||||
tp->nonagle)));
|
||||
}
|
||||
|
||||
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
|
||||
* which is put after SKB on the list. It is very much like
|
||||
* tcp_fragment() except that it may make several kinds of assumptions
|
||||
* in order to speed up the splitting operation. In particular, we
|
||||
* know that all the data is in scatter-gather pages, and that the
|
||||
* packet has never been sent out before (and thus is not cloned).
|
||||
*/
|
||||
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
|
||||
{
|
||||
struct sk_buff *buff;
|
||||
int nlen = skb->len - len;
|
||||
u16 flags;
|
||||
|
||||
/* All of a TSO frame must be composed of paged data. */
|
||||
BUG_ON(skb->len != skb->data_len);
|
||||
|
||||
buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
|
||||
if (unlikely(buff == NULL))
|
||||
return -ENOMEM;
|
||||
|
||||
buff->truesize = nlen;
|
||||
skb->truesize -= nlen;
|
||||
|
||||
/* Correct the sequence numbers. */
|
||||
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
|
||||
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
|
||||
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
|
||||
|
||||
/* PSH and FIN should only be set in the second packet. */
|
||||
flags = TCP_SKB_CB(skb)->flags;
|
||||
TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
|
||||
TCP_SKB_CB(buff)->flags = flags;
|
||||
|
||||
/* This packet was never sent out yet, so no SACK bits. */
|
||||
TCP_SKB_CB(buff)->sacked = 0;
|
||||
|
||||
buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
|
||||
skb_split(skb, buff, len);
|
||||
|
||||
/* Fix up tso_factor for both original and new SKB. */
|
||||
tcp_set_skb_tso_segs(sk, skb);
|
||||
tcp_set_skb_tso_segs(sk, buff);
|
||||
|
||||
/* Link BUFF into the send queue. */
|
||||
skb_header_release(buff);
|
||||
__skb_append(skb, buff);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Try to defer sending, if possible, in order to minimize the amount
|
||||
* of TSO splitting we do. View it as a kind of TSO Nagle test.
|
||||
*
|
||||
* This algorithm is from John Heffner.
|
||||
*/
|
||||
static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
|
||||
{
|
||||
u32 send_win, cong_win, limit, in_flight;
|
||||
|
||||
if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
|
||||
return 0;
|
||||
|
||||
if (tp->ca_state != TCP_CA_Open)
|
||||
return 0;
|
||||
|
||||
in_flight = tcp_packets_in_flight(tp);
|
||||
|
||||
BUG_ON(tcp_skb_pcount(skb) <= 1 ||
|
||||
(tp->snd_cwnd <= in_flight));
|
||||
|
||||
send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
|
||||
|
||||
/* From in_flight test above, we know that cwnd > in_flight. */
|
||||
cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
|
||||
|
||||
limit = min(send_win, cong_win);
|
||||
|
||||
/* If sk_send_head can be sent fully now, just do it. */
|
||||
if (skb->len <= limit)
|
||||
return 0;
|
||||
|
||||
if (sysctl_tcp_tso_win_divisor) {
|
||||
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
|
||||
|
||||
/* If at least some fraction of a window is available,
|
||||
* just use it.
|
||||
*/
|
||||
chunk /= sysctl_tcp_tso_win_divisor;
|
||||
if (limit >= chunk)
|
||||
return 0;
|
||||
} else {
|
||||
/* Different approach, try not to defer past a single
|
||||
* ACK. Receiver should ACK every other full sized
|
||||
* frame, so if we have space for more than 3 frames
|
||||
* then send now.
|
||||
*/
|
||||
if (limit > tcp_max_burst(tp) * tp->mss_cache)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Ok, it looks like it is advisable to defer. */
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* This routine writes packets to the network. It advances the
|
||||
* send_head. This happens as incoming acks open up the remote
|
||||
* window for us.
|
||||
@ -729,57 +958,158 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
|
||||
* Returns 1, if no segments are in flight and we have queued segments, but
|
||||
* cannot send anything now because of SWS or another problem.
|
||||
*/
|
||||
int tcp_write_xmit(struct sock *sk, int nonagle)
|
||||
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
unsigned int mss_now;
|
||||
struct sk_buff *skb;
|
||||
unsigned int tso_segs, sent_pkts;
|
||||
int cwnd_quota;
|
||||
|
||||
/* If we are closed, the bytes will have to remain here.
|
||||
* In time closedown will finish, we empty the write queue and all
|
||||
* will be happy.
|
||||
*/
|
||||
if (sk->sk_state != TCP_CLOSE) {
|
||||
struct sk_buff *skb;
|
||||
int sent_pkts = 0;
|
||||
if (unlikely(sk->sk_state == TCP_CLOSE))
|
||||
return 0;
|
||||
|
||||
/* Account for SACKS, we may need to fragment due to this.
|
||||
* It is just like the real MSS changing on us midstream.
|
||||
* We also handle things correctly when the user adds some
|
||||
* IP options mid-stream. Silly to do, but cover it.
|
||||
*/
|
||||
mss_now = tcp_current_mss(sk, 1);
|
||||
skb = sk->sk_send_head;
|
||||
if (unlikely(!skb))
|
||||
return 0;
|
||||
|
||||
while ((skb = sk->sk_send_head) &&
|
||||
tcp_snd_test(sk, skb, mss_now,
|
||||
tcp_skb_is_last(sk, skb) ? nonagle :
|
||||
TCP_NAGLE_PUSH)) {
|
||||
if (skb->len > mss_now) {
|
||||
if (tcp_fragment(sk, skb, mss_now))
|
||||
tso_segs = tcp_init_tso_segs(sk, skb);
|
||||
cwnd_quota = tcp_cwnd_test(tp, skb);
|
||||
if (unlikely(!cwnd_quota))
|
||||
goto out;
|
||||
|
||||
sent_pkts = 0;
|
||||
while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
|
||||
BUG_ON(!tso_segs);
|
||||
|
||||
if (tso_segs == 1) {
|
||||
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
|
||||
(tcp_skb_is_last(sk, skb) ?
|
||||
nonagle : TCP_NAGLE_PUSH))))
|
||||
break;
|
||||
} else {
|
||||
if (tcp_tso_should_defer(sk, tp, skb))
|
||||
break;
|
||||
}
|
||||
|
||||
if (tso_segs > 1) {
|
||||
u32 limit = tcp_window_allows(tp, skb,
|
||||
mss_now, cwnd_quota);
|
||||
|
||||
if (skb->len < limit) {
|
||||
unsigned int trim = skb->len % mss_now;
|
||||
|
||||
if (trim)
|
||||
limit = skb->len - trim;
|
||||
}
|
||||
if (skb->len > limit) {
|
||||
if (tso_fragment(sk, skb, limit))
|
||||
break;
|
||||
}
|
||||
|
||||
TCP_SKB_CB(skb)->when = tcp_time_stamp;
|
||||
tcp_tso_set_push(skb);
|
||||
if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
|
||||
} else if (unlikely(skb->len > mss_now)) {
|
||||
if (unlikely(tcp_fragment(sk, skb, mss_now)))
|
||||
break;
|
||||
|
||||
/* Advance the send_head. This one is sent out.
|
||||
* This call will increment packets_out.
|
||||
*/
|
||||
update_send_head(sk, tp, skb);
|
||||
|
||||
tcp_minshall_update(tp, mss_now, skb);
|
||||
sent_pkts = 1;
|
||||
}
|
||||
|
||||
if (sent_pkts) {
|
||||
tcp_cwnd_validate(sk, tp);
|
||||
return 0;
|
||||
}
|
||||
TCP_SKB_CB(skb)->when = tcp_time_stamp;
|
||||
|
||||
return !tp->packets_out && sk->sk_send_head;
|
||||
if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
|
||||
break;
|
||||
|
||||
/* Advance the send_head. This one is sent out.
|
||||
* This call will increment packets_out.
|
||||
*/
|
||||
update_send_head(sk, tp, skb);
|
||||
|
||||
tcp_minshall_update(tp, mss_now, skb);
|
||||
sent_pkts++;
|
||||
|
||||
/* Do not optimize this to use tso_segs. If we chopped up
|
||||
* the packet above, tso_segs will no longer be valid.
|
||||
*/
|
||||
cwnd_quota -= tcp_skb_pcount(skb);
|
||||
|
||||
BUG_ON(cwnd_quota < 0);
|
||||
if (!cwnd_quota)
|
||||
break;
|
||||
|
||||
skb = sk->sk_send_head;
|
||||
if (!skb)
|
||||
break;
|
||||
tso_segs = tcp_init_tso_segs(sk, skb);
|
||||
}
|
||||
|
||||
if (likely(sent_pkts)) {
|
||||
tcp_cwnd_validate(sk, tp);
|
||||
return 0;
|
||||
}
|
||||
out:
|
||||
return !tp->packets_out && sk->sk_send_head;
|
||||
}
|
||||
|
||||
/* Push out any pending frames which were held back due to
|
||||
* TCP_CORK or attempt at coalescing tiny packets.
|
||||
* The socket must be locked by the caller.
|
||||
*/
|
||||
void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
|
||||
unsigned int cur_mss, int nonagle)
|
||||
{
|
||||
struct sk_buff *skb = sk->sk_send_head;
|
||||
|
||||
if (skb) {
|
||||
if (tcp_write_xmit(sk, cur_mss, nonagle))
|
||||
tcp_check_probe_timer(sk, tp);
|
||||
}
|
||||
}
|
||||
|
||||
/* Send _single_ skb sitting at the send head. This function requires
|
||||
* true push pending frames to setup probe timer etc.
|
||||
*/
|
||||
void tcp_push_one(struct sock *sk, unsigned int mss_now)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct sk_buff *skb = sk->sk_send_head;
|
||||
unsigned int tso_segs, cwnd_quota;
|
||||
|
||||
BUG_ON(!skb || skb->len < mss_now);
|
||||
|
||||
tso_segs = tcp_init_tso_segs(sk, skb);
|
||||
cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
|
||||
|
||||
if (likely(cwnd_quota)) {
|
||||
BUG_ON(!tso_segs);
|
||||
|
||||
if (tso_segs > 1) {
|
||||
u32 limit = tcp_window_allows(tp, skb,
|
||||
mss_now, cwnd_quota);
|
||||
|
||||
if (skb->len < limit) {
|
||||
unsigned int trim = skb->len % mss_now;
|
||||
|
||||
if (trim)
|
||||
limit = skb->len - trim;
|
||||
}
|
||||
if (skb->len > limit) {
|
||||
if (unlikely(tso_fragment(sk, skb, limit)))
|
||||
return;
|
||||
}
|
||||
} else if (unlikely(skb->len > mss_now)) {
|
||||
if (unlikely(tcp_fragment(sk, skb, mss_now)))
|
||||
return;
|
||||
}
|
||||
|
||||
/* Send it out now. */
|
||||
TCP_SKB_CB(skb)->when = tcp_time_stamp;
|
||||
|
||||
if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
|
||||
update_send_head(sk, tp, skb);
|
||||
tcp_cwnd_validate(sk, tp);
|
||||
return;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* This function returns the amount that we can raise the
|
||||
@ -1039,7 +1369,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
|
||||
if (sk->sk_route_caps & NETIF_F_TSO) {
|
||||
sk->sk_route_caps &= ~NETIF_F_TSO;
|
||||
sock_set_flag(sk, SOCK_NO_LARGESEND);
|
||||
tp->mss_cache = tp->mss_cache_std;
|
||||
}
|
||||
|
||||
if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
|
||||
@ -1101,7 +1430,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
|
||||
* is still in somebody's hands, else make a clone.
|
||||
*/
|
||||
TCP_SKB_CB(skb)->when = tcp_time_stamp;
|
||||
tcp_tso_set_push(skb);
|
||||
|
||||
err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
|
||||
pskb_copy(skb, GFP_ATOMIC):
|
||||
@ -1670,14 +1998,12 @@ int tcp_write_wakeup(struct sock *sk)
|
||||
if (sk->sk_route_caps & NETIF_F_TSO) {
|
||||
sock_set_flag(sk, SOCK_NO_LARGESEND);
|
||||
sk->sk_route_caps &= ~NETIF_F_TSO;
|
||||
tp->mss_cache = tp->mss_cache_std;
|
||||
}
|
||||
} else if (!tcp_skb_pcount(skb))
|
||||
tcp_set_skb_tso_segs(sk, skb);
|
||||
|
||||
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
|
||||
TCP_SKB_CB(skb)->when = tcp_time_stamp;
|
||||
tcp_tso_set_push(skb);
|
||||
err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
|
||||
if (!err) {
|
||||
update_send_head(sk, tp, skb);
|
||||
|
@@ -774,7 +774,6 @@ static int __init inet6_init(void)
if (if6_proc_init())
goto proc_if6_fail;
#endif
ipv6_packet_init();
ip6_route_init();
ip6_flowlabel_init();
err = addrconf_init();
@@ -791,6 +790,8 @@ static int __init inet6_init(void)
/* Init v6 transport protocols. */
udpv6_init();
tcpv6_init();

ipv6_packet_init();
err = 0;
out:
return err;
@@ -798,7 +799,6 @@ out:
addrconf_fail:
ip6_flowlabel_cleanup();
ip6_route_cleanup();
ipv6_packet_cleanup();
#ifdef CONFIG_PROC_FS
if6_proc_exit();
proc_if6_fail:
@@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->pkt_type = from->pkt_type;
to->priority = from->priority;
to->protocol = from->protocol;
to->security = from->security;
dst_release(to->dst);
to->dst = dst_clone(from->dst);
to->dev = from->dev;
@@ -2018,7 +2018,7 @@ static int tcp_v6_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_clamp = ~0;
tp->mss_cache_std = tp->mss_cache = 536;
tp->mss_cache = 536;

tp->reordering = sysctl_tcp_reordering;
@@ -4,7 +4,7 @@

obj-y := sch_generic.o

obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o
obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o
obj-$(CONFIG_NET_CLS) += cls_api.o
obj-$(CONFIG_NET_CLS_ACT) += act_api.o
obj-$(CONFIG_NET_ACT_POLICE) += police.o
@@ -205,11 +205,6 @@ META_COLLECTOR(int_protocol)
dst->value = skb->protocol;
}

META_COLLECTOR(int_security)
{
dst->value = skb->security;
}

META_COLLECTOR(int_pkttype)
{
dst->value = skb->pkt_type;
@@ -524,7 +519,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
[META_ID(REALDEV)] = META_FUNC(int_realdev),
[META_ID(PRIORITY)] = META_FUNC(int_priority),
[META_ID(PROTOCOL)] = META_FUNC(int_protocol),
[META_ID(SECURITY)] = META_FUNC(int_security),
[META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
[META_ID(PKTLEN)] = META_FUNC(int_pktlen),
[META_ID(DATALEN)] = META_FUNC(int_datalen),
@@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
int err;
struct rtattr *kind = tca[TCA_KIND-1];
void *p = NULL;
struct Qdisc *sch;
struct Qdisc_ops *ops;
int size;

ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
@@ -437,64 +435,55 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
if (ops == NULL)
goto err_out;

/* ensure that the Qdisc and the private data are 32-byte aligned */
size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
size += ops->priv_size + QDISC_ALIGN_CONST;

p = kmalloc(size, GFP_KERNEL);
err = -ENOBUFS;
if (!p)
sch = qdisc_alloc(dev, ops);
if (IS_ERR(sch)) {
err = PTR_ERR(sch);
goto err_out2;
memset(p, 0, size);
sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
& ~QDISC_ALIGN_CONST);
sch->padded = (char *)sch - (char *)p;
}

INIT_LIST_HEAD(&sch->list);
skb_queue_head_init(&sch->q);

if (handle == TC_H_INGRESS)
if (handle == TC_H_INGRESS) {
sch->flags |= TCQ_F_INGRESS;

sch->ops = ops;
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev = dev;
dev_hold(dev);
atomic_set(&sch->refcnt, 1);
sch->stats_lock = &dev->queue_lock;
if (handle == 0) {
handle = TC_H_MAKE(TC_H_INGRESS, 0);
} else if (handle == 0) {
handle = qdisc_alloc_handle(dev);
err = -ENOMEM;
if (handle == 0)
goto err_out3;
}

if (handle == TC_H_INGRESS)
sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
else
sch->handle = handle;
sch->handle = handle;

if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
if (tca[TCA_RATE-1]) {
err = gen_new_estimator(&sch->bstats, &sch->rate_est,
sch->stats_lock,
tca[TCA_RATE-1]);
if (err) {
/*
* Any broken qdiscs that would require
* a ops->reset() here? The qdisc was never
* in action so it shouldn't be necessary.
*/
if (ops->destroy)
ops->destroy(sch);
goto err_out3;
}
}
#endif
qdisc_lock_tree(dev);
list_add_tail(&sch->list, &dev->qdisc_list);
qdisc_unlock_tree(dev);

#ifdef CONFIG_NET_ESTIMATOR
if (tca[TCA_RATE-1])
gen_new_estimator(&sch->bstats, &sch->rate_est,
sch->stats_lock, tca[TCA_RATE-1]);
#endif
return sch;
}
err_out3:
dev_put(dev);
kfree((char *) sch - sch->padded);
err_out2:
module_put(ops->owner);
err_out:
*errp = err;
if (p)
kfree(p);
return NULL;
}
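The qdisc_create() hunk above relies on qdisc_alloc() reporting failure through an encoded error pointer rather than a bare NULL. What follows is only a minimal sketch of that generic kernel idiom (ERR_PTR/IS_ERR/PTR_ERR from <linux/err.h>); the foo names are hypothetical and the snippet is not part of this commit:

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>

struct foo { int bar; };	/* hypothetical object, for illustration only */

static struct foo *foo_alloc(void)
{
	struct foo *f = kmalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return ERR_PTR(-ENOBUFS);	/* encode the negative errno in the pointer */
	memset(f, 0, sizeof(*f));
	return f;
}

/* Caller side, mirroring the qdisc_create() change:
 *
 *	f = foo_alloc();
 *	if (IS_ERR(f)) {
 *		err = PTR_ERR(f);	recover the negative errno
 *		goto err_out;
 *	}
 */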
54
net/sched/sch_blackhole.c
Normal file
@@ -0,0 +1,54 @@
/*
 * net/sched/sch_blackhole.c	Black hole queue
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Thomas Graf <tgraf@suug.ch>
 *
 * Note: Quantum tunneling is not supported.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>

static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
qdisc_drop(skb, sch);
return NET_XMIT_SUCCESS;
}

static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
{
return NULL;
}

static struct Qdisc_ops blackhole_qdisc_ops = {
.id = "blackhole",
.priv_size = 0,
.enqueue = blackhole_enqueue,
.dequeue = blackhole_dequeue,
.owner = THIS_MODULE,
};

static int __init blackhole_module_init(void)
{
return register_qdisc(&blackhole_qdisc_ops);
}

static void __exit blackhole_module_exit(void)
{
unregister_qdisc(&blackhole_qdisc_ops);
}

module_init(blackhole_module_init)
module_exit(blackhole_module_exit)

MODULE_LICENSE("GPL");
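blackhole_enqueue() above does all of its work through qdisc_drop(). As a rough sketch, and only an assumption about how that inline helper looks in this tree (it lives in include/net/pkt_sched.h, not in this commit), it frees the packet and accounts it as a drop:

static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
{
	kfree_skb(skb);			/* the packet is discarded on the spot */
	sch->qstats.drops++;		/* but still shows up in the qdisc stats */
	return NET_XMIT_DROP;
}

blackhole_enqueue() deliberately ignores that return value and reports NET_XMIT_SUCCESS, so upper layers treat every packet as sent while the qdisc never holds anything for blackhole_dequeue() to return.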
@@ -395,24 +395,23 @@ static struct Qdisc_ops pfifo_fast_ops = {
.owner = THIS_MODULE,
};

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
void *p;
struct Qdisc *sch;
int size;
unsigned int size;
int err = -ENOBUFS;

/* ensure that the Qdisc and the private data are 32-byte aligned */
size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
size += ops->priv_size + QDISC_ALIGN_CONST;
size = QDISC_ALIGN(sizeof(*sch));
size += ops->priv_size + (QDISC_ALIGNTO - 1);

p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
goto errout;
memset(p, 0, size);

sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
& ~QDISC_ALIGN_CONST);
sch->padded = (char *)sch - (char *)p;
sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
sch->padded = (char *) sch - (char *) p;

INIT_LIST_HEAD(&sch->list);
skb_queue_head_init(&sch->q);
@@ -423,11 +422,24 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
dev_hold(dev);
sch->stats_lock = &dev->queue_lock;
atomic_set(&sch->refcnt, 1);

return sch;
errout:
return ERR_PTR(-err);
}

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
struct Qdisc *sch;

sch = qdisc_alloc(dev, ops);
if (IS_ERR(sch))
goto errout;

if (!ops->init || ops->init(sch, NULL) == 0)
return sch;

dev_put(dev);
kfree(p);
errout:
return NULL;
}

@@ -591,6 +603,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_restart);
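The qdisc_alloc() hunk above trades the open-coded QDISC_ALIGN_CONST arithmetic for the QDISC_ALIGN()/QDISC_ALIGNTO helpers. Assuming definitions along these lines (they belong to include/net/pkt_sched.h and are not shown in this diff), the allocation is rounded so both the Qdisc and its private data can sit on 32-byte boundaries:

#define QDISC_ALIGNTO		32
#define QDISC_ALIGN(len)	(((len) + QDISC_ALIGNTO - 1) & ~(QDISC_ALIGNTO - 1))

/* Worked example: sizeof(struct Qdisc) == 140 and ops->priv_size == 24.
 * QDISC_ALIGN(140) = 160, and 160 + 24 + (QDISC_ALIGNTO - 1) = 215 bytes
 * are requested, leaving enough slack to realign the private area that
 * follows the struct to the next 32-byte boundary.
 */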