import of old now defunct presentation slides svn repo

master
Harald Welte 7 years ago
commit fca59bea77
  1. 336
      2001/netfilter-6fevu2001/netfilter-6fevu.html
  2. 18
      2001/netfilter-birmingham2001/abstract
  3. 9
      2001/networktour-birmingham2001/abstract
  4. 116
      2001/networktour-birmingham2001/packet-journey-2.4.sgml
  5. 397
      2001/qos-knf2001/ip-qos-knf.mgp
  6. 6202
      2001/qos-knf2001/ip-qos-knf.ps
  7. BIN
      2001/qos-saopaulo2001/ARIBLK.TTF
  8. BIN
      2001/qos-saopaulo2001/IMPACT.TTF
  9. BIN
      2001/qos-saopaulo2001/MONOTYPE.TTF
  10. BIN
      2001/qos-saopaulo2001/VERDANA.TTF
  11. BIN
      2001/qos-saopaulo2001/VERDANAB.TTF
  12. BIN
      2001/qos-saopaulo2001/VERDANAI.TTF
  13. BIN
      2001/qos-saopaulo2001/VERDANAZ.TTF
  14. 23
      2001/qos-saopaulo2001/abstract
  15. 23
      2001/qos-saopaulo2001/cnc-style.mgp
  16. BIN
      2001/qos-saopaulo2001/fundo-cnc.png
  17. 397
      2001/qos-saopaulo2001/ip-qos.mgp
  18. 611
      2001/qos-saopaulo2001/qos-1.eps
  19. BIN
      2001/qos-saopaulo2001/qos-1.png
  20. 48
      2002/firewalling-knf-2002/abstract
  21. 312
      2002/firewalling-knf-2002/firewall.mgp
  22. 100
      2002/firewalling-knf-2002/toc
  23. 243
      2002/ipv6-ccc2002/ipv6-ccc2002.mgp
  24. 114
      2002/ipv6-ccc2002/topics
  25. 25
      2002/netfilter-bof-ols2002/abstract
  26. 374
      2002/netfilter-curdevel-lk2002/netfilter-curdevel-lk2002.mgp
  27. 374
      2002/netfilter-curdevel-lsm2002/netfilter-curdevel-lsm2002.mgp
  28. 31
      2002/netfilter-failover-ols2002/abstract
  29. 22
      2002/netfilter-failover-ols2002/biography
  30. 294
      2002/netfilter-failover-ols2002/netfilter-failover-ols2002.mgp
  31. 504
      2002/netfilter-failover-ols2002/netfilter-failover-ols2002.tex
  32. 56
      2002/netfilter-failover-ols2002/ols.sty
  33. 33
      2002/netfilter-future-lk2002/abstract
  34. 374
      2002/netfilter-future-lk2002/netfilter-future-lk2002.mgp
  35. 49
      2002/netfilter-internals-lsm2002/abstract
  36. 520
      2002/netfilter-internals-lsm2002/netfilter-internals-lsm2002.mgp
  37. 537
      2002/netfilter-internals-lsm2002/netfilter-internals-lsm2002.tex
  38. 49
      2002/netfilter-internals-lt2002/abstract
  39. 22
      2002/netfilter-internals-lt2002/biography
  40. 466
      2002/netfilter-internals-lt2002/netfilter-internals-lt2002.mgp
  41. 537
      2002/netfilter-internals-lt2002/netfilter-internals-lt2002.tex
  42. 50
      2002/netfilter-knf2002/abstract
  43. 466
      2002/netfilter-knf2002/netfilter-knf2002.mgp
  44. 201
      2002/tcp-statetracking-ccc2002/tcp-statetracking-ccc2002.mgp
  45. 147
      2002/tex-introduction-cc2002/tex-einfuehrung
  46. 430
      2002/tex-introduction-cc2002/tex-einfuehrung.tex
  47. BIN
      2003/firmware-reveng-ccc2003/ALL0277_1.02.6_ETSI_0703_code.bin
  48. BIN
      2003/firmware-reveng-ccc2003/ALL0277_1.02.6_ETSI_0703_code.bin.cramfs
  49. BIN
      2003/firmware-reveng-ccc2003/ALL0277_1.02.6_ETSI_0703_code.bin.fs.tar.bz2
  50. BIN
      2003/firmware-reveng-ccc2003/ALL0277_1.02.6_ETSI_0703_code.bin.kernel
  51. 77416
      2003/firmware-reveng-ccc2003/ALL0277_1.02.6_ETSI_0703_code.bin.magic_ofs
  52. 1
      2003/firmware-reveng-ccc2003/ALL0277_1.02.6_ETSI_0703_code.bin.url
  53. 113
      2003/firmware-reveng-ccc2003/firmware-reveng-ccc2003.mgp
  54. 79
      2003/firmware-reveng-ccc2003/magic_ofs.c
  55. 26
      2003/linux-kernel-knf2003/abstract
  56. 300
      2003/linux-kernel-knf2003/linux-kernel-knf2003.mgp
  57. 315
      2003/linux-kernel-smp-bangalore2003/kernel-smp-bangalore2003.mgp
  58. 71
      2003/netfilter-bof-ols2003/topics
  59. 368
      2003/netfilter-curdevel-fosdem2003/netfilter-curdevel-fosdem2003.mpg
  60. 12
      2003/netfilter-curdevel-lt2003/abstract
  61. 22
      2003/netfilter-curdevel-lt2003/biography
  62. 19
      2003/netfilter-curdevel-lt2003/curdevel
  63. 299
      2003/netfilter-curdevel-lt2003/netfilter-curdevel-lt2003.mgp
  64. 304
      2003/netfilter-curdevel-ukuug2003/netfilter-curdevel-ukuug2003.mgp
  65. 318
      2003/netfilter-curdevel-ukuug2003/netfilter-curdevel-ukuug2003.tex
  66. 73
      2003/netfilter-free-openfest2003/abstract
  67. 220
      2003/netfilter-free-openfest2003/netfilter-free-openfest2003.mgp
  68. 511
      2003/netfilter-internals-bangalore2003/netfilter-internals-bangalore2003.mgp
  69. 49
      2003/netfilter-internals-kiblix2003/abstract
  70. 22
      2003/netfilter-internals-kiblix2003/biography
  71. 509
      2003/netfilter-internals-kiblix2003/netfilter-internals-kiblix2003.mgp
  72. 70
      2003/netfilter-programming-clt2003/abstract
  73. 54
      2003/netfilter-programming-clt2003/ipt_workshop.c
  74. 6
      2003/netfilter-programming-clt2003/ipt_workshop.h
  75. 102
      2003/netfilter-programming-clt2003/libipt_workshop.c
  76. 636
      2003/netfilter-programming-clt2003/netfilter-programming-clt2003.mgp
  77. 57
      2003/netfilter-programming-clt2003/nf_workshop.c
  78. 54
      2003/netfilter-programming-ols2003/ipt_workshop.c
  79. 6
      2003/netfilter-programming-ols2003/ipt_workshop.h
  80. 102
      2003/netfilter-programming-ols2003/libipt_workshop.c
  81. 615
      2003/netfilter-programming-ols2003/netfilter-programming-ols2003.mgp
  82. 615
      2003/netfilter-programming-ols2003/netfilter-programming-ols2003_.mgp
  83. 57
      2003/netfilter-programming-ols2003/nf_workshop.c
  84. 105
      2003/opensource-astaro2003/brainstorming
  85. 185
      2003/opensource-astaro2003/opensource-astaro2003.mgp
  86. 281
      2004/firewall-vpn-gse2004/firewall-vpn-gse2004.mgp
  87. BIN
      2004/firewall-vpn-gse2004/firewall-vpn-gse2004.pdf
  88. BIN
      2004/firewall-vpn-gse2004/firewall-vpn-gse2004_2.pdf
  89. 21
      2004/gpl-berlinux2004/biography
  90. 30
      2004/gpl-berlinux2004/extended-abstract
  91. 253
      2004/gpl-berlinux2004/gpl-berlinux2004.mgp
  92. 21
      2004/gpl-bof-ols2004/abstract
  93. 25
      2004/gpl-bof-ols2004/biography
  94. 228
      2004/gpl-bof-ols2004/gpl-bof-ols2004.mgp
  95. 24
      2004/gpl-ccc2004/biography
  96. 46
      2004/gpl-ccc2004/cfp-reply
  97. 29
      2004/gpl-ccc2004/extended-abstract
  98. 406
      2004/gpl-ccc2004/gpl-ccc2004.mgp
  99. 280
      2004/gpl-ccc2004/gpl-ccc2004.xml
  100. 4
      2004/gpl-ccc2004/short-abstract
  101. Some files were not shown because too many files have changed in this diff Show More

@ -0,0 +1,336 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<HTML>
<HEAD>
<META NAME="GENERATOR" CONTENT="SGML-Tools 1.0.9">
<TITLE>The netfilter framework in Linux 2.4</TITLE>
</HEAD>
<BODY>
<H1>The netfilter framework in Linux 2.4</H1>
<H2>Harald Welte <CODE>laforge@gnumonks.org</CODE></H2>$Date: 2004-10-10 15:04:54 +0200 (Sun, 10 Oct 2004) $
<P><HR>
<EM>This is the paper on which my talk about netfilter at Linux-Kongress 2000, CCC Congress 2000 (and probably some more occasions where I give this talk) is based. It describes the netfilter infrastructure, as well as the systems for packet filtering, NAT and packet mangling on top of it.</EM>
<HR>
<H2><A NAME="s1">1. PART I - Netfilter basics / concepts</A></H2>
<H2>1.1 What is netfilter?</H2>
<P>Netfilter is definitely more than any of the firewall subsystems in the past linux kernels. Netfilter provides an abstract, generalized framework of which one particular incarnation is the packet filtering subsystem. So don't expect a talk about "how to set up a firewall or a masquerading gateway in 2.4". This would only cover a part of netfilter.
<P>The netfilter framework consists of three parts:
<P>
<P>
<OL>
<LI>Each protocol defines a set of 'hooks' (IPv4 defines 5), which are well-defined points in a packet's traversal of that protocol stack. At each of these points, the protocol stack will call the netfilter framework with the packet and the hook number.
</LI>
<LI>Parts of the kernel can register to listen to the different hooks for each protocol. So when a packet is passed to the netfilter framework, it checks to see if anyone has registered for that protocol and hook; if so, they get a chance to examine (and possibly alter) the packet, discard it, allow it to pass or ask netfilter to queue the packet for userspace.
</LI>
<LI>Packets that have been queued are collected for sending to userspace; these packets are handled asynchronously. A userspace process can examine the packet, can alter it, and reinject it at the same hook it left the kernel.</LI>
</OL>
<P>
<P>All the packet filtering / NAT / ... stuff is based on this framework. There is no more dirty packet altering code spread all over the network stack.
<P>
<P>The netfilter framework currently has been implemented for IPv4, IPv6 and DECnet.
<P>
<H2>1.2 Why did we need netfilter?</H2>
<P>This chapter could be called 'What is wrong with ipchains?', too. So why did we need this change? (I only give a few examples here)
<P>
<UL>
<LI>No infrastructure for passing packets to userspace, so all code which does some packet fiddling must be done as kernel code. Kernel programming is hard, must be done in C, and is dangerous.
</LI>
<LI>Transparent proxying is extremely difficult
We have to look up _every_ packet to see if there's a socket bound to that address. No clean interface, 34 #ifdefs in 11 different files of the network stack
</LI>
<LI>Creating of packet filter rules independent of interface address is impossible.
We must know local interface address to distinguish locally-generated or locally-terminated packets from through packets. The forward chain has only information on outgoing interface. So we must try to figure out where the packet came from.
</LI>
<LI>Masquerading and packet filtering are implemented as one part
This makes the firewalling code way too complex.
</LI>
<LI>Ipchains code is neither modular nor extensible (e.g. for MAC address filtering)</LI>
</UL>
<P>
<H2>1.3 The authors of netfilter</H2>
<P>The concept of the netfilter framework and most of its implementation were done by Rusty Russell. He is co-author of ipchains and is the current Linux Kernel IP firewall maintainer. Rusty got paid for one year by Watchguard (a firewall company) to do nothing, so he had enough time to do it :)
<P>
<P>The official netfilter core team consists of Rusty Russell, Marc Boucher, James Morris and Harald Welte. Of course there are various other hackers who have contributed some stuff (for more information see
<A HREF="http://netfilter.samba.org/scoreboard.html">http://netfilter.samba.org/scoreboard.html</A>).
<P>
<H2>1.4 Netfilter architecture in IPv4</H2>
<P>A Packet Traversing the Netfilter System:
<BLOCKQUOTE><CODE>
<PRE>
--->[1]--->[ROUTE]--->[3]--->[4]--->
| ^
| |
| [ROUTE]
v |
[2] [5]
| ^
| |
v |
</PRE>
</CODE></BLOCKQUOTE>
<P>
<P>
<P>Packets come in from the left. After verification of the IP checksum, the packets hit the NF_IP_PRE_ROUTING [1] hook.
<P>Next they enter the routing code, which decides if the packets are local or have to be passed to another interface.
<P>If the packets are considered to be local, they traverse the NF_IP_LOCAL_IN [2] hook and get passed to the process (if any) afterwards.
<P>If the packets are routed to another interface, they pass the NF_IP_FORWARD [3] hook.
<P>The packet passes a final netfilter hook, NF_IP_POST_ROUTING [4], before they get transmitted on the target interface.
<P>The NF_IP_LOCAL_OUT [5] hook is called for locally generated packets. Here You can see that routing occurs after this hook is called: in fact, the routing code is called first (to figure out the source IP address and some IP options), and called again if the packet is altered.
<P>Locally generated packets hit NF_IP_POST_ROUTING [4], too.
<P>
<H2>1.5 Netfilter base</H2>
<P>Kernel modules can register a callback function for each one of these hooks. This callback function is called for each packet traversing the hook. The module is free to alter the packet. It has to return netfilter one of these constants:
<P>
<UL>
<LI>NF_ACCEPT continue traversal as normal</LI>
<LI>NF_DROP drop the packet; do not continue traversal</LI>
<LI>NF_STOLEN I've taken over the packet; do not continue traversal</LI>
<LI>NF_QUEUE queue the packet (usually for userspace handling)</LI>
<LI>NF_REPEAT call this hook again</LI>
</UL>
<P>
<P>
<H2>1.6 Packet selection: IP tables</H2>
<P>A packet selection system called IP tables has been built. It is a direct descendant of ipchains, with extensibility.
<P>Kernel modules can create a new table utilizing the IP tables core, and ask for a packet to traverse a given table.
<P>IP tables are used for packet filtering (the 'filter' table), Network Address Translation (the 'nat' table) and general packet mangling (the 'mangle' table).
<P>The three big parts of Linux 2.4 packet handling are built using netfilter hooks and IP tables. They are separate modules and are independent from each other. They all plug in nicely into the infrastructure provided by netfilter.
<P>
<OL>
<LI>Packet filtering
<P>This table 'filter' should never alter packets, only filter them.
One of the advantages of iptables over ipchains is that it is small and fast, and it hooks into netfilter at the NF_IP_LOCAL_IN, NF_IP_FORWARD and NF_IP_LOCAL_OUT hooks.
<P>Therefore, for each packet there is one, and only one, place to filter it. This is one big change compared to ipchains, where a forwarded packet used to traverse three chains.
<P>
</LI>
<LI> NAT
<P>The nat table listens at three netfilter hooks: NF_IP_PRE_ROUTING and NF_IP_POST_ROUTING to do source and destination NAT for routed packets. For destination altering of local packets, the NF_IP_LOCAL_OUT hook is used.
<P>This table is different from the 'filter' table, in that only the first packet of a new connection will traverse the table. The result of this traversal is then applied to all future packets of the same connection.
<P>The NAT table is used for source NAT, destination NAT, masquerading (which is a special case of source nat) and transparent proxying (which is a special case of destination nat).
<P>
</LI>
<LI> Packet mangling
<P>The 'mangle' table registers at the NF_IP_PRE_ROUTING and NF_IP_LOCAL_OUT hooks.
<P>Using the mangle table You can modify the packet itself or some of the out-of-band data attached to the packet. Currently the alteration of the TOS bits as well as setting the nfmark field inside the skb is implemented on top of the mangle table.
</LI>
</OL>
<P>
<H2>1.7 Connection tracking</H2>
<P>Connection tracking is fundamental to NAT, but has been implemented as a separate module. This allows an extension to the packet filtering code to simply use connection tracking for "stateful firewalling". (the 'state' match)
<P>
<P>
<H2><A NAME="s2">2. PART II - packet filtering using iptables and netfilter</A></H2>
<H2>2.1 Overview</H2>
<P>I expect You are familiar with TCP/IP, routing, firewall concepts and packet filtering in general.
<P>As already explained in Part I, the filter table listens on three hooks, thus providing us three chains for packet filtering.
<P>All packets coming from the network and destined for the local box traverse the INPUT chain.
<P>All packets which are forwarded (routed) by us traverse the FORWARD chain (and only the FORWARD chain). Please again note this difference to the previous linux firewall implementations!
<P>Finally, the packets originating from the local box traverse the OUTPUT chain.
<P>
<H2>2.2 Inserting rules into chains</H2>
<P>To insert/delete/modify any rules in linux 2.4 IP tables we have a neat and powerful commandline tool, called 'iptables'. I don't want to get too deep into all its features and extensibility. Here are some of its major features:
<UL>
<LI>It handles all different kinds of IP tables. Currently the filter, nat and mangle tables, but also all future table modules
</LI>
<LI>It supports plugins for new matches and new targets. Thus, nobody ever needs to patch anything to provide a netfilter extension. You have a kernel module doing the real work and an iptables plugin (dynamic library) to add the necessary configuration options.
</LI>
<LI>It comes in two incarnations: iptables (IPv4) and ip6tables (IPv6). Both of them are based on the same library and mostly the same code.</LI>
</UL>
<P>
<H3>Basic iptables commands</H3>
<P>An iptables command usually consists of 5 parts:
<OL>
<LI>which table we want to work with</LI>
<LI>which chain in this table we want it to use</LI>
<LI>an operation (insert, add, delete, modify)</LI>
<LI>a target for this particular rule</LI>
<LI>a description of which packets we want to match this rule</LI>
</OL>
<P>The basic syntax is
<PRE>
iptables -t table -Operation chain -j target match(es)
</PRE>
<P>To add a rule allowing all traffic from anywhere to our local smtp port:
<PRE>
iptables -t filter -A INPUT -j ACCEPT -p tcp --dport smtp
</PRE>
<P>Of course there are various other commands like flush chain, set the default policy of a chain, add a user-defined chain, ...
<P>Basic Operations:
<PRE>
-A append rule
-I insert rule
-D delete rule
-R replace rule
-L list rules
</PRE>
<P>Basic Targets, common to all chains:
<PRE>
ACCEPT accept the packet
DROP drop the packet
QUEUE queue packet to userspace
RETURN return to the previous (calling) chain
foobar user defined chain
</PRE>
<P>
<P>Basic matches, common to all chains:
<PRE>
-p protocol (tcp/icmp/udp/...)
-s source address (ip address/masklen)
-d destination address (ip address/masklen)
-i incoming interface
-o outgoing interface
</PRE>
<P>Apart from these basic operations, matches and targets there are various extensions, which I'll describe in the appropriate chapters.
<P>
<H2>2.3 iptables match extensions for filtering</H2>
<P>There are various extensions which are useful for packet filtering. Describing them all in detail would take way too much time. Just to give You an impression about the power :)
<P>At first there are some match extensions, which give us more power to describe which packets to match:
<UL>
<LI>TCP match extensions to match source port, destination port, arbitrary combinations of TCP flags, tcp options.</LI>
<LI>UDP match extensions to match source port and destination port</LI>
<LI>ICMP match extension to match icmp type</LI>
<LI>MAC match extension to match incoming mac (ethernet) address</LI>
<LI>MARK match extension to match the nfmark </LI>
<LI>OWNER match extension (for locally generated packets only) to match user id, group id, process id, session id</LI>
<LI>LIMIT match extension to match only a certain limit of packets per time frame. Very useful to prevent forwarding of any kind of flooding.</LI>
<LI>STATE match extension to match packets of a certain state (decided by the connection tracking subsystem). Possible states are
<UL>
<LI>INVALID (not matched against a connection), </LI>
<LI>ESTABLISHED (packet belongs to an already established connection), </LI>
<LI>NEW (packet would establish a new connection) and </LI>
<LI>RELATED (packet is in some way related to an already established connection. For example an ICMP error message or a ftp data connection)</LI>
</UL>
</LI>
<LI>TOS match extension to match the value of the TOS IP header field</LI>
<LI>TTL match extension to match the value of the TTL IP header field</LI>
</UL>
<P>
<P>
<H2>2.4 iptables target extensions for filtering</H2>
<P>
<UL>
<LI>LOG log matched packets via syslog()</LI>
<LI>ULOG log matched packets via userspace logging daemon
(supports interpreter and output plugins for flexible logging)</LI>
<LI>REJECT not only drop the packet, but also send some kind of error
message to the sender (which message is configurable)</LI>
<LI>MIRROR retransmit the packet after exchanging source and destination
IP address </LI>
</UL>
<P>
<H2><A NAME="s3">3. PART III - NAT using iptables and netfilter</A></H2>
<P>Regarding NAT (Network Address Translation), the previous Linux kernels only supported one special case called "Masquerading".
<P>Netfilter now enables Linux to do any kind of NAT.
<P>Nat is divided into `source NAT' and `destination NAT'.
<P>Source NAT alters the source address of a packet while passing the NF_IP_POST_ROUTING hook. Masquerading is a special application of SNAT
<P>Destination NAT alters the destination address of a packet while passing the NF_IP_LOCAL_OUT respectively NF_IP_PRE_ROUTING hook. Port forwarding and transparent proxying are forms of DNAT.
<P>
<H2>3.1 iptables target extensions for NAT</H2>
<P>
<P>
<DL>
<P>
<DT><B>SNAT</B><DD><P>Change the source address to something different
<P>Example:
<PRE>
iptables -t nat -A POSTROUTING -j SNAT --to-source 1.2.3.4
</PRE>
<P>
<DT><B>MASQUERADE</B><DD><P>SNAT for dialup connections with dynamic ip address
<P>Does almost the same as SNAT, but if the link goes down, all connection tracking information is dropped. The connections are lost anyway, because we get a different IP address at reconnect.
<P>Example:
<PRE>
iptables -t nat -A POSTROUTING -j MASQUERADE -o ppp0
</PRE>
<P>
<DT><B>DNAT</B><DD><P>Change the destination address to something different
<P>This is done at the PREROUTING chain, just as the packet comes in. Therefore, anything else on the Linux box itself (routing, packet filtering) will see the packet going to its real (new) destination.
<P>Example:
<PRE>
iptables -t nat -A PREROUTING -j DNAT --to-destination 1.2.3.4:8080 -p tcp --dport 80 -i eth1
</PRE>
<P>
<DT><B>REDIRECT</B><DD><P>Redirect packets to local destination
<P>Exactly the same as doing DNAT to the address of the incoming interface
<P>Example:
<PRE>
iptables -t nat -A PREROUTING -j REDIRECT --to-port 3128 -i eth1 -p tcp --dport 80
</PRE>
<P>
</DL>
<P>
<H2><A NAME="s4">4. PART IV - Packet mangling using iptables and netfilter</A></H2>
<P>The `mangle' table enables us to alter the packet itself or some data accompanying the packet.
<P>
<H2>4.1 iptables target extensions for packet mangling</H2>
<P>
<DL>
<P>
<DT><B>MARK</B><DD><P>set the value of the nfmark field
<P>We can change the value of the nfmark field. The nfmark is just a user defined mark (anything within the range of an unsigned long) of the packet. The mark value is used to do policy routing, tell ipqmpd (the userspace queue multiplex daemon) which process to queue the packet to, etc.
<P>Example:
<BLOCKQUOTE><CODE>
<PRE>
iptables -t mangle -A PREROUTING -j MARK --set-mark 0x0a -p tcp
</PRE>
</CODE></BLOCKQUOTE>
<P>
<DT><B>TOS</B><DD><P>set the value of the TOS bits inside the IP header
<P>We can change the value of the type of service bits inside the IP header. This is useful if You are using TOS based packet scheduling / routing.
<P>Example:
<BLOCKQUOTE><CODE>
<PRE>
iptables -t mangle -A PREROUTING -j TOS --set-tos 0x10 -p tcp --dport ssh
</PRE>
</CODE></BLOCKQUOTE>
<P>
<DT><B>TTL</B><DD><P>alter the value of the TTL field inside the IP header
<P>Enables the user to set, increase or decrease the TTL field.
<P>Example:
<BLOCKQUOTE><CODE>
<PRE>
iptables -t mangle -A PREROUTING -j TTL --ttl-dec 2 -i eth0
</PRE>
</CODE></BLOCKQUOTE>
</DL>
<P>
<H2><A NAME="s5">5. Queueing packets to userspace</A></H2>
<P>As I already mentioned, at any time in any netfilter chain, the packet can be queued to userspace. The actual queuing is done by a kernel module (ip_queue.o).
<P>The packets (including metadata like nfmark and mac address) are sent to an userspace process using netlink sockets. This process can do whatever it wants to do with the packet.
<P>After the userspace process is done with its work on the packet, it can either reinject the packet into the kernel, or set a verdict (DROP, ...) what to do with the packet.
<P>This is one key technology of netfilter: it enables complicated packet handling to be done by userspace processes, thus avoiding additional complexity in kernel space.
<P>
<P>Userspace packet handling processes can be easily developed using a netfilter-provided library called 'libipq'.
<P>
<P>Currently only one userspace process is supported, but the first beta release of a userspace ip queueing multiplex daemon (ipqmpd) is available. ipqmpd provides a compatibility library (libipqmpd) which makes upgrading from the raw ipqueue interface to the new ipqmpd as easy as relinking to another library.
<P>
<H2><A NAME="s6">6. PART V Credits</A></H2>
<P>Credits to all the netfilter hackers, especially the core team.
<P>Namely: <B>Paul 'Rusty' Russell</B>, <B>Marc Boucher</B> and <B>James Morris</B>.
<P>Additional special thanks to Rusty for his `netfilter-hacking-HOWTO', `packet-filtering-HOWTO' and `NAT-HOWTO' which I heavily used as a basis for this presentation.
<P>
</BODY>
</HTML>

@ -0,0 +1,18 @@
Tutorial: Firewalling using netfilter/iptables in Linux 2.4
One of the major advantages of the new Linux 2.4.x kernel series is the
new packet filtering / NAT / packet mangling subsystem, called iptables.
Iptables is the successor of ipchains and ipfwadm in 2.2 and 2.0 kernels.
Major new features are stateful firewalling, extensibility and better NAT
(Network Address Translation) support.
Topics:
- concepts behind new netfilter/iptables infrastructure
- usage of iptables
- case example of a real-world firewall
- current (experimental) netfilter work - or "what is patch-o-matic"
- writing netfilter/iptables extension modules
The tutorial will be presented by two of the netfilter core team members,
Rusty Russell <rusty@rustcorp.com.au> and Harald Welte <laforge@gnumonks.org>

@ -0,0 +1,9 @@
Technical Presentation: A tour through the Linux 2.4 network stack
Linux based systems are known for performance and reliability in the area of
networking. This presentation will give a tour through the Linux 2.4 kernel
network stack, its structure and implementation. Some of the topics covered
are: Network hardware drivers, core network functions, IPv4 protocol stack,
sockets implementation, zero-copy TCP.
The Author of this Presentation is Harald Welte <laforge@gnumonks.org>

@ -0,0 +1,116 @@
<!doctype linuxdoc system>
<article>
<title>The journey of a packet through the linux 2.4 network stack</title>
<author>Harald Welte <tt>laforge@gnumonks.org</tt>
<date>$Revision: 537 $, $Date: 2004-10-10 15:04:54 +0200 (Sun, 10 Oct 2004) $</date>
<!-- $Id: packet-journey-2.4.sgml 537 2004-10-10 13:04:54Z laforge $ -->
<abstract>
This document describes the journey of a network packet inside the linux kernel 2.4.x. This has changed drastically since 2.2 because the globally serialized bottom half was abandoned in favor of the new softirq system.
<toc>
<sect>Preface
<p>
I have to apologize for my ignorance, but this document has a strong focus on the "default case": x86 architecture and ip packets which get forwarded.
<p>
I am definitely no kernel guru and the information provided by this document may be wrong. So don't expect too much, I'll always appreciate Your comments and bugfixes.
<sect>Receiving the packet
<sect1>The receive interrupt
<p>
If the network card receives an ethernet frame which matches the local MAC address or is a linklayer broadcast, it issues an interrupt.
The network driver for this particular card handles the interrupt, fetches the packet data via DMA / PIO / whatever into RAM. It then allocates a skb and calls a function of the protocol independent device support routines: <tt>net/core/dev.c:netif_rx(skb)</tt>.
<p>
If the driver didn't already timestamp the skb, it is timestamped now. Afterwards the skb gets enqueued in the appropriate queue for the processor handling this packet. If the queue backlog is full the packet is dropped at this place. After enqueuing the skb the receive softinterrupt is marked for execution via <tt>include/linux/interrupt.h:__cpu_raise_softirq()</tt>.
<p>
The interrupt handler exits and all interrupts are reenabled.
<sect1>The network RX softirq
<p>
Now we encounter one of the big changes between 2.2 and 2.4: The whole network stack is no longer a bottom half, but a softirq. Softirqs have the major advantage, that they may run on more than one CPU simultaneously. bh's were guaranteed to run only on one CPU at a time.
<p>
Our network receive softirq is registered in <tt>net/core/dev.c:net_init()</tt> using the function <tt>kernel/softirq.c:open_softirq()</tt> provided by the softirq subsystem.
<p>
Further handling of our packet is done in the network receive softirq (NET_RX_SOFTIRQ) which is called from <tt>kernel/softirq.c:do_softirq()</tt>. do_softirq() itself is called from three places within the kernel:
<enum>
<item>from <tt>arch/i386/kernel/irq.c:do_IRQ()</tt>, which is the generic IRQ handler
<item>from <tt>arch/i386/kernel/entry.S</tt> in case the kernel just returned from a syscall
<item>inside the main process scheduler in <tt>kernel/sched.c:schedule()</tt>
</enum>
<p>
So if execution passes one of these points, do_softirq() is called, it detects the NET_RX_SOFTIRQ marked and calls <tt>net/core/dev.c:net_rx_action()</tt>. Here the skb is dequeued from this cpu's receive queue and afterwards handed to the appropriate packet handler. In case of IPv4 this is the IPv4 packet handler.
<sect1>The IPv4 packet handler
<p>
The IP packet handler is registered via <tt>net/core/dev.c:dev_add_pack()</tt> called from <tt>net/ipv4/ip_output.c:ip_init()</tt>.
<p>
The IPv4 packet handling function is <tt>net/ipv4/ip_input.c:ip_rcv()</tt>. After some initial checks (if the packet is for this host, ...) the ip checksum is calculated. Additional checks are done on the length and IP protocol version 4.
<p>
Every packet failing one of the sanity checks is dropped at this point.
<p>
If the packet passes the tests, we determine the size of the ip packet and trim the skb in case the transport medium has appended some padding.
<p>
Now it is the first time one of the netfilter hooks is called.
<p>
Netfilter provides a generic and abstract interface to the standard routing code. This is currently used for packet filtering, mangling, NAT and queuing packets to userspace. For further reference see my conference paper 'The netfilter subsystem in Linux 2.4' or one of Rusty's unreliable guides, e.g. the netfilter-hacking-guide.
<p>
After successful traversal of the netfilter hook, <tt>net/ipv4/ip_input.c:ip_rcv_finish()</tt> is called.
<p>
Inside ip_rcv_finish(), the packet's destination is determined by calling the routing function <tt>net/ipv4/route.c:ip_route_input()</tt>. Furthermore, if our IP packet has IP options, they are processed now. Depending on the routing decision made by <tt>net/ipv4/route.c:ip_route_input_slow()</tt>, the journey of our packet continues in one of the following functions:
<descrip>
<tag>net/ipv4/ip_input.c:ip_local_deliver()</tag>
The packet's destination is local, we have to process the layer 4 protocol and pass it to an userspace process.
<tag>net/ipv4/ip_forward.c:ip_forward()</tag>
The packet's destination is not local, we have to forward it to another network
<tag>net/ipv4/route.c:ip_error()</tag>
An error occurred, we are unable to find an appropriate routing table entry for this packet.
<tag>net/ipv4/ipmr.c:ip_mr_input()</tag>
It is a Multicast packet and we have to do some multicast routing.
</descrip>
<sect>Packet forwarding to another device
<p>
If the routing decided that this packet has to be forwarded to another device, the function <tt>net/ipv4/ip_forward.c:ip_forward()</tt> is called.
<p>
The first task of this function is to check the ip header's TTL. If it is &lt;= 1 we drop the packet and return an ICMP time exceeded message to the sender.
<p>
We check the header's tailroom if we have enough tailroom for the destination device's link layer header and expand the skb if necessary.
<p>
Next the TTL is decremented by one.
<p>
If our new packet is bigger than the MTU of the destination device and the don't fragment bit in the IP header is set, we drop the packet and send an ICMP frag needed message to the sender.
<p>
Finally it is time to call another one of the netfilter hooks - this time it is the NF_IP_FORWARD hook.
<p>
Assuming that the netfilter hook returns an NF_ACCEPT verdict, the function <tt>net/ipv4/ip_forward.c:ip_forward_finish()</tt> is the next step in our packet's journey.
<p>
ip_forward_finish() itself checks if we need to set any additional options in the IP header, and has <tt>net/ipv4/ip_options.c:ip_forward_options()</tt> doing this. Afterwards it calls <tt>include/net/ip.h:ip_send()</tt>.
<p>
If we need some fragmentation, <tt>net/ipv4/ip_output.c:ip_fragment()</tt> gets called, otherwise we continue in <tt>net/ipv4/ip_output.c:ip_finish_output()</tt>.
<p>
ip_finish_output() again does nothing else than calling the netfilter postrouting hook NF_IP_POST_ROUTING and calling ip_finish_output2() on successful traversal of this hook.
<p>
ip_finish_output2() prepends the hardware (link layer) header to our skb and calls <tt>dst->hh->hh_output()</tt> which seems to usually be <tt>net/core/dev.c:dev_queue_xmit()</tt>.
<p>
dev_queue_xmit() enqueues the packet for transmission by the network device.
</article>

@ -0,0 +1,397 @@
%include "cnc-style.mgp"
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
%nodefault
%pcache 1 1 0 1
%size 7, font "standard", fore "white", vgap 20, back "black"
%bimage "fundo-cnc.png" 1024x768
%center
%size 7
Quality of Service in IP Networks
%center
%size 4
by
Harald Welte <laforge@gnumonks.org>
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Contents
Definition of QoS
Why QoS
IP Networks are not designed for QoS
How to do the impossible
How can Linux based systems help
Advanced Concepts (DiffServ, IntServ, RSVP, ...)
References / Further Reading
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Definition of QoS
Provide Service Differentiation
Performance Assurance by
Bandwidth guarantees
for streaming multimedia traffic
prioritizing certain important applications
Latency guarantees
for voice over IP
for interactive character-oriented applications (ssh,telnet)
Packet-loss guarantees
for unreliable layer-4 protocols
to avoid retransmits
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Why QoS
Decide how and between whom available bandwidth is divided
Limit available bandwidth for certain users / applications
Guarantee bandwidth for certain users / applications
Divide bandwidth more equally between users / applications
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
IP networks not designed for QoS
Properties of IP-based networks:
offer a "best-effort" service
make NO guarantees about
bandwidth
latency
packet loss
provide a non-reliable packet transport
Conclusion: IP networks are not suitable for QoS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
How to do the Impossible
%size 4
As IP Networks including Hardware (Routers, ...) are widely deployed, all QoS efforts have to layer on top of the existing technology.
There's no real solution to control latency
latency widely dependent on routing, which may be dynamic
There's no real solution to control packet loss
packet loss may occur on any intermediate router
But we can control bandwidth usage!
The sender can limit bandwidth for outgoing streams
Intermediate routers BEFORE a bottleneck can control bandwidth usage
%size 5
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
What can Linux systems do?
Bandwidth limiting at the sender application
not many applications support it
server often out of control (on Internet, ...)
server doesn't know what's between him and the client
Bandwidth control on intermediate router before bottleneck
Ideal case because this is where packet loss would occur
Sophisticated queue scheduling on the outgoing queue
Variety of different queue scheduling algorithms
Flow throttling at the Receiver
Worst case, because influence is limited
Theoretically possible for TCP, no implementation yet.
Ingress qdisc might help
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Bandwidth limiting at server
Some Internet Servers support bandwidth limiting
ProFTPd (builtin support)
Apache (using contributed mod_bandwidth)
Using those features it is easy to limit
maximum bandwidth used per connection
maximum bandwidth used per client (IP/network)
maximum bandwidth used by one virtual host (webserver/ftpserver)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Router before bottleneck
%size 4
The router receives more packets on his incoming interface(s) than it can send out on the outgoing interface. It has to build a queue of packets (usually a FIFO one) and starts dropping packets as soon as the queue is full
%image "qos-1.png" 0 100 30
The idea is to change this queue, thus decide
which packets get enqueued in which order
how many packets get queued
which packets get dropped in case of a filling queue
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
The Linux 2.2 / 2.4 Solution
Packet Scheduling algorithms in the Kernel
CBQ - Class Based Queue
RED - Random Early Drop
SFQ - Stochastic Fairness Queueing
TEQL - True Link Equalizer
TBF - Token Bucket Filter
tc command of iproute2 package for configuration
almost no documentation
very few examples on the internet
Packet Classification
tc builtin classes (route, u32, ...)
all iptables/netfilter matches by using fwmark
Conclusion: Linux is the best suited general-purpose operating system for QoS, but almost nobody is using it because of a lack of knowledge.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Available queuing algorithms
CBQ - Class Based Queue
hierarchical bandwidth classes
used as basis in almost all cases
TBF - Token Bucket Filter
really accurate algorithm
uses a lot of CPU
not possible for high bandwidth links (>1MBit)
SFQ - Stochastic Fairness Queueing
less accurate algorithm
tries to distinguish between individual streams
does round robin between those streams
TEQL - True Link Equalizer
allows to 'bundle' interfaces
RED - Random Early Detect / Drop
simulates congested link by statistic packet dropping
uses almost no CPU
recommended for high-bandwidth backbones
others (WRR, TCINDEX, DSMARK, ..)
WRR not officially included in kernel, similar to CBQ
others mostly used for DiffServ
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
The big picture
Overview of a packet's journey
%size 3
%font "typewriter"
Incoming Packets
%size 3
%font "typewriter"
|
%size 3
%font "typewriter"
V
%size 3
%font "typewriter"
Packet Classification classify
%size 3
%font "typewriter"
(ipchains/iptables) set nfmark
%size 3
%font "typewriter"
|
%size 3
%font "typewriter"
V
%size 3
%font "typewriter"
Routing decision
%size 3
%font "typewriter"
|
%size 3
%font "typewriter"
V
%size 3
%font "typewriter"
TC filter select classes based on nfmark
%size 3
%font "typewriter"
/ | \
%size 3
%font "typewriter"
/ | \
%size 3
%font "typewriter"
/ | \
%size 3
%font "typewriter"
Different Bandwidth classes bandwidth classes (CBQ)
%size 3
%font "typewriter"
\ | /
%size 3
%font "typewriter"
\ | /
%size 3
%font "typewriter"
\ | /
%size 3
%font "typewriter"
Enqueuing output queue discipline
%size 3
%font "typewriter"
|
%size 3
%font "typewriter"
V
%size 3
%font "typewriter"
Outgoing packets
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Example scenario using CBQ
%size 4
Let's assume we have a link with 10 MBit maximum available bandwidth.
We offer two major services to the outside world: Anonymous FTP and a Webserver offering important Information.
FTP Bulk data transfers are using up almost all available bandwidth, thus slowing down accesses to our website :(
We want to have FTP transfers use up to 8MBit and reserve 2MBit for WWW.
Implementation uses CBQ for bandwidth divisions.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Example scenario
%size 3
attach a CBQ to the device
%size 3
%font "typewriter"
tc qdisc add dev eth0 root handle 10: cbq
bandwidth 10Mbit avpkt 1000
%size 3
%font "standard"
create CBQ classes
%size 3
%font "typewriter"
tc class add dev eth0 parent 10:0 classid 10:1 cbq
bandwidth 10MBit rate 10MBit allot 1514
weight 1Mbit prio 8 maxburst 20 avpkt 1000
tc class add dev eth0 parent 10:1 classid 10:100 cbq
bandwidth 10MBit rate 8MBit allot 1514
weight 800kbit prio 5 maxburst 20 avpkt 1000 bounded
tc class add dev eth0 parent 10:1 classid 10:200 cbq
bandwidth 10MBit rate 2MBit allot 1514
weight 200kbit prio 5 maxburst 20 avpkt 1000 bounded
%size 3
%font "standard"
add filter rules
%size 3
%font "typewriter"
tc filter add dev eth0 parent 10:1 protocol ip handle 6 fw classid 10:100
tc filter add dev eth0 parent 10:1 protocol ip handle 7 fw classid 10:200
iptables -t mangle -A PREROUTING -j MARK -p tcp --sport 20 --set-mark 6
iptables -t mangle -A PREROUTING -j MARK -p tcp ! --sport 20 --set-mark 7
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Further optimization
%size 4
Now we have achieved bandwidth division between two services.
Within one service, however, one individual user with a high bandwidth link can still use up most of our bandwidth, slowing down other users.
We can improve this behaviour by changing the scheduling algorithm from its default (fifo)
%size 3
%font "typewriter"
tc qdisc add dev eth0 parent 10:100 sfq quantum 1514b perturb 15
tc qdisc add dev eth0 parent 10:200 sfq quantum 1514b perturb 15
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Further reading / Links
Bandwidth limiting on Servers
ProFTPd
http://www.proftpd.net/
Apache mod_bandwidth / mod_bwshare
ftp://ftp.cohprog.com/pub/apache/module/mod_bandwidth.c
http://www.topology.org/src/bwshare/
Queue scheduling
Advanced Routing HOWTO
http://www.ds9a.nl/2.4Routing/
Linux QoS HOWTO
http://www.ittc.ukans.edu/~rsarav/howto/
iproute2+tc
This presentation
Author's Homepage
http://www.gnumonks.org/

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -0,0 +1,23 @@
Quality of Service in IP Networks
IP networks were designed some 25 years ago. Networks based on TCP/IP are
widely deployed, as organization-local Intranets as well as in the Internet
itself. The usage patterns of those networks change. Especially new
technologies like voice-over-IP as well as streaming multimedia applications
have different requirements on the underlying network infrastructure than
bulk data transfers like ftp/www or interactive traffic like telnet/ssh.
Organizations usually run a mixture of different services on their Internet
uplinks or on their organization-internal wide area networks. Bandwidth is
usually a limited resource, so everybody wants to divide bandwidth between
different services according to his specific needs.
Linux always had a very strong focus on network functionality and has
sophisticated means for bandwidth control / QoS since Kernel 2.2.
The presentation is organized in the following parts:
Basics of QoS in IP networks
How can Linux help with QoS
Sample scenarios of Linux-based QoS solutions
Overview of advanced concepts (DiffServ, IntServ, RSVP, ...)

@ -0,0 +1,23 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%deffont "standard" tfont "VERDANA.TTF"
%deffont "standard-i" tfont "VERDANAI.TTF"
%deffont "thick" tfont "ARIBLK.TTF"
%deffont "typewriter" xfont "courier-medium-r", tfont "courbd.ttf", tmfont "wadalab-gothic.ttf"
%%
%% Default settings per each line numbers.
%%
%default 1 leftfill, size 2, fore "white", back "black", font "thick"
%default 1 bimage "fundo-cnc.png" 1024x768
%default 1 pcache 1 1 0 0
%default 2 size 7, vgap 10, prefix " "
%default 3 size 2, bar "midnightblue", vgap 30
%default 4 size 5, fore "lemon chiffon", vgap 30, prefix " ", font "standard"
%%
%% Default settings that are applied to TAB-indented lines.
%%
%tab 1 size 4, vgap 40, prefix " ", icon arc "tomato" 40
%tab 2 size 4, vgap 20, prefix " ", icon box "spring green" 40
%tab 3 size 3, vgap 20, prefix " ", icon delta3 "white" 40
%tab 4 size 3, vgap 20, prefix " ", icon delta3 "white" 40
%%

Binary file not shown.

After

Width:  |  Height:  |  Size: 127 KiB

@ -0,0 +1,397 @@
%include "cnc-style.mgp"
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
%nodefault
%pcache 1 1 0 1
%size 7, font "standard", fore "white", vgap 20, back "black"
%bimage "fundo-cnc.png" 1024x768
%center
%size 7
Quality of Service in IP Networks
%center
%size 4
by
Harald Welte <laforge@conectiva.com>
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Contents
Definition of QoS
Why QoS
IP Networks are not designed for QoS
How to do the impossible
How can Linux based systems help
Advanced Concepts (DiffServ, IntServ, RSVP, ...)
References / Further Reading
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Definition of QoS
Provide Service Differentiation
Performance Assurance by
Bandwidth guarantees
for streaming multimedia traffic
prioritizing certain important applications
Latency guarantees
for voice over IP
for interactive character-oriented applications (ssh,telnet)
Packet-loss guarantees
for unreliable layer-4 protocols
to avoid retransmits
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Why QoS
Decide how and between whom available bandwidth is divided
Limit available bandwidth for certain users / applications
Guarantee bandwidth for certain users / applications
Divide bandwidth more equally between users / applications
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
IP networks not designed for QoS
Properties of IP-based networks:
offer a "best-effort" service
make NO guarantees about
bandwidth
latency
packet loss
provide a non-reliable packet transport
Conclusion: IP networks are not suitable for QoS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
How to do the Impossible
%size 4
As IP Networks including Hardware (Routers, ...) are widely deployed, all QoS efforts have to layer on top of the existing technology.
There's no real solution to control latency
latency widely dependent on routing, which may be dynamic
There's no real solution to control packet loss
packet loss may occur on any intermediate router
But we can control bandwidth usage!
The sender can limit bandwidth for outgoing streams
Intermediate routers BEFORE a bottleneck can control bandwidth usage
%size 5
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
What can Linux systems do?
Bandwidth limiting at the sender application
not many applications support it
server often out of control (on Internet, ...)
server doesn't know what's between him and the client
Bandwidth control on intermediate router before bottleneck
Ideal case because this is where packet loss would occur
Sophisticated queue scheduling on the outgoing queue
Variety of different queue scheduling algorithms
Flow throttling at the Receiver
Worst case, because influence is limited
Theoretically possible for TCP, no implementation yet.
Ingress qdisc might help
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Bandwidth limiting at server
Some Internet Servers support bandwidth limiting
ProFTPd (builtin support)
Apache (using contributed mod_bandwidth)
Using those features it is easy to limit
maximum bandwidth used per connection
maximum bandwidth used per client (IP/network)
maximum bandwidth used by one virtual host (webserver/ftpserver)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Router before bottleneck
%size 4
The router receives more packets on his incoming interface(s) than it can send out on the outgoing interface. It has to build a queue of packets (usually a FIFO one) and starts dropping packets as soon as the queue is full
%image "qos-1.png" 0 100 30
The idea is to change this queue, thus decide
which packets get enqueued in which order
how many packets get queued
which packets get dropped in case of a filling queue
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
The Linux 2.2 / 2.4 Solution
Packet Scheduling algorithms in the Kernel
CBQ - Class Based Queue
RED - Random Early Drop
SFQ - Stochastic Fairness Queueing
TEQL - True Link Equalizer
TBF - Token Bucket Filter
tc command of iproute2 package for configuration
almost no documentation
very few examples on the internet
Packet Classification
tc builtin classes (route, u32, ...)
all iptables/netfilter matches by using fwmark
Conclusion: Linux is the best suited general-purpose operating system for QoS, but almost nobody is using it because of a lack of knowledge.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Available queuing algorithms
CBQ - Class Based Queue
hierarchical bandwidth classes
used as basis in almost all cases
TBF - Token Bucket Filter
really accurate algorithm
uses a lot of CPU
not possible for high bandwidth links (>1MBit)
SFQ - Stochastic Fairness Queueing
less accurate algorithm
tries to distinguish between individual streams
does round robin between those streams
TEQL - True Link Equalizer
allows to 'bundle' interfaces
RED - Random Early Detect / Drop
simulates congested link by statistic packet dropping
uses almost no CPU
recommended for high-bandwidth backbones
others (WRR, TCINDEX, DSMARK, ..)
WRR not officially included in kernel, similar to CBQ
others mostly used for DiffServ
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
The big picture
Overview of a packet's journey
%size 3
%font "typewriter"
Incoming Packets
%size 3
%font "typewriter"
|
%size 3
%font "typewriter"
V
%size 3
%font "typewriter"
Packet Classification classify
%size 3
%font "typewriter"
(ipchains/iptables) set nfmark
%size 3
%font "typewriter"
|
%size 3
%font "typewriter"
V
%size 3
%font "typewriter"
Routing decision
%size 3
%font "typewriter"
|
%size 3
%font "typewriter"
V
%size 3
%font "typewriter"
TC filter select classes based on nfmark
%size 3
%font "typewriter"
/ | \
%size 3
%font "typewriter"
/ | \
%size 3
%font "typewriter"
/ | \
%size 3
%font "typewriter"
Different Bandwidth classes bandwidth classes (CBQ)
%size 3
%font "typewriter"
\ | /
%size 3
%font "typewriter"
\ | /
%size 3
%font "typewriter"
\ | /
%size 3
%font "typewriter"
Enqueuing output queue discipline
%size 3
%font "typewriter"
|
%size 3
%font "typewriter"
V
%size 3
%font "typewriter"
Outgoing packets
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Example scenario using CBQ
%size 4
Let's assume we have a link with 10 MBit maximum available bandwidth.
We offer two major services to the outside world: Anonymous FTP and a Webserver offering important Information.
FTP Bulk data transfers are using up almost all available bandwidth, thus slowing down accesses to our website :(
We want to have FTP transfers use up to 8MBit and reserve 2MBit for WWW.
Implementation uses CBQ for bandwidth divisions.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Example scenario
%size 3
attach a CBQ to the device
%size 3
%font "typewriter"
tc qdisc add dev eth0 root handle 10: cbq
bandwidth 10Mbit avpkt 1000
%size 3
%font "standard"
create CBQ classes
%size 3
%font "typewriter"
tc class add dev eth0 parent 10:0 classid 10:1 cbq
bandwidth 10MBit rate 10MBit allot 1514
weight 1Mbit prio 8 maxburst 20 avpkt 1000
tc class add dev eth0 parent 10:1 classid 10:100 cbq
bandwidth 10MBit rate 8MBit allot 1514
weight 800kbit prio 5 maxburst 20 avpkt 1000 bounded
tc class add dev eth0 parent 10:1 classid 10:200 cbq
bandwidth 10MBit rate 2MBit allot 1514
weight 200kbit prio 5 maxburst 20 avpkt 1000 bounded
%size 3
%font "standard"
add filter rules
%size 3
%font "typewriter"
tc filter add dev eth0 parent 10:1 protocol ip handle 6 fw classid 10:100
tc filter add dev eth0 parent 10:1 protocol ip handle 7 fw classid 10:200
iptables -t mangle -A PREROUTING -j MARK -p tcp --sport 20 --set-mark 6
iptables -t mangle -A PREROUTING -j MARK -p tcp ! --sport 20 --set-mark 7
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Further optimization
%size 4
Now we have achieved bandwidth division between two services.
Within one service, however, one individual user with a high bandwidth link can still use up most of our bandwidth, slowing down other users.
We can improve this behaviour by changing the scheduling algorithm from its default (fifo)
%size 3
%font "typewriter"
tc qdisc add dev eth0 parent 10:100 sfq quantum 1514b perturb 15
tc qdisc add dev eth0 parent 10:200 sfq quantum 1514b perturb 15
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
QoS in IP Networks
Further reading / Links
Bandwidth limiting on Servers
ProFTPd
http://www.proftpd.net/
Apache mod_bandwidth / mod_bwshare
ftp://ftp.cohprog.com/pub/apache/module/mod_bandwidth.c
http://www.topology.org/src/bwshare/
Queue scheduling
Advanced Routing HOWTO
http://www.ds9a.nl/2.4Routing/
Linux QoS HOWTO
http://www.ittc.ukans.edu/~rsarav/howto/
iproute2+tc
This presentation
Author's Homepage
http://www.gnumonks.org/

@ -0,0 +1,611 @@
%!PS-Adobe-2.0 EPSF-2.0
%%Title: /laforge/home/laforge/incoming/qos-1
%%Creator: Dia v0.86
%%CreationDate: Mon Apr 2 16:14:45 2001
%%For: a user
%%Magnification: 1.0000
%%Orientation: Portrait
%%BoundingBox: 0 0 1356 288
%%Pages: 1
%%BeginSetup
%%EndSetup
%%EndComments
% Encoding vector used by the re-encoded "-latin1" fonts defined below.
% The array literal holds 256 glyph names (one per character code) and is
% bound to the name isolatin1encoding by the trailing "exch def".
%   codes   0-31   : /.notdef
%   codes  32-126  : printable ASCII (/space ... /asciitilde)
%   codes 127-159  : /.notdef
%   codes 160-255  : accented letters and symbols (/space ... /ydieresis)
% Note: code 39 maps to /quoteright and code 96 to /quoteleft, and /hyphen
% appears at both code 45 and code 173.
[ /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /space /exclam /quotedbl /numbersign /dollar /percent /ampersand /quoteright
/parenleft /parenright /asterisk /plus /comma /hyphen /period /slash /zero /one
/two /three /four /five /six /seven /eight /nine /colon /semicolon
/less /equal /greater /question /at /A /B /C /D /E
/F /G /H /I /J /K /L /M /N /O
/P /Q /R /S /T /U /V /W /X /Y
/Z /bracketleft /backslash /bracketright /asciicircum /underscore /quoteleft /a /b /c
/d /e /f /g /h /i /j /k /l /m
/n /o /p /q /r /s /t /u /v /w
/x /y /z /braceleft /bar /braceright /asciitilde /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/space /exclamdown /cent /sterling /currency /yen /brokenbar /section /dieresis /copyright
/ordfeminine /guillemotleft /logicalnot /hyphen /registered /macron /degree /plusminus /twosuperior /threesuperior
/acute /mu /paragraph /periodcentered /cedilla /onesuperior /ordmasculine /guillemotright /onequarter /onehalf
/threequarters /questiondown /Agrave /Aacute /Acircumflex /Atilde /Adieresis /Aring /AE /Ccedilla
/Egrave /Eacute /Ecircumflex /Edieresis /Igrave /Iacute /Icircumflex /Idieresis /Eth /Ntilde
/Ograve /Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash /Ugrave /Uacute /Ucircumflex
/Udieresis /Yacute /Thorn /germandbls /agrave /aacute /acircumflex /atilde /adieresis /aring
/ae /ccedilla /egrave /eacute /ecircumflex /edieresis /igrave /iacute /icircumflex /idieresis
/eth /ntilde /ograve /oacute /ocircumflex /otilde /odieresis /divide /oslash /ugrave
/uacute /ucircumflex /udieresis /yacute /thorn /ydieresis] /isolatin1encoding exch def
% Font re-encoding stanzas. Each stanza takes a base font, copies its
% dictionary, swaps in isolatin1encoding as the /Encoding entry, and
% registers the copy under the base name with a "-latin1" suffix.
% All stanzas are identical apart from the font name.
% NOTE(review): presumably emitted verbatim by the exporter named in the
% %%Creator header; hand edits would be lost on re-export - confirm.

% --- Times-Roman ---
% Name under which the re-encoded copy will be registered.
/Times-Roman-latin1
% Look up the base font dictionary.
/Times-Roman findfont
% Open a new dictionary with the same capacity as the base font and make
% it the current dictionary.
dup length dict begin
% Copy every key/value pair of the base font except /FID, which must not
% be carried over into the new font dictionary.
{1 index /FID ne {def} {pop pop} ifelse} forall
% Replace the encoding of the copy.
/Encoding isolatin1encoding def
% Leave the finished dictionary on the stack and close it.
currentdict end
% Register the font; the resulting font object is not needed here.
definefont pop
% --- Times-Italic ---
/Times-Italic-latin1
/Times-Italic findfont
dup length dict begin
{1 index /FID ne {def} {pop pop} ifelse} forall
/Encoding isolatin1encoding def
currentdict end
definefont pop
% --- Times-Bold ---
/Times-Bold-latin1
/Times-Bold findfont
dup length dict begin
{1 index /FID ne {def} {pop pop} ifelse} forall
/Encoding isolatin1encoding def
currentdict end
definefont pop
% --- Times-BoldItalic ---
/Times-BoldItalic-latin1
/Times-BoldItalic findfont
dup length dict begin
{1 index /FID ne {def} {pop pop} ifelse} forall
/Encoding isolatin1encoding def
currentdict end
definefont pop
% --- AvantGarde-Book ---
/AvantGarde-Book-latin1
/AvantGarde-Book findfont
dup length dict begin
{1 index /FID ne {def} {pop pop} ifelse} forall
/Encoding isolatin1encoding def
currentdict end
definefont pop
% --- AvantGarde-BookOblique ---
/AvantGarde-BookOblique-latin1
/AvantGarde-BookOblique findfont
dup length dict begin
{1 index /FID ne {def} {pop pop} ifelse} forall
/Encoding isolatin1encoding def
currentdict end
definefont pop
% --- AvantGarde-Demi ---
/AvantGarde-Demi-latin1
/AvantGarde-Demi findfont
dup length dict begin
{1 index /FID ne {def} {pop pop} ifelse} forall
/Encoding isolatin1encoding def
currentdict end
definefont pop
% --- AvantGarde-DemiOblique ---
/AvantGarde-DemiOblique-latin1
/AvantGarde-DemiOblique findfont
dup length dict begin
{1 index /FID ne {def} {pop pop} ifelse} forall
/Encoding isolatin1encoding def
currentdict end
definefont pop
% --- Bookman-Light ---
/Bookman-Light-latin1
/Bookman-Light findfont
dup length dict begin
{1 index /FID ne {def} {pop pop} ifelse} forall
/Encoding isolatin1encoding def
currentdict end
definefont pop
% --- Bookman-LightItalic ---
/Bookman-LightItalic-latin1
/Bookman-LightItalic findfont
dup length dict begin
{1 index /FID ne {def} {pop pop} ifelse} forall
/Encoding isolatin1encoding def
currentdict end
definefont pop
% --- Bookman-Demi ---
/Bookman-Demi-latin1
/Bookman-Demi findfont
dup length dict begin
{1 index /FID ne {def} {pop pop} ifelse} forall
/Encoding isolatin1encoding def
currentdict end
definefont pop
% --- Bookman-DemiItalic ---
/Bookman-DemiItalic-latin1
/Bookman-DemiItalic findfont
dup length dict begin
{1 index /FID ne {def} {pop pop} ifelse} forall
/Encoding isolatin1encoding def
currentdict end
definefont pop
/Courier-latin1
/Courier findfont