linux-2.6.11.12.sublime-workspace

{
	"auto_complete":
	{
		"selected_items":
		[
		]
	},
	"buffers":
	[
		{
			"contents": "/*\n * INET		An implementation of the TCP/IP protocol suite for the LINUX\n *		operating system.  INET is implemented using the  BSD Socket\n *		interface as the means of communication with the user level.\n *\n *		Implementation of the Transmission Control Protocol(TCP).\n *\n * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $\n *\n * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>\n *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>\n *		Mark Evans, <evansmp@uhura.aston.ac.uk>\n *		Corey Minyard <wf-rch!minyard@relay.EU.net>\n *		Florian La Roche, <flla@stud.uni-sb.de>\n *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>\n *		Linus Torvalds, <torvalds@cs.helsinki.fi>\n *		Alan Cox, <gw4pts@gw4pts.ampr.org>\n *		Matthew Dillon, <dillon@apollo.west.oic.com>\n *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>\n *		Jorge Cwik, <jorge@laser.satlink.net>\n *\n * Fixes:\n *		Alan Cox	:	Numerous verify_area() calls\n *		Alan Cox	:	Set the ACK bit on a reset\n *		Alan Cox	:	Stopped it crashing if it closed while\n *					sk->inuse=1 and was trying to connect\n *					(tcp_err()).\n *		Alan Cox	:	All icmp error handling was broken\n *					pointers passed where wrong and the\n *					socket was looked up backwards. Nobody\n *					tested any icmp error code obviously.\n *		Alan Cox	:	tcp_err() now handled properly. It\n *					wakes people on errors. poll\n *					behaves and the icmp error race\n *					has gone by moving it into sock.c\n *		Alan Cox	:	tcp_send_reset() fixed to work for\n *					everything not just packets for\n *					unknown sockets.\n *		Alan Cox	:	tcp option processing.\n *		Alan Cox	:	Reset tweaked (still not 100%) [Had\n *					syn rule wrong]\n *		Herp Rosmanith  :	More reset fixes\n *		Alan Cox	:	No longer acks invalid rst frames.\n *					Acking any kind of RST is right out.\n *		Alan Cox	:	Sets an ignore me flag on an rst\n *					receive otherwise odd bits of prattle\n *					escape still\n *		Alan Cox	:	Fixed another acking RST frame bug.\n *					Should stop LAN workplace lockups.\n *		Alan Cox	: 	Some tidyups using the new skb list\n *					facilities\n *		Alan Cox	:	sk->keepopen now seems to work\n *		Alan Cox	:	Pulls options out correctly on accepts\n *		Alan Cox	:	Fixed assorted sk->rqueue->next errors\n *		Alan Cox	:	PSH doesn't end a TCP read. Switched a\n *					bit to skb ops.\n *		Alan Cox	:	Tidied tcp_data to avoid a potential\n *					nasty.\n *		Alan Cox	:	Added some better commenting, as the\n *					tcp is hard to follow\n *		Alan Cox	:	Removed incorrect check for 20 * psh\n *	Michael O'Reilly	:	ack < copied bug fix.\n *	Johannes Stille		:	Misc tcp fixes (not all in yet).\n *		Alan Cox	:	FIN with no memory -> CRASH\n *		Alan Cox	:	Added socket option proto entries.\n *					Also added awareness of them to accept.\n *		Alan Cox	:	Added TCP options (SOL_TCP)\n *		Alan Cox	:	Switched wakeup calls to callbacks,\n *					so the kernel can layer network\n *					sockets.\n *		Alan Cox	:	Use ip_tos/ip_ttl settings.\n *		Alan Cox	:	Handle FIN (more) properly (we hope).\n *		Alan Cox	:	RST frames sent on unsynchronised\n *					state ack error.\n *		Alan Cox	:	Put in missing check for SYN bit.\n *		Alan Cox	:	Added tcp_select_window() aka NET2E\n *					window non shrink trick.\n *		Alan Cox	:	Added a couple of small NET2E timer\n *					fixes\n *		Charles Hedrick :	TCP fixes\n *		Toomas Tamm	:	TCP window fixes\n *		Alan Cox	:	Small URG fix to rlogin ^C ack fight\n *		Charles Hedrick	:	Rewrote most of it to actually work\n *		Linus		:	Rewrote tcp_read() and URG handling\n *					completely\n *		Gerhard Koerting:	Fixed some missing timer handling\n *		Matthew Dillon  :	Reworked TCP machine states as per RFC\n *		Gerhard Koerting:	PC/TCP workarounds\n *		Adam Caldwell	:	Assorted timer/timing errors\n *		Matthew Dillon	:	Fixed another RST bug\n *		Alan Cox	:	Move to kernel side addressing changes.\n *		Alan Cox	:	Beginning work on TCP fastpathing\n *					(not yet usable)\n *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.\n *		Alan Cox	:	TCP fast path debugging\n *		Alan Cox	:	Window clamping\n *		Michael Riepe	:	Bug in tcp_check()\n *		Matt Dillon	:	More TCP improvements and RST bug fixes\n *		Matt Dillon	:	Yet more small nasties remove from the\n *					TCP code (Be very nice to this man if\n *					tcp finally works 100%) 8)\n *		Alan Cox	:	BSD accept semantics.\n *		Alan Cox	:	Reset on closedown bug.\n *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().\n *		Michael Pall	:	Handle poll() after URG properly in\n *					all cases.\n *		Michael Pall	:	Undo the last fix in tcp_read_urg()\n *					(multi URG PUSH broke rlogin).\n *		Michael Pall	:	Fix the multi URG PUSH problem in\n *					tcp_readable(), poll() after URG\n *					works now.\n *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the\n *					BSD api.\n *		Alan Cox	:	Changed the semantics of sk->socket to\n *					fix a race and a signal problem with\n *					accept() and async I/O.\n *		Alan Cox	:	Relaxed the rules on tcp_sendto().\n *		Yury Shevchuk	:	Really fixed accept() blocking problem.\n *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for\n *					clients/servers which listen in on\n *					fixed ports.\n *		Alan Cox	:	Cleaned the above up and shrank it to\n *					a sensible code size.\n *		Alan Cox	:	Self connect lockup fix.\n *		Alan Cox	:	No connect to multicast.\n *		Ross Biro	:	Close unaccepted children on master\n *					socket close.\n *		Alan Cox	:	Reset tracing code.\n *		Alan Cox	:	Spurious resets on shutdown.\n *		Alan Cox	:	Giant 15 minute/60 second timer error\n *		Alan Cox	:	Small whoops in polling before an\n *					accept.\n *		Alan Cox	:	Kept the state trace facility since\n *					it's handy for debugging.\n *		Alan Cox	:	More reset handler fixes.\n *		Alan Cox	:	Started rewriting the code based on\n *					the RFC's for other useful protocol\n *					references see: Comer, KA9Q NOS, and\n *					for a reference on the difference\n *					between specifications and how BSD\n *					works see the 4.4lite source.\n *		A.N.Kuznetsov	:	Don't time wait on completion of tidy\n *					close.\n *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.\n *		Linus Torvalds	:	Fixed BSD port reuse to work first syn\n *		Alan Cox	:	Reimplemented timers as per the RFC\n *					and using multiple timers for sanity.\n *		Alan Cox	:	Small bug fixes, and a lot of new\n *					comments.\n *		Alan Cox	:	Fixed dual reader crash by locking\n *					the buffers (much like datagram.c)\n *		Alan Cox	:	Fixed stuck sockets in probe. A probe\n *					now gets fed up of retrying without\n *					(even a no space) answer.\n *		Alan Cox	:	Extracted closing code better\n *		Alan Cox	:	Fixed the closing state machine to\n *					resemble the RFC.\n *		Alan Cox	:	More 'per spec' fixes.\n *		Jorge Cwik	:	Even faster checksumming.\n *		Alan Cox	:	tcp_data() doesn't ack illegal PSH\n *					only frames. At least one pc tcp stack\n *					generates them.\n *		Alan Cox	:	Cache last socket.\n *		Alan Cox	:	Per route irtt.\n *		Matt Day	:	poll()->select() match BSD precisely on error\n *		Alan Cox	:	New buffers\n *		Marc Tamsky	:	Various sk->prot->retransmits and\n *					sk->retransmits misupdating fixed.\n *					Fixed tcp_write_timeout: stuck close,\n *					and TCP syn retries gets used now.\n *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an\n *					ack if state is TCP_CLOSED.\n *		Alan Cox	:	Look up device on a retransmit - routes may\n *					change. Doesn't yet cope with MSS shrink right\n *					but it's a start!\n *		Marc Tamsky	:	Closing in closing fixes.\n *		Mike Shaver	:	RFC1122 verifications.\n *		Alan Cox	:	rcv_saddr errors.\n *		Alan Cox	:	Block double connect().\n *		Alan Cox	:	Small hooks for enSKIP.\n *		Alexey Kuznetsov:	Path MTU discovery.\n *		Alan Cox	:	Support soft errors.\n *		Alan Cox	:	Fix MTU discovery pathological case\n *					when the remote claims no mtu!\n *		Marc Tamsky	:	TCP_CLOSE fix.\n *		Colin (G3TNE)	:	Send a reset on syn ack replies in\n *					window but wrong (fixes NT lpd problems)\n *		Pedro Roque	:	Better TCP window handling, delayed ack.\n *		Joerg Reuter	:	No modification of locked buffers in\n *					tcp_do_retransmit()\n *		Eric Schenk	:	Changed receiver side silly window\n *					avoidance algorithm to BSD style\n *					algorithm. This doubles throughput\n *					against machines running Solaris,\n *					and seems to result in general\n *					improvement.\n *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD\n *	Willy Konynenberg	:	Transparent proxying support.\n *	Mike McLagan		:	Routing by source\n *		Keith Owens	:	Do proper merging with partial SKB's in\n *					tcp_do_sendmsg to avoid burstiness.\n *		Eric Schenk	:	Fix fast close down bug with\n *					shutdown() followed by close().\n *		Andi Kleen 	:	Make poll agree with SIGIO\n *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and\n *					lingertime == 0 (RFC 793 ABORT Call)\n *	Hirokazu Takahashi	:	Use copy_from_user() instead of\n *					csum_and_copy_from_user() if possible.\n *\n *		This program is free software; you can redistribute it and/or\n *		modify it under the terms of the GNU General Public License\n *		as published by the Free Software Foundation; either version\n *		2 of the License, or(at your option) any later version.\n *\n * Description of States:\n *\n *	TCP_SYN_SENT		sent a connection request, waiting for ack\n *\n *	TCP_SYN_RECV		received a connection request, sent ack,\n *				waiting for final ack in three-way handshake.\n *\n *	TCP_ESTABLISHED		connection established\n *\n *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete\n *				transmission of remaining buffered data\n *\n *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote\n *				to shutdown\n *\n *	TCP_CLOSING		both sides have shutdown but we still have\n *				data we have to finish sending\n *\n *	TCP_TIME_WAIT		timeout to catch resent junk before entering\n *				closed, can only be entered from FIN_WAIT2\n *				or CLOSING.  Required because the other end\n *				may not have gotten our last ACK causing it\n *				to retransmit the data packet (which we ignore)\n *\n *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for\n *				us to finish writing our data and to shutdown\n *				(we have to close() to move on to LAST_ACK)\n *\n *	TCP_LAST_ACK		out side has shutdown after remote has\n *				shutdown.  There may still be data in our\n *				buffer that we have to finish sending\n *\n *	TCP_CLOSE		socket is finished\n */\n\n#include <linux/config.h>\n#include <linux/module.h>\n#include <linux/types.h>\n#include <linux/fcntl.h>\n#include <linux/poll.h>\n#include <linux/init.h>\n#include <linux/smp_lock.h>\n#include <linux/fs.h>\n#include <linux/random.h>\n#include <linux/bootmem.h>\n\n#include <net/icmp.h>\n#include <net/tcp.h>\n#include <net/xfrm.h>\n#include <net/ip.h>\n\n\n#include <asm/uaccess.h>\n#include <asm/ioctls.h>\n\n/* 对于本端断开的套接口连接，保持在FIN_WAIT_2状态的时间。每个FIN_WAIT_2状态的连接消耗约1.5K的内存 */\nint sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;\n\nDEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);\n\nkmem_cache_t *tcp_openreq_cachep;\nkmem_cache_t *tcp_bucket_cachep;\nkmem_cache_t *tcp_timewait_cachep;\n\n/**\n * TCP传输层中待销毁的套接口数目。\n */\natomic_t tcp_orphan_count = ATOMIC_INIT(0);\n\n/**\n * 三个内存控制值。对应于low，pressure，high三个阀值。\n *	low:	当TCP使用的内存页面数量低于此值时，TCP不释放内存，且总能分配成功。\n *	pressure:	当TCP使用的内存数量超过该值时，进入告警状态。分配内存会根据参数来永定本次分配是否成功。\n *	high:	一旦已经分配的缓冲区总大小超出该值，会根据情况对发送和接收缓存做具体的确认。	\n */\nint sysctl_tcp_mem[3];\n/**\n * 发送缓冲区控制 \n *		min:		发送队列总长度的上限\n *		default:	发送缓冲区长度上限的初始值。用于初始化sock结构的sk_sndbuf。\n *		max:		发送缓冲区长度上限的最大值。\n */\nint sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };\n/**\n * 接收缓存阀值。\n */\nint sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };\n\nEXPORT_SYMBOL(sysctl_tcp_mem);\nEXPORT_SYMBOL(sysctl_tcp_rmem);\nEXPORT_SYMBOL(sysctl_tcp_wmem);\n\natomic_t tcp_memory_allocated;	/* Current allocated memory. */\natomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */\n\nEXPORT_SYMBOL(tcp_memory_allocated);\nEXPORT_SYMBOL(tcp_sockets_allocated);\n\n/*\n * Pressure flag: try to collapse.\n * Technical note: it is used by multiple contexts non atomically.\n * All the sk_stream_mem_schedule() is of this nature: accounting\n * is strict, actions are advisory and have some latency.\n */\nint tcp_memory_pressure;\n\nEXPORT_SYMBOL(tcp_memory_pressure);\n\n/* 当TCP内存分配进入告警状态时，调用此函数设置告警标志 */\nvoid tcp_enter_memory_pressure(void)\n{\n	if (!tcp_memory_pressure) {\n		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);\n		tcp_memory_pressure = 1;\n	}\n}\n\nEXPORT_SYMBOL(tcp_enter_memory_pressure);\n\n/*\n * LISTEN is a special case for poll..\n */\nstatic __inline__ unsigned int tcp_listen_poll(struct sock *sk,\n					       poll_table *wait)\n{\n	return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;\n}\n\n/*\n *	Wait for a TCP event.\n *\n *	Note that we don't need to lock the socket, as the upper poll layers\n *	take care of normal races (between the test and the event) and we don't\n *	go look at any of the socket buffers directly.\n */\nunsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)\n{\n	unsigned int mask;\n	struct sock *sk = sock->sk;\n	struct tcp_sock *tp = tcp_sk(sk);\n\n	poll_wait(file, sk->sk_sleep, wait);\n	if (sk->sk_state == TCP_LISTEN)\n		return tcp_listen_poll(sk, wait);\n\n	/* Socket is not locked. We are protected from async events\n	   by poll logic and correct handling of state changes\n	   made by another threads is impossible in any case.\n	 */\n\n	mask = 0;\n	if (sk->sk_err)\n		mask = POLLERR;\n\n	/*\n	 * POLLHUP is certainly not done right. But poll() doesn't\n	 * have a notion of HUP in just one direction, and for a\n	 * socket the read side is more interesting.\n	 *\n	 * Some poll() documentation says that POLLHUP is incompatible\n	 * with the POLLOUT/POLLWR flags, so somebody should check this\n	 * all. But careful, it tends to be safer to return too many\n	 * bits than too few, and you can easily break real applications\n	 * if you don't tell them that something has hung up!\n	 *\n	 * Check-me.\n	 *\n	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and\n	 * our fs/select.c). It means that after we received EOF,\n	 * poll always returns immediately, making impossible poll() on write()\n	 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP\n	 * if and only if shutdown has been made in both directions.\n	 * Actually, it is interesting to look how Solaris and DUX\n	 * solve this dilemma. I would prefer, if PULLHUP were maskable,\n	 * then we could set it on SND_SHUTDOWN. BTW examples given\n	 * in Stevens' books assume exactly this behaviour, it explains\n	 * why PULLHUP is incompatible with POLLOUT.	--ANK\n	 *\n	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent\n	 * blocking on fresh not-connected or disconnected socket. --ANK\n	 */\n	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)\n		mask |= POLLHUP;\n	if (sk->sk_shutdown & RCV_SHUTDOWN)\n		mask |= POLLIN | POLLRDNORM;\n\n	/* Connected? */\n	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {\n		/* Potential race condition. If read of tp below will\n		 * escape above sk->sk_state, we can be illegally awaken\n		 * in SYN_* states. */\n		if ((tp->rcv_nxt != tp->copied_seq) &&\n		    (tp->urg_seq != tp->copied_seq ||\n		     tp->rcv_nxt != tp->copied_seq + 1 ||\n		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))\n			mask |= POLLIN | POLLRDNORM;\n\n		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {\n			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {\n				mask |= POLLOUT | POLLWRNORM;\n			} else {  /* send SIGIO later */\n				set_bit(SOCK_ASYNC_NOSPACE,\n					&sk->sk_socket->flags);\n				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);\n\n				/* Race breaker. If space is freed after\n				 * wspace test but before the flags are set,\n				 * IO signal will be lost.\n				 */\n				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))\n					mask |= POLLOUT | POLLWRNORM;\n			}\n		}\n\n		if (tp->urg_data & TCP_URG_VALID)\n			mask |= POLLPRI;\n	}\n	return mask;\n}\n\n/* TCP的ioctl实现 */\nint tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)\n{\n	struct tcp_sock *tp = tcp_sk(sk);\n	int answ;\n\n	switch (cmd) {\n	case SIOCINQ:/* 获取接收缓存中的未读的数据量。 */\n		if (sk->sk_state == TCP_LISTEN)/* 如果在listen状态，返回错误。 */\n			return -EINVAL;\n\n		lock_sock(sk);\n		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))\n			answ = 0;\n		else if (sock_flag(sk, SOCK_URGINLINE) ||\n			 !tp->urg_data ||\n			 before(tp->urg_seq, tp->copied_seq) ||\n			 !before(tp->urg_seq, tp->rcv_nxt)) {\n			answ = tp->rcv_nxt - tp->copied_seq;\n\n			/* Subtract 1, if FIN is in queue. */\n			if (answ && !skb_queue_empty(&sk->sk_receive_queue))\n				answ -=\n		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;\n		} else\n			answ = tp->urg_seq - tp->copied_seq;\n		release_sock(sk);\n		break;\n	case SIOCATMARK:/* 检测带外数据是否已经被用户进程接收 */\n		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;\n		break;\n	case SIOCOUTQ:/* 获取在发送队列缓存中未发送出去的数据量。 */\n		if (sk->sk_state == TCP_LISTEN)\n			return -EINVAL;\n\n		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))\n			answ = 0;\n		else\n			answ = tp->write_seq - tp->snd_una;\n		break;\n	default:\n		return -ENOIOCTLCMD;\n	};\n\n	return put_user(answ, (int __user *)arg);\n}\n\n\nint tcp_listen_start(struct sock *sk)\n{\n	struct inet_sock *inet = inet_sk(sk);\n	struct tcp_sock *tp = tcp_sk(sk);\n	struct tcp_listen_opt *lopt;\n\n	/* 初始连接队列长度上限 */\n	sk->sk_max_ack_backlog = 0;\n	sk->sk_ack_backlog = 0;\n	tp->accept_queue = tp->accept_queue_tail = NULL;\n	/* 初始化传输控制块中与延时发送ACK有关的数据结构 */\n	rwlock_init(&tp->syn_wait_lock);\n	tcp_delack_init(tp);\n\n	/* 为管理连接请求块的散列表分配存储空间，如果失败则退出 */\n	lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);\n	if (!lopt)\n		return -ENOMEM;\n\n	memset(lopt, 0, sizeof(struct tcp_listen_opt));\n	for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)\n		if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)\n			break;\n	/* 计算哈希表的哈希种子 */\n	get_random_bytes(&lopt->hash_rnd, 4);\n\n	/* 将散列块与传输控制块绑定 */\n	write_lock_bh(&tp->syn_wait_lock);\n	tp->listen_opt = lopt;\n	write_unlock_bh(&tp->syn_wait_lock);\n\n	/* There is race window here: we announce ourselves listening,\n	 * but this transition is still not validated by get_port().\n	 * It is OK, because this socket enters to hash table only\n	 * after validation is complete.\n	 */\n	sk->sk_state = TCP_LISTEN;/* 设置控制块的状态 */\n	if (!sk->sk_prot->get_port(sk, inet->num)) {/* 进行端口绑定 */\n		inet->sport = htons(inet->num);/* 设置网络字节序的端口号 */\n\n		sk_dst_reset(sk);/* 清除路由缓存 */\n		sk->sk_prot->hash(sk);/* 将传输控制块添加到侦听散列表中 */\n\n		return 0;\n	}\n\n	/* 绑定失败，设置其状态 */\n	sk->sk_state = TCP_CLOSE;\n	/* 解除侦听连接请求块与传输控制块的绑定 */\n	write_lock_bh(&tp->syn_wait_lock);\n	tp->listen_opt = NULL;\n	write_unlock_bh(&tp->syn_wait_lock);\n	kfree(lopt);/* 释放侦听连接请求块 */\n	return -EADDRINUSE;\n}\n\n/*\n *	This routine closes sockets which have been at least partially\n *	opened, but not yet accepted.\n */\n/* 关闭套接口时，终止侦听端口 */\nstatic void tcp_listen_stop (struct sock *sk)\n{\n	struct tcp_sock *tp = tcp_sk(sk);\n	struct tcp_listen_opt *lopt = tp->listen_opt;\n	struct open_request *acc_req = tp->accept_queue;\n	struct open_request *req;\n	int i;\n\n	/* 停止sk_timer定时器 */\n	tcp_delete_keepalive_timer(sk);\n\n	/* make all the listen_opt local to us */\n	write_lock_bh(&tp->syn_wait_lock);/* 设置listen_opt后，应当不会再接受新的连接请求了 */\n	tp->listen_opt = NULL;\n	write_unlock_bh(&tp->syn_wait_lock);\n	tp->accept_queue = tp->accept_queue_tail = NULL;\n\n	if (lopt->qlen) {/* 请求连接者数量大于0 */\n		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {/* 遍历半连接哈希表 */\n			while ((req = lopt->syn_table[i]) != NULL) {/* 遍历链表中的半连接 */\n				lopt->syn_table[i] = req->dl_next;\n				lopt->qlen--;\n				tcp_openreq_free(req);/* 关闭半连接 */\n\n		/* Following specs, it would be better either to send FIN\n		 * (and enter FIN-WAIT-1, it is normal close)\n		 * or to send active reset (abort).\n		 * Certainly, it is pretty dangerous while synflood, but it is\n		 * bad justification for our negligence 8)\n		 * To be honest, we are not able to make either\n		 * of the variants now.			--ANK\n		 */\n			}\n		}\n	}\n	BUG_TRAP(!lopt->qlen);\n\n	kfree(lopt);\n\n	while ((req = acc_req) != NULL) {/* 已经连接但是还没有被accept的连接 */\n		struct sock *child = req->sk;\n\n		acc_req = req->dl_next;\n\n		local_bh_disable();\n		bh_lock_sock(child);\n		BUG_TRAP(!sock_owned_by_user(child));\n		sock_hold(child);\n\n		/* 断开已经建立连接但是还没有被accept的连接 */\n		tcp_disconnect(child, O_NONBLOCK);\n\n		sock_orphan(child);\n\n		atomic_inc(&tcp_orphan_count);\n\n		tcp_destroy_sock(child);\n\n		bh_unlock_sock(child);\n		local_bh_enable();\n		sock_put(child);\n\n		sk_acceptq_removed(sk);\n		tcp_openreq_fastfree(req);\n	}\n	BUG_TRAP(!sk->sk_ack_backlog);\n}\n\nstatic inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)\n{\n	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;\n	tp->pushed_seq = tp->write_seq;\n}\n\nstatic inline int forced_push(struct tcp_sock *tp)\n{\n	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));\n}\n\nstatic inline void skb_entail(struct sock *sk, struct tcp_sock *tp,\n			      struct sk_buff *skb)\n{\n	skb->csum = 0;\n	TCP_SKB_CB(skb)->seq = tp->write_seq;\n	TCP_SKB_CB(skb)->end_seq = tp->write_seq;\n	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;\n	TCP_SKB_CB(skb)->sacked = 0;\n	__skb_queue_tail(&sk->sk_write_queue, skb);\n	sk_charge_skb(sk, skb);\n	if (!sk->sk_send_head)\n		sk->sk_send_head = skb;\n	else if (tp->nonagle&TCP_NAGLE_PUSH)\n		tp->nonagle &= ~TCP_NAGLE_PUSH; \n}\n\nstatic inline void tcp_mark_urg(struct tcp_sock *tp, int flags,\n				struct sk_buff *skb)\n{\n	if (flags & MSG_OOB) {\n		tp->urg_mode = 1;\n		tp->snd_up = tp->write_seq;\n		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;\n	}\n}\n\n/* 增加PSH标志后将报文发送出去 */\nstatic inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,\n			    int mss_now, int nonagle)\n{\n	if (sk->sk_send_head) {\n		struct sk_buff *skb = sk->sk_write_queue.prev;\n		if (!(flags & MSG_MORE) || forced_push(tp))\n			tcp_mark_push(tp, skb);\n		tcp_mark_urg(tp, flags, skb);\n		__tcp_push_pending_frames(sk, tp, mss_now,\n					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);\n	}\n}\n\nstatic ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,\n			 size_t psize, int flags)\n{\n	struct tcp_sock *tp = tcp_sk(sk);\n	int mss_now;\n	int err;\n	ssize_t copied;\n	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);\n\n	/* Wait for a connection to finish. */\n	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))\n		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)\n			goto out_err;\n\n	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);\n\n	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));\n	copied = 0;\n\n	err = -EPIPE;\n	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))\n		goto do_error;\n\n	while (psize > 0) {\n		struct sk_buff *skb = sk->sk_write_queue.prev;\n		struct page *page = pages[poffset / PAGE_SIZE];\n		int copy, i, can_coalesce;\n		int offset = poffset % PAGE_SIZE;\n		int size = min_t(size_t, psize, PAGE_SIZE - offset);\n\n		if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {\nnew_segment:\n			if (!sk_stream_memory_free(sk))\n				goto wait_for_sndbuf;\n\n			skb = sk_stream_alloc_pskb(sk, 0, 0,\n						   sk->sk_allocation);\n			if (!skb)\n				goto wait_for_memory;\n\n			skb_entail(sk, tp, skb);\n			copy = mss_now;\n		}\n\n		if (copy > size)\n			copy = size;\n\n		i = skb_shinfo(skb)->nr_frags;\n		can_coalesce = skb_can_coalesce(skb, i, page, offset);\n		if (!can_coalesce && i >= MAX_SKB_FRAGS) {\n			tcp_mark_push(tp, skb);\n			goto new_segment;\n		}\n		if (sk->sk_forward_alloc < copy &&\n		    !sk_stream_mem_schedule(sk, copy, 0))\n			goto wait_for_memory;\n		\n		if (can_coalesce) {\n			skb_shinfo(skb)->frags[i - 1].size += copy;\n		} else {\n			get_page(page);\n			skb_fill_page_desc(skb, i, page, offset, copy);\n		}\n\n		skb->len += copy;\n		skb->data_len += copy;\n		skb->truesize += copy;\n		sk->sk_wmem_queued += copy;\n		sk->sk_forward_alloc -= copy;\n		skb->ip_summed = CHECKSUM_HW;\n		tp->write_seq += copy;\n		TCP_SKB_CB(skb)->end_seq += copy;\n		skb_shinfo(skb)->tso_segs = 0;\n\n		if (!copied)\n			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;\n\n		copied += copy;\n		poffset += copy;\n		if (!(psize -= copy))\n			goto out;\n\n		if (skb->len != mss_now || (flags & MSG_OOB))\n			continue;\n\n		if (forced_push(tp)) {\n			tcp_mark_push(tp, skb);\n			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);\n		} else if (skb == sk->sk_send_head)\n			tcp_push_one(sk, mss_now);\n		continue;\n\nwait_for_sndbuf:\n		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);\nwait_for_memory:\n		if (copied)\n			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);\n\n		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)\n			goto do_error;\n\n		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));\n	}\n\nout:\n	if (copied)\n		tcp_push(sk, tp, flags, mss_now, tp->nonagle);\n	return copied;\n\ndo_error:\n	if (copied)\n		goto out;\nout_err:\n	return sk_stream_error(sk, flags, err);\n}\n\nssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,\n		     size_t size, int flags)\n{\n	ssize_t res;\n	struct sock *sk = sock->sk;\n\n#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)\n\n	if (!(sk->sk_route_caps & NETIF_F_SG) ||\n	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))\n		return sock_no_sendpage(sock, page, offset, size, flags);\n\n#undef TCP_ZC_CSUM_FLAGS\n\n	lock_sock(sk);\n	TCP_CHECK_TIMER(sk);\n	res = do_tcp_sendpages(sk, &page, offset, size, flags);\n	TCP_CHECK_TIMER(sk);\n	release_sock(sk);\n	return res;\n}\n\n#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)\n#define TCP_OFF(sk)	(sk->sk_sndmsg_off)\n\nstatic inline int select_size(struct sock *sk, struct tcp_sock *tp)\n{\n	int tmp = tp->mss_cache_std;\n\n	if (sk->sk_route_caps & NETIF_F_SG) {\n		int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);\n\n		if (tmp >= pgbreak &&\n		    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)\n			tmp = pgbreak;\n	}\n	return tmp;\n}\n\n/* sendmsg系统调用在TCP层的实现 */\nint tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,\n		size_t size)\n{\n	struct iovec *iov;\n	struct tcp_sock *tp = tcp_sk(sk);\n	struct sk_buff *skb;\n	int iovlen, flags;\n	int mss_now;\n	int err, copied;\n	long timeo;\n\n	/* 获取套接口的锁 */\n	lock_sock(sk);\n	TCP_CHECK_TIMER(sk);\n\n	/* 根据标志计算阻塞超时时间 */\n	flags = msg->msg_flags;\n	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);\n\n	/* Wait for a connection to finish. */\n	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))/* 只有这两种状态才能发送消息 */\n		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)/* 其它状态下等待连接正确建立，超时则进行错误处理 */\n			goto out_err;\n\n	/* This should be in poll */\n	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);\n\n	/* 获得有效的MSS，如果支持OOB，则不能支持TSO，MSS则应当是比较小的值 */\n	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));\n\n	/* Ok commence sending. */\n	/* 获取待发送数据块数及数据块指针 */\n	iovlen = msg->msg_iovlen;\n	iov = msg->msg_iov;\n	/* copied表示从用户数据块复制到skb中的字节数。 */\n	copied = 0;\n\n	err = -EPIPE;\n	/* 如果套接口存在错误，则不允许发送数据，返回EPIPE错误 */\n	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))\n		goto do_error;\n\n	while (--iovlen >= 0) {/* 处理所有待发送数据块 */\n		int seglen = iov->iov_len;\n		unsigned char __user *from = iov->iov_base;\n\n		iov++;\n\n		while (seglen > 0) {/* 处理单个数据块中的所有数据 */\n			int copy;\n\n			skb = sk->sk_write_queue.prev;\n\n			if (!sk->sk_send_head ||/* 发送队列为空，前面取得的skb无效 */\n			    (copy = mss_now - skb->len) <= 0) {/* 如果skb有效，但是它已经没有多余的空间复制新数据了 */\n\nnew_segment:\n				/* Allocate new segment. If the interface is SG,\n				 * allocate skb fitting to single page.\n				 */\n				if (!sk_stream_memory_free(sk))/* 发送队列中数据长度达到发送缓冲区的上限，等待缓冲区 */\n					goto wait_for_sndbuf;\n\n				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),\n							   0, sk->sk_allocation);/* 分配新的skb */\n				if (!skb)/* 分配失败，说明系统内存不足，等待 */\n					goto wait_for_memory;\n\n				/*\n				 * Check whether we can use HW checksum.\n				 */\n				if (sk->sk_route_caps &\n				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |\n				     NETIF_F_HW_CSUM))/* 根据路由网络设备的特性，确定是否由硬件执行校验和 */\n					skb->ip_summed = CHECKSUM_HW;\n\n				skb_entail(sk, tp, skb);/* 将SKB添加到发送队列尾部 */\n				copy = mss_now;/* 本次需要复制的数据量是MSS */\n			}\n\n			/* Try to append data to the end of skb. */\n			if (copy > seglen)/* 要复制的数据不能大于当前段的长度 */\n				copy = seglen;\n\n			/* Where to copy to? */\n			if (skb_tailroom(skb) > 0) {/* skb线性存储区底部还有空间 */\n				/* We have some space in skb head. Superb! */\n				if (copy > skb_tailroom(skb))/* 本次只复制skb存储区底部剩余空间大小的数据量 */\n					copy = skb_tailroom(skb);\n				/* 从用户空间复制指定长度的数据到skb中，如果失败，则退出 */\n				if ((err = skb_add_data(skb, from, copy)) != 0)\n					goto do_fault;\n			} else {/* 线性存储区底部已经没有空间了，复制到分散/聚集存储区中 */\n				int merge = 0;/* 是否在页中添加数据 */\n				int i = skb_shinfo(skb)->nr_frags;/* 分散/聚集片断数 */\n				struct page *page = TCP_PAGE(sk);/* 分片页页 */\n				int off = TCP_OFF(sk);/* 分片内的偏移 */\n\n				if (skb_can_coalesce(skb, i, page, off) &&\n				    off != PAGE_SIZE) {/* 当前分片还能添加数据 */\n					/* We can extend the last page\n					 * fragment. */\n					merge = 1;\n				} else if (i == MAX_SKB_FRAGS ||/* 目前skb中的页不能添加数据，这里判断是否能再分配页 */\n					   (!i &&\n					   !(sk->sk_route_caps & NETIF_F_SG))) {/* 网卡不支持S/G，不能分片 */\n					/* Need to add new fragment and cannot\n					 * do this because interface is non-SG,\n					 * or because all the page slots are\n					 * busy. */\n					tcp_mark_push(tp, skb);/* SKB可以提交了 */\n					goto new_segment;/* 重新分配skb */\n				} else if (page) {/* 分页数量未达到上限，判断当前页是否还有空间 */\n					/* If page is cached, align\n					 * offset to L1 cache boundary\n					 */\n					off = (off + L1_CACHE_BYTES - 1) &\n					      ~(L1_CACHE_BYTES - 1);\n					if (off == PAGE_SIZE) {/* 最后一个分页数据已经满，需要分配新页 */\n						put_page(page);\n						TCP_PAGE(sk) = page = NULL;\n					}\n				}\n\n				if (!page) {/* 需要分配新页 */\n					/* Allocate new cache page. */\n					if (!(page = sk_stream_alloc_page(sk)))/* 分配新页，如果内存不足则等待内存 */\n						goto wait_for_memory;\n					off = 0;\n				}\n\n				if (copy > PAGE_SIZE - off)/* 待复制的数据不能大于页中剩余空间 */\n					copy = PAGE_SIZE - off;\n\n				/* Time to copy data. We are close to\n				 * the end! */\n				err = skb_copy_to_page(sk, from, skb, page,\n						       off, copy);/* 从用户态复制数据到页中 */\n				if (err) {/* 复制失败了 */\n					/* If this page was new, give it to the\n					 * socket so it does not get leaked.\n					 */\n					if (!TCP_PAGE(sk)) {/* 如果是新分配的页，则将页记录到skb中，供今后使用 */\n						TCP_PAGE(sk) = page;\n						TCP_OFF(sk) = 0;\n					}\n					goto do_error;\n				}\n\n				/* Update the skb. */\n				/* 更新skb的分段信息 */\n				if (merge) {/* 在最后一个页中追加数据 */\n					skb_shinfo(skb)->frags[i - 1].size +=\n									copy;/* 更新最后一页的数据长度 */\n				} else {/* 新分配的页 */\n					/* 更新skb中分片信息 */\n					skb_fill_page_desc(skb, i, page, off, copy);\n					if (TCP_PAGE(sk)) {\n						get_page(page);\n					} else if (off + copy < PAGE_SIZE) {\n						get_page(page);\n						TCP_PAGE(sk) = page;\n					}\n				}\n\n				/* 更新页内偏移 */\n				TCP_OFF(sk) = off + copy;\n			}\n\n			if (!copied)/* 如果没有复制数据，则取消PSH标志 */\n				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;\n\n			tp->write_seq += copy;/* 更新发送队列最后一个包的序号 */\n			TCP_SKB_CB(skb)->end_seq += copy;/* 更新skb的序号 */\n			skb_shinfo(skb)->tso_segs = 0;\n\n			/* 更新数据复制的指针 */\n			from += copy;\n			copied += copy;\n			/* 如果所有数据已经复制完毕则退出 */\n			if ((seglen -= copy) == 0 && iovlen == 0)\n				goto out;\n\n			/* 如果当前skb中的数据小于mss，说明可以往里面继续复制数据。或者发送的是OOB数据，则也跳过发送过程，继续复制数据 */\n			if (skb->len != mss_now || (flags & MSG_OOB))\n				continue;\n\n			if (forced_push(tp)) {/* 必须立即发送数据，即上次发送后产生的数据已经超过通告窗口值的一半 */\n				/* 设置PSH标志后发送数据 */\n				tcp_mark_push(tp, skb);\n				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);\n			} else if (skb == sk->sk_send_head)/* 虽然不是必须发送数据，但是发送队列上只存在当前段，也将其发送出去 */\n				tcp_push_one(sk, mss_now);\n			continue;\n\nwait_for_sndbuf:\n			/* 由于发送队列满的原因导致等待 */\n			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);\nwait_for_memory:\n			if (copied)/* 虽然没有内存了，但是本次调用复制了数据到缓冲区，调用tcp_push将其发送出去 */\n				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);\n\n			/* 等待内存可用 */\n			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)\n				goto do_error;/* 确实没有内存了，超时后返回失败 */\n\n			/* 睡眠后，MSS可能发生了变化，重新计算 */\n			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));\n		}\n	}\n\nout:\n	if (copied)/* 从用户态复制了数据，发送它 */\n		tcp_push(sk, tp, flags, mss_now, tp->nonagle);\n	TCP_CHECK_TIMER(sk);\n	release_sock(sk);/* 释放锁以后返回 */\n	return copied;\n\ndo_fault:\n	if (!skb->len) {/* 复制数据失败了，如果skb长度为0，说明是新分配的，释放它 */\n		if (sk->sk_send_head == skb)/* 如果skb是发送队列头，则清空队列头 */\n			sk->sk_send_head = NULL;\n		__skb_unlink(skb, skb->list);\n		sk_stream_free_skb(sk, skb);/* 释放skb */\n	}\n\ndo_error:\n	if (copied)\n		goto out;\nout_err:\n	err = sk_stream_error(sk, flags, err);\n	TCP_CHECK_TIMER(sk);\n	release_sock(sk);\n	return err;\n}\n\n/*\n *	Handle reading urgent data. BSD has very simple semantics for\n *	this, no blocking and very strange errors 8)\n */\n/* 将保存在传输控制块中的带外数据读取到用户空间中，当用户通过recv调用读取带外数据时使用 */\nstatic int tcp_recv_urg(struct sock *sk, long timeo,\n			struct msghdr *msg, int len, int flags,\n			int *addr_len)\n{\n	struct tcp_sock *tp = tcp_sk(sk);\n\n	/* No URG data to read. */\n	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||/* 将URG读到正常数据流，或者没有带外数据 */\n	    tp->urg_data == TCP_URG_READ)/* 带外数据已经被读取 */\n		return -EINVAL;	/* Yes this is right ! */\n\n	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))/* 还没有连接，不能读取带外数据 */\n		return -ENOTCONN;\n\n	if (tp->urg_data & TCP_URG_VALID) {/* 带外数据有效 */\n		int err = 0;\n		char c = tp->urg_data;\n\n		if (!(flags & MSG_PEEK))/* 如果不是查看，则设置READ标志表示数据已经被读走 */\n			tp->urg_data = TCP_URG_READ;\n\n		/* Read urgent data. */\n		msg->msg_flags |= MSG_OOB;/* 向用户返回标志，表示读取了带外数据 */\n\n		if (len > 0) {/* 用户指定了接收缓冲区 */\n			if (!(flags & MSG_TRUNC))/* 用户不是想清除带外数据 */\n				err = memcpy_toiovec(msg->msg_iov, &c, 1);/* 将带外数据复制到用户缓冲区 */\n			len = 1;\n		} else\n			msg->msg_flags |= MSG_TRUNC;/* 返回此标志表示缓冲区不足，数据被截断 */\n\n		return err ? -EFAULT : len;\n	}\n\n	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))/* 此时返回0表示没有读到数据 */\n		return 0;\n\n	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and\n	 * the available implementations agree in this case:\n	 * this call should never block, independent of the\n	 * blocking state of the socket.\n	 * Mike <pall@rz.uni-karlsruhe.de>\n	 */\n	/* 会运行到这里吗? */\n	return -EAGAIN;\n}\n\n/* Clean up the receive buffer for full frames taken by the user,\n * then send an ACK if necessary.  COPIED is the number of bytes\n * tcp_recvmsg has given to the user so far, it speeds up the\n * calculation of whether or not we must ACK for the sake of\n * a window update.\n */\n/* 将接收队列中的数据复制到用户空间后，为满负荷的段清理接收缓冲区，然后根据需要确定是否发送ACK段 */\nstatic void cleanup_rbuf(struct sock *sk, int copied)\n{\n	struct tcp_sock *tp = tcp_sk(sk);\n	int time_to_ack = 0;\n\n#if TCP_DEBUG\n	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);\n\n	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));\n#endif\n\n	/* 确定是否需要立即发送ACK给对方 */\n	if (tcp_ack_scheduled(tp)) {\n		   /* Delayed ACKs frequently hit locked sockets during bulk\n		    * receive. */\n		if (tp->ack.blocked ||/* 由于锁被占用，因此ACK被延迟发送 */\n		    /* Once-per-two-segments ACK was not sent by tcp_input.c */\n		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||/* 有一个以上的全尺寸段还没有给对方确认 */\n		    /*\n		     * If this read emptied read buffer, we send ACK, if\n		     * connection is not bidirectional, user drained\n		     * receive buffer and there was a small segment\n		     * in queue.\n		     */\n		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&/* 发送到用户空间的数据量大于0，并且发送紧急程度为TCP_ACK_PUSHED */\n		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))\n			time_to_ack = 1;\n	}\n\n	/* We send an ACK if we can now advertise a non-zero window\n	 * which has been raised \"significantly\".\n	 *\n	 * Even if window raised up to infinity, do not send window open ACK\n	 * in states, where we will not receive more. It is useless.\n	 */\n	/* 其他情况下，如果还需要接收报文则继续判断 */\n	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {\n		__u32 rcv_window_now = tcp_receive_window(tp);\n\n		/* Optimize, __tcp_select_window() is not cheap. */\n		if (2*rcv_window_now <= tp->window_clamp) {/* 当前接收窗口小于接收窗口上限的一半 */\n			__u32 new_window = __tcp_select_window(sk);\n\n			/* Send ACK now, if this read freed lots of space\n			 * in our buffer. Certainly, new_window is new window.\n			 * We can advertise it now, if it is not less than current one.\n			 * \"Lots\" means \"at least twice\" here.\n			 */\n			/* ??? */\n			if (new_window && new_window >= 2 * rcv_window_now)\n				time_to_ack = 1;\n		}\n	}\n	if (time_to_ack)\n		tcp_send_ack(sk);\n}\n\nstatic void tcp_prequeue_process(struct sock *sk)\n{\n	struct sk_buff *skb;\n	struct tcp_sock *tp = tcp_sk(sk);\n\n	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));\n\n	/* RX process wants to run with disabled BHs, though it is not\n	 * necessary */\n	local_bh_disable();\n	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)\n		sk->sk_backlog_rcv(sk, skb);\n	local_bh_enable();\n\n	/* Clear memory counter. */\n	tp->ucopy.memory = 0;\n}\n\nstatic inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)\n{\n	struct sk_buff *skb;\n	u32 offset;\n\n	skb_queue_walk(&sk->sk_receive_queue, skb) {\n		offset = seq - TCP_SKB_CB(skb)->seq;\n		if (skb->h.th->syn)\n			offset--;\n		if (offset < skb->len || skb->h.th->fin) {\n			*off = offset;\n			return skb;\n		}\n	}\n	return NULL;\n}\n\n/*\n * This routine provides an alternative to tcp_recvmsg() for routines\n * that would like to handle copying from skbuffs directly in 'sendfile'\n * fashion.\n * Note:\n *	- It is assumed that the socket was locked by the caller.\n *	- The routine does not block.\n *	- At present, there is no support for reading OOB data\n *	  or for 'peeking' the socket using this routine\n *	  (although both would be easy to implement).\n */\nint tcp_read_sock(struct sock *sk, read_descriptor_t *desc,\n		  sk_read_actor_t recv_actor)\n{\n	struct sk_buff *skb;\n	struct tcp_sock *tp = tcp_sk(sk);\n	u32 seq = tp->copied_seq;\n	u32 offset;\n	int copied = 0;\n\n	if (sk->sk_state == TCP_LISTEN)\n		return -ENOTCONN;\n	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {\n		if (offset < skb->len) {\n			size_t used, len;\n\n			len = skb->len - offset;\n			/* Stop reading if we hit a patch of urgent data */\n			if (tp->urg_data) {\n				u32 urg_offset = tp->urg_seq - seq;\n				if (urg_offset < len)\n					len = urg_offset;\n				if (!len)\n					break;\n			}\n			used = recv_actor(desc, skb, offset, len);\n			if (used <= len) {\n				seq += used;\n				copied += used;\n				offset += used;\n			}\n			if (offset != skb->len)\n				break;\n		}\n		if (skb->h.th->fin) {\n			sk_eat_skb(sk, skb);\n			++seq;\n			break;\n		}\n		sk_eat_skb(sk, skb);\n		if (!desc->count)\n			break;\n	}\n	tp->copied_seq = seq;\n\n	tcp_rcv_space_adjust(sk);\n\n	/* Clean up data we have read: This will do ACK frames. */\n	if (copied)\n		cleanup_rbuf(sk, copied);\n	return copied;\n}\n\n/*\n *	This routine copies from a sock struct into the user buffer.\n *\n *	Technical note: in 2.3 we work on _locked_ socket, so that\n *	tricks with *seq access order and skb->users are not required.\n *	Probably, code can be easily improved even more.\n */\n\n/* recvmsg系统调用的传输层实现 */\nint tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,\n		size_t len, int nonblock, int flags, int *addr_len)\n{\n	struct tcp_sock *tp = tcp_sk(sk);\n	int copied = 0;\n	u32 peek_seq;\n	u32 *seq;\n	unsigned long used;\n	int err;\n	int target;		/* Read at least this many bytes */\n	long timeo;\n	struct task_struct *user_recv = NULL;\n\n	lock_sock(sk);/* 首先获取套接口的锁 */\n\n	TCP_CHECK_TIMER(sk);\n\n	err = -ENOTCONN;\n	if (sk->sk_state == TCP_LISTEN)/* LISTEN状态的套接口是不能读取数据的 */\n		goto out;\n\n	timeo = sock_rcvtimeo(sk, nonblock);/* 计算超时时间 */\n\n	/* Urgent data needs to be handled specially. */\n	if (flags & MSG_OOB)/* 带外数据的处理比较复杂，特殊处理 */\n		goto recv_urg;\n\n	seq = &tp->copied_seq;/* 默认情况下，是将报文从内核态读到用户态，需要更新copied_seq */\n	if (flags & MSG_PEEK) {/* 如果只查看而不取走数据，则不能更新copied_seq，后面就只更新临时变量peek_seq了 */\n		peek_seq = tp->copied_seq;\n		seq = &peek_seq;\n	}\n\n	/* target是本次复制数据的长度，如果指定了MSG_WAITALL，就需要读取用户指定长度的数据，否则可以只读取部分数据 */\n	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);\n\n	do {\n		struct sk_buff *skb;\n		u32 offset;\n\n		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */\n		if (tp->urg_data && tp->urg_seq == *seq) {/* 当前遇到了带外数据 */\n			if (copied)/* 如果已经复制了部分数据到用户态，则退出 */\n				break;\n			if (signal_pending(current)) {/* 如果接收到信号，也退出 */\n				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;\n				break;\n			}\n		}\n\n		/* Next get a buffer. */\n\n		skb = skb_peek(&sk->sk_receive_queue);/* 获取下一个待读取的段 */\n		do {\n			if (!skb)/* 接收队列为空，退出，处理prequeue队列和后备队列 */\n				break;\n\n			/* Now that we have two receive queues this\n			 * shouldn't happen.\n			 */\n			if (before(*seq, TCP_SKB_CB(skb)->seq)) {/* 下一个段不是预期读取的段，只能退出处理prequeue队列和后备队列，实际上这不可能发生 */\n				printk(KERN_INFO \"recvmsg bug: copied %X \"\n				       \"seq %X\\n\", *seq, TCP_SKB_CB(skb)->seq);\n				break;\n			}\n			/* 计算我们应当在该段的何处开始复制数据，因为上次recv调用可能已经读取了部分数据 */\n			offset = *seq - TCP_SKB_CB(skb)->seq;\n			if (skb->h.th->syn)/* syn标志占用一个序号，因此需要调整偏移 */\n				offset--;\n			if (offset < skb->len)/* 偏移还在段范围内，说明当前段是有效的，从该段中读取数据 */\n				goto found_ok_skb;\n			if (skb->h.th->fin)/* 该段的数据已经读取完毕，如果fin标志，那么不能继续处理后面的数据了 */\n				goto found_fin_ok;\n			BUG_TRAP(flags & MSG_PEEK);\n			skb = skb->next;\n		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);\n\n		/* Well, if we have backlog, try to process it now yet. */\n\n		/* 数据已经读完，并且后备队列为空，直接退出 */\n		if (copied >= target && !sk->sk_backlog.tail)\n			break;\n\n		if (copied) {/* 复制了部分数据，检查是否有退出事件需要处理 */\n			if (sk->sk_err ||/* SOCK发生了错误 */\n			    sk->sk_state == TCP_CLOSE ||/* 套口已经关闭 */\n			    (sk->sk_shutdown & RCV_SHUTDOWN) ||/* 停止接收 */\n			    !timeo ||/* 超时时间到 */\n			    signal_pending(current) ||/* 接收到信号 */\n			    (flags & MSG_PEEK))/* 仅仅查看数据 */\n				break;/* 这些事件都导致接收过程退出 */\n		} else {\n			if (sock_flag(sk, SOCK_DONE))/* TCP会话已经结束，收到了FIN报文 */\n				break;\n\n			if (sk->sk_err) {/* 有错误发生，退出 */\n				copied = sock_error(sk);\n				break;\n			}\n\n			if (sk->sk_shutdown & RCV_SHUTDOWN)/* 停止接收 */\n				break;\n\n			if (sk->sk_state == TCP_CLOSE) {\n				if (!sock_flag(sk, SOCK_DONE)) {/* 可能是连接还没有建立 */\n					/* This occurs when user tries to read\n					 * from never connected socket.\n					 */\n					copied = -ENOTCONN;\n					break;\n				}\n				break;\n			}\n\n			if (!timeo) {/* 非阻塞读，退出 */\n				copied = -EAGAIN;\n				break;\n			}\n\n			if (signal_pending(current)) {/* 接收到信号 */\n				copied = sock_intr_errno(timeo);\n				break;\n			}\n		}\n\n		/* 检测是否有确认需要发送 */\n		cleanup_rbuf(sk, copied);\n\n		if (tp->ucopy.task == user_recv) {\n			/* Install new reader */\n			/* 第一次检测处理 */\n			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {\n				user_recv = current;\n				tp->ucopy.task = user_recv;\n				tp->ucopy.iov = msg->msg_iov;\n			}\n\n			tp->ucopy.len = len;/* 更新可使用的用户态缓存大小 */\n\n			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||\n				 (flags & (MSG_PEEK | MSG_TRUNC)));\n\n			/* Ugly... If prequeue is not empty, we have to\n			 * process it before releasing socket, otherwise\n			 * order will be broken at second iteration.\n			 * More elegant solution is required!!!\n			 *\n			 * Look: we have the following (pseudo)queues:\n			 *\n			 * 1. packets in flight\n			 * 2. backlog\n			 * 3. prequeue\n			 * 4. receive_queue\n			 *\n			 * Each queue can be processed only if the next ones\n			 * are empty. At this point we have empty receive_queue.\n			 * But prequeue _can_ be not empty after 2nd iteration,\n			 * when we jumped to start of loop because backlog\n			 * processing added something to receive_queue.\n			 * We cannot release_sock(), because backlog contains\n			 * packets arrived _after_ prequeued ones.\n			 *\n			 * Shortly, algorithm is clear --- to process all\n			 * the queues in order. We could make it more directly,\n			 * requeueing packets from backlog to prequeue, if\n			 * is not empty. It is more elegant, but eats cycles,\n			 * unfortunately.\n			 */\n			/* 如果prequeue不为空，则处理prequeue队列 */\n			if (skb_queue_len(&tp->ucopy.prequeue))\n				goto do_prequeue;\n\n			/* __ Set realtime policy in scheduler __ */\n		}\n\n		if (copied >= target) {/* 数据读取完毕 */\n			/* Do not sleep, just process backlog. */\n			release_sock(sk);/* 释放锁，主要是处理后备队列 */\n			lock_sock(sk);/* 再次获取锁 */\n		} else\n			sk_wait_data(sk, &timeo);/* 等待新数据到来，或者超时。在此期间软中断可能复制数据到用户态 */\n\n		if (user_recv) {\n			int chunk;\n\n			/* __ Restore normal policy in scheduler __ */\n\n			/* 睡眠期间，复制了数据到用户态 */\n			if ((chunk = len - tp->ucopy.len) != 0) {\n				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);\n				len -= chunk;/* 更新计数 */\n				copied += chunk;\n			}\n\n			if (tp->rcv_nxt == tp->copied_seq &&/* 接收队列中的数据已经全部复制到用户态 */\n			    skb_queue_len(&tp->ucopy.prequeue)) {/* prequeue还有数据 */\ndo_prequeue:\n				tcp_prequeue_process(sk);/* 处理prequeue队列 */\n\n				if ((chunk = len - tp->ucopy.len) != 0) {/* 从prequeue队列复制了数据到用户态 */\n					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);\n					len -= chunk;/* 更新计数 */\n					copied += chunk;\n				}\n			}\n		}\n		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {\n			if (net_ratelimit())\n				printk(KERN_DEBUG \"TCP(%s:%d): Application bug, race in MSG_PEEK.\\n\",\n				       current->comm, current->pid);\n			peek_seq = tp->copied_seq;\n		}\n		continue;/* 继续处理待读取的段 */\n\n	found_ok_skb:\n		/* Ok so how much can we use? */\n		used = skb->len - offset;/* 本段中可以读取的长度 */\n		if (len < used)/* 如果可读的长度较长，则只读取用户期望读取的长度 */\n			used = len;\n\n		/* Do we have urgent data here? */\n		if (tp->urg_data) {/* 有带外数据 */\n			u32 urg_offset = tp->urg_seq - *seq;\n			if (urg_offset < used) {/* 带外数据在可读数据内，表示带外数据有效 */\n				if (!urg_offset) {/* 偏移为0，表示当前要读的位置正好是带外数据 */\n					if (!sock_flag(sk, SOCK_URGINLINE)) {/* 带外数据不放入数据流 */\n						++*seq;/* 调整读取位置 */\n						offset++;\n						used--;\n						if (!used)/* 调整后可读数据为0，说明没有数据可读，跳过 */\n							goto skip_copy;\n					}\n				} else/* 当前位置不是带外数据，则调整位置，只读到带外数据处 */\n					used = urg_offset;\n			}\n		}\n\n		if (!(flags & MSG_TRUNC)) {/* 不是截断数据，表示要将数据复制到用户态 */\n			err = skb_copy_datagram_iovec(skb, offset,\n						      msg->msg_iov, used);/* 将数据复制到用户态 */\n			if (err) {\n				/* Exception. Bailout! */\n				if (!copied)\n					copied = -EFAULT;\n				break;\n			}\n		}\n\n		/* 调整一些参数 */\n		*seq += used;\n		copied += used;\n		len -= used;\n\n		/* 调整合理的TCP接收缓冲区大小 */\n		tcp_rcv_space_adjust(sk);\n\nskip_copy:\n		/* 如果完成了带外数据的处理，则清除标志，设置首部预测标志 */\n		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {\n			tp->urg_data = 0;\n			tcp_fast_path_check(sk, tp);\n		}\n		/* 还有数据没有复制到用户态，就不能删除这个段 */\n		if (used + offset < skb->len)\n			continue;\n\n		if (skb->h.th->fin)/* 处理完该段，检测FIN标志 */\n			goto found_fin_ok;\n		if (!(flags & MSG_PEEK))/* 如果是读取而不是查看报文，并且处理完本段报文，则删除它 */\n			sk_eat_skb(sk, skb);\n		continue;/* 继续处理下一个段 */\n\n	found_fin_ok:\n		/* Process the FIN. */\n		++*seq;/* FIN占用一个序号，因此递增序号 */\n		if (!(flags & MSG_PEEK))/* 不是查看数据，将其从队列中删除 */\n			sk_eat_skb(sk, skb);\n		break;/* 收到FIN，不需要继续处理后续的段，退出 */\n	} while (len > 0);\n\n	if (user_recv) {\n		if (skb_queue_len(&tp->ucopy.prequeue)) {/* prequeue队列不为空 */\n			int chunk;\n\n			tp->ucopy.len = copied > 0 ? len : 0;\n\n			tcp_prequeue_process(sk);/* 处理prequeue队列 */\n\n			/* 在处理prequeue的过程中，有数据复制到用户态 */\n			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {\n				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);\n				len -= chunk;\n				copied += chunk;\n			}\n		}\n\n		/* 清除task和len，表示用户当前没有读取数据。这样处理prequeue队列时就不会向用户态复制了 */\n		tp->ucopy.task = NULL;\n		tp->ucopy.len = 0;\n	}\n\n	/* According to UNIX98, msg_name/msg_namelen are ignored\n	 * on connected socket. I was just happy when found this 8) --ANK\n	 */\n\n	/* Clean up data we have read: This will do ACK frames. */\n	/* 再次检查是否立即发送ACK */\n	cleanup_rbuf(sk, copied);\n\n	TCP_CHECK_TIMER(sk);\n	release_sock(sk);/* 解锁传输控制块 */\n	return copied;/* 返回复制的字节数 */\n\nout:/* 接收过程中，如果发生错误，则解锁后返回 */\n	TCP_CHECK_TIMER(sk);\n	release_sock(sk);\n	return err;\n\nrecv_urg:\n	/* 调用tcp_recv_urg处理带外数据  */\n	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);\n	goto out;\n}\n\n/*\n *	State processing on a close. This implements the state shift for\n *	sending our FIN frame. Note that we only send a FIN for some\n *	states. A shutdown() may have already sent the FIN, or we may be\n *	closed.\n */\n\nstatic unsigned char new_state[16] = {\n  /* current state:        new state:      action:	*/\n  /* (Invalid)		*/ TCP_CLOSE,\n  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,\n  /* TCP_SYN_SENT	*/ TCP_CLOSE,\n  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,\n  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,\n  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,\n  /* TCP_TIME_WAIT	*/ TCP_CLOSE,\n  /* TCP_CLOSE		*/ TCP_CLOSE,\n  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,\n  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,\n  /* TCP_LISTEN		*/ TCP_CLOSE,\n  /* TCP_CLOSING	*/ TCP_CLOSING,\n};\n\nstatic int tcp_close_state(struct sock *sk)\n{\n	int next = (int)new_state[sk->sk_state];\n	int ns = next & TCP_STATE_MASK;\n\n	tcp_set_state(sk, ns);\n\n	return next & TCP_ACTION_FIN;\n}\n\n/*\n *	Shutdown the sending side of a connection. Much like close except\n *	that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).\n */\n/* shutdown系统调用的传输层实现 */\nvoid tcp_shutdown(struct sock *sk, int how)\n{\n	/*	We need to grab some memory, and put together a FIN,\n	 *	and then put it into the queue to be sent.\n	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.\n	 */\n	if (!(how & SEND_SHUTDOWN))/* 参数错误，返回 */\n		return;\n\n	/* If we've already sent a FIN, or it's a closed state, skip this. */\n	if ((1 << sk->sk_state) &\n	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |\n	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {/* 这些状态没有发送FIN */\n		/* Clear out any half completed packets.  FIN if needed. */\n		if (tcp_close_state(sk))/* 如果需要发送FIN，则发送 */\n			tcp_send_fin(sk);\n	}\n}\n\n/*\n * At this point, there should be no process reference to this\n * socket, and thus no user references at all.  Therefore we\n * can assume the socket waitqueue is inactive and nobody will\n * try to jump onto it.\n */\nvoid tcp_destroy_sock(struct sock *sk)\n{\n	BUG_TRAP(sk->sk_state == TCP_CLOSE);\n	BUG_TRAP(sock_flag(sk, SOCK_DEAD));\n\n	/* It cannot be in hash table! */\n	BUG_TRAP(sk_unhashed(sk));\n\n	/* If it has not 0 inet_sk(sk)->num, it must be bound */\n	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);\n\n	sk->sk_prot->destroy(sk);\n\n	sk_stream_kill_queues(sk);\n\n	xfrm_sk_free_policy(sk);\n\n#ifdef INET_REFCNT_DEBUG\n	if (atomic_read(&sk->sk_refcnt) != 1) {\n		printk(KERN_DEBUG \"Destruction TCP %p delayed, c=%d\\n\",\n		       sk, atomic_read(&sk->sk_refcnt));\n	}\n#endif\n\n	atomic_dec(&tcp_orphan_count);\n	sock_put(sk);\n}\n\n/* close系统调用的传输层实现 */\nvoid tcp_close(struct sock *sk, long timeout)\n{\n	struct sk_buff *skb;\n	int data_was_unread = 0;\n\n	lock_sock(sk);/* 获取套接口锁 */\n	sk->sk_shutdown = SHUTDOWN_MASK;/* 表示两个方向的上的关闭 */\n\n	if (sk->sk_state == TCP_LISTEN) {/* LISTEN状态 */\n		tcp_set_state(sk, TCP_CLOSE);/* 设置其状态为CLOSE */\n\n		/* Special case. */\n		tcp_listen_stop(sk);/* 终止侦听 */\n\n		goto adjudge_to_death;\n	}\n\n	/*  We need to flush the recv. buffs.  We do this only on the\n	 *  descriptor close, not protocol-sourced closes, because the\n	 *  reader process may not have drained the data yet!\n	 */\n	/* 遍历接收队列中的段 */\n	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {\n		/* 段中数据长度，如果是fin段，则减少一个字节长度，因为fin占用一个序号 */\n		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -\n			  skb->h.th->fin;\n		/* 未读取的段长度 */\n		data_was_unread += len;\n		/* 释放段 */\n		__kfree_skb(skb);\n	}\n\n	/* 释放套接口占用的缓存 */\n	sk_stream_mem_reclaim(sk);\n\n	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section\n	 * 3.10, we send a RST here because data was lost.  To\n	 * witness the awful effects of the old behavior of always\n	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start\n	 * a bulk GET in an FTP client, suspend the process, wait\n	 * for the client to advertise a zero window, then kill -9\n	 * the FTP client, wheee...  Note: timeout is always zero\n	 * in such a case.\n	 */\n	if (data_was_unread) {/* 有未读数据 */\n		/* Unread data was tossed, zap the connection. */\n		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);\n		tcp_set_state(sk, TCP_CLOSE);\n		/* 发送RST表示非正常的结束，不能发送FIN */\n		tcp_send_active_reset(sk, GFP_KERNEL);\n	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {/* 虽然设置了SOCK_LINGER选项，但是延时时间为0 */\n		/* Check zero linger _after_ checking for unread data. */\n		/* 调用disconnect断开、删除并释放已建立连接但是未被accept的传输控制块，同时删除并释放已经接收到接收队列和失序队列上的段和发送队列上的段 */\n		sk->sk_prot->disconnect(sk, 0);\n		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);\n	} else if (tcp_close_state(sk)) {/* 其他情况，包括没有设置SOCK_LINGER或者启用了SOCK_LINGER且延时时间不为0，转换当前状态到对应的状态，如果新状态需要发送FIN */\n		/* We FIN if the application ate all the data before\n		 * zapping the connection.\n		 */\n\n		/* RED-PEN. Formally speaking, we have broken TCP state\n		 * machine. State transitions:\n		 *\n		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1\n		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)\n		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK\n		 *\n		 * are legal only when FIN has been sent (i.e. in window),\n		 * rather than queued out of window. Purists blame.\n		 *\n		 * F.e. \"RFC state\" is ESTABLISHED,\n		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.\n		 *\n		 * The visible declinations are that sometimes\n		 * we enter time-wait state, when it is not required really\n		 * (harmless), do not send active resets, when they are\n		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when\n		 * they look as CLOSING or LAST_ACK for Linux)\n		 * Probably, I missed some more holelets.\n		 * 						--ANK\n		 */\n		tcp_send_fin(sk);/* 发送FIN段，将发送队列上未发送的段发送出去 */\n	}\n\n	/* 在给对端发送RST或FIN段后，等待套接口的关闭，直到TCP状态为FIN_WAIT_1、CLOSING、LAST_ACK或等待超时 */\n	sk_stream_wait_close(sk, timeout);\n\nadjudge_to_death:\n	/* It is the last release_sock in its life. It will remove backlog. */\n	release_sock(sk);/* 释放锁的目的，是为了处理后备队列，新版本将其移动到后面，这里应当有一个BUG */\n\n\n	/* Now socket is owned by kernel and we acquire BH lock\n	   to finish close. No need to check for user refs.\n	 */\n	local_bh_disable();/* 关闭下半部并获得锁 */\n	bh_lock_sock(sk);\n	BUG_TRAP(!sock_owned_by_user(sk));\n\n	sock_hold(sk);\n	sock_orphan(sk);/* 设置套接口为DEAD状态，成为孤儿套接口，同时更新系统中孤儿套接口数 */\n\n	/*	This is a (useful) BSD violating of the RFC. There is a\n	 *	problem with TCP as specified in that the other end could\n	 *	keep a socket open forever with no application left this end.\n	 *	We use a 3 minute timeout (about the same as BSD) then kill\n	 *	our end. If they send after that then tough - BUT: long enough\n	 *	that we won't make the old 4*rto = almost no time - whoops\n	 *	reset mistake.\n	 *\n	 *	Nope, it was not mistake. It is really desired behaviour\n	 *	f.e. on http servers, when such sockets are useless, but\n	 *	consume significant resources. Let's do it with special\n	 *	linger2	option.					--ANK\n	 */\n\n	if (sk->sk_state == TCP_FIN_WAIT2) {/* 当前状态为TCP_FIN_WAIT2 */\n		struct tcp_sock *tp = tcp_sk(sk);\n		if (tp->linger2 < 0) {/* 该值小于0，表示可以从TCP_FIN_WAIT2状态直接转换为TCP_CLOSE状态 */\n			/* 设置为CLOSE状态 */\n			tcp_set_state(sk, TCP_CLOSE);\n			/* 向对方发送RST段 */\n			tcp_send_active_reset(sk, GFP_ATOMIC);\n			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);\n		} else {/* 需要等待才进入CLOSE状态 */\n			int tmo = tcp_fin_time(tp);/* 保持TCP_FIN_WAIT2状态的时间 */\n\n			if (tmo > TCP_TIMEWAIT_LEN) {/* 超过60s */\n				/* 通过FIN_WAIT_2定时器来处理状态转换 */\n				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));\n			} else {\n				atomic_inc(&tcp_orphan_count);\n				/* 小于60s，则等待，直到状态转换成功 */\n				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);\n				goto out;\n			}\n		}\n	}\n	if (sk->sk_state != TCP_CLOSE) {/* 此时不处于CLOSE状态 */\n		sk_stream_mem_reclaim(sk);/* 释放内存缓存 */\n		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||/* 孤儿套接口数量太多 */\n		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&/* 发送队列中的段数量大于下限 */\n		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {/* 且系统中总的TCP传输层缓冲区分配的内存超过缓存区大小的最高硬性限制 */\n			if (net_ratelimit())\n				printk(KERN_INFO \"TCP: too many of orphaned \"\n				       \"sockets\\n\");\n			/* 这种情况下需要立即关闭套接口，设置其状态为CLOSE */\n			tcp_set_state(sk, TCP_CLOSE);\n			/* 向对方发送RST状态 */\n			tcp_send_active_reset(sk, GFP_ATOMIC);\n			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);\n		}\n	}\n	/* 增加孤儿套接口数量，我觉得这里有点不妥 */\n	atomic_inc(&tcp_orphan_count);\n\n	if (sk->sk_state == TCP_CLOSE)/* 如果状态为CLOSE，则可以释放传输块及其占用的资源 */\n		tcp_destroy_sock(sk);\n	/* Otherwise, socket is reprieved until protocol close. */\n\nout:\n	bh_unlock_sock(sk);\n	local_bh_enable();\n	sock_put(sk);\n}\n\n/* These states need RST on ABORT according to RFC793 */\n\nstatic inline int tcp_need_reset(int state)\n{\n	return (1 << state) &\n	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |\n		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);\n}\n\nint tcp_disconnect(struct sock *sk, int flags)\n{\n	struct inet_sock *inet = inet_sk(sk);\n	struct tcp_sock *tp = tcp_sk(sk);\n	int err = 0;\n	int old_state = sk->sk_state;\n\n	if (old_state != TCP_CLOSE)\n		tcp_set_state(sk, TCP_CLOSE);\n\n	/* ABORT function of RFC793 */\n	if (old_state == TCP_LISTEN) {\n		tcp_listen_stop(sk);\n	} else if (tcp_need_reset(old_state) ||\n		   (tp->snd_nxt != tp->write_seq &&\n		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {\n		/* The last check adjusts for discrepance of Linux wrt. RFC\n		 * states\n		 */\n		tcp_send_active_reset(sk, gfp_any());\n		sk->sk_err = ECONNRESET;\n	} else if (old_state == TCP_SYN_SENT)\n		sk->sk_err = ECONNRESET;\n\n	tcp_clear_xmit_timers(sk);\n	__skb_queue_purge(&sk->sk_receive_queue);\n	sk_stream_writequeue_purge(sk);\n	__skb_queue_purge(&tp->out_of_order_queue);\n\n	inet->dport = 0;\n\n	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))\n		inet_reset_saddr(sk);\n\n	sk->sk_shutdown = 0;\n	sock_reset_flag(sk, SOCK_DONE);\n	tp->srtt = 0;\n	if ((tp->write_seq += tp->max_window + 2) == 0)\n		tp->write_seq = 1;\n	tp->backoff = 0;\n	tp->snd_cwnd = 2;\n	tp->probes_out = 0;\n	tp->packets_out = 0;\n	tp->snd_ssthresh = 0x7fffffff;\n	tp->snd_cwnd_cnt = 0;\n	tcp_set_ca_state(tp, TCP_CA_Open);\n	tcp_clear_retrans(tp);\n	tcp_delack_init(tp);\n	sk->sk_send_head = NULL;\n	tp->rx_opt.saw_tstamp = 0;\n	tcp_sack_reset(&tp->rx_opt);\n	__sk_dst_reset(sk);\n\n	BUG_TRAP(!inet->num || tp->bind_hash);\n\n	sk->sk_error_report(sk);\n	return err;\n}\n\n/*\n *	Wait for an incoming connection, avoid race\n *	conditions. This must be called with the socket locked.\n */\nstatic int wait_for_connect(struct sock *sk, long timeo)\n{\n	struct tcp_sock *tp = tcp_sk(sk);\n	DEFINE_WAIT(wait);\n	int err;\n\n	/*\n	 * True wake-one mechanism for incoming connections: only\n	 * one process gets woken up, not the 'whole herd'.\n	 * Since we do not 'race & poll' for established sockets\n	 * anymore, the common case will execute the loop only once.\n	 *\n	 * Subtle issue: \"add_wait_queue_exclusive()\" will be added\n	 * after any current non-exclusive waiters, and we know that\n	 * it will always _stay_ after any new non-exclusive waiters\n	 * because all non-exclusive waiters are added at the\n	 * beginning of the wait-queue. As such, it's ok to \"drop\"\n	 * our exclusiveness temporarily when we get woken up without\n	 * having to remove and re-insert us on the wait queue.\n	 */\n	for (;;) {\n		prepare_to_wait_exclusive(sk->sk_sleep, &wait,\n					  TASK_INTERRUPTIBLE);\n		release_sock(sk);\n		if (!tp->accept_queue)\n			timeo = schedule_timeout(timeo);\n		lock_sock(sk);\n		err = 0;\n		if (tp->accept_queue)\n			break;\n		err = -EINVAL;\n		if (sk->sk_state != TCP_LISTEN)\n			break;\n		err = sock_intr_errno(timeo);\n		if (signal_pending(current))\n			break;\n		err = -EAGAIN;\n		if (!timeo)\n			break;\n	}\n	finish_wait(sk->sk_sleep, &wait);\n	return err;\n}\n\n/*\n *	This will accept the next outstanding connection.\n */\n/* accept调用的传输层实现 */\nstruct sock *tcp_accept(struct sock *sk, int flags, int *err)\n{\n	struct tcp_sock *tp = tcp_sk(sk);\n	struct open_request *req;\n	struct sock *newsk;\n	int error;\n\n	lock_sock(sk);\n\n	/* We need to make sure that this socket is listening,\n	 * and that it has something pending.\n	 */\n	error = -EINVAL;\n	if (sk->sk_state != TCP_LISTEN)/* 本调用仅仅针对侦听套口，不是此状态的套口则退出 */\n		goto out;\n\n	/* Find already established connection */\n	if (!tp->accept_queue) {/* accept队列为空，说明还没有收到新连接 */\n		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);/* 如果套口是非阻塞的，或者在一定时间内没有新连接，则返回 */\n\n		/* If this is a non blocking socket don't sleep */\n		error = -EAGAIN;\n		if (!timeo)/* 超时时间到，没有新连接，退出 */\n			goto out;\n\n		/* 运行到这里，说明有新连接到来，则等待新的传输控制块 */\n		error = wait_for_connect(sk, timeo);\n		if (error)\n			goto out;\n	}\n\n	req = tp->accept_queue;\n	if ((tp->accept_queue = req->dl_next) == NULL)\n		tp->accept_queue_tail = NULL;\n\n 	newsk = req->sk;\n	sk_acceptq_removed(sk);\n	tcp_openreq_fastfree(req);\n	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);\n	release_sock(sk);\n	return newsk;\n\nout:\n	release_sock(sk);\n	*err = error;\n	return NULL;\n}\n\n/*\n *	Socket option code for TCP.\n */\n/* 设置tcp选项 */\nint tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,\n		   int optlen)\n{\n	struct tcp_sock *tp = tcp_sk(sk);\n	int val;\n	int err = 0;\n\n	if (level != SOL_TCP)/* 如果不是TCP级别的选项，就调用接口处理ip层的选项 */\n		return tp->af_specific->setsockopt(sk, level, optname,\n						   optval, optlen);\n\n	if (optlen < sizeof(int))\n		return -EINVAL;\n\n	if (get_user(val, (int __user *)optval))\n		return -EFAULT;\n\n	lock_sock(sk);/* 获得连接锁后设置其选项 */\n\n	switch (optname) {\n	case TCP_MAXSEG:/* 设置应用层的MSS上限 */\n		/* Values greater than interface MTU won't take effect. However\n		 * at the point when this call is done we typically don't yet\n		 * know which interface is going to be used */\n		if (val < 8 || val > MAX_TCP_WINDOW) {/* 有效的MSS值在8到32767之间 */\n			err = -EINVAL;\n			break;\n		}\n		tp->rx_opt.user_mss = val;/* 设置连接的用户层MSS */\n		break;\n\n	case TCP_NODELAY:/* 禁止或者启用套接口上的Nagle算法 */\n		if (val) {\n			/* TCP_NODELAY is weaker than TCP_CORK, so that\n			 * this option on corked socket is remembered, but\n			 * it is not activated until cork is cleared.\n			 *\n			 * However, when TCP_NODELAY is set we make\n			 * an explicit push, which overrides even TCP_CORK\n			 * for currently queued segments.\n			 */\n			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;/* 禁用Nagle算法 */\n			tcp_push_pending_frames(sk, tp);/* 将连接中的待发送数据发出去 */\n		} else {\n			tp->nonagle &= ~TCP_NAGLE_OFF;\n		}\n		break;\n\n	case TCP_CORK:/* 使能此选项后，会对Nagle进行优化，200ms内发送的数据会被组合成大的报文 */\n		/* When set indicates to always queue non-full frames.\n		 * Later the user clears this option and we transmit\n		 * any pending partial frames in the queue.  This is\n		 * meant to be used alongside sendfile() to get properly\n		 * filled frames when the user (for example) must write\n		 * out headers with a write() call first and then use\n		 * sendfile to send out the data parts.\n		 *\n		 * TCP_CORK can be set together with TCP_NODELAY and it is\n		 * stronger than TCP_NODELAY.\n		 */\n		if (val) {\n			tp->nonagle |= TCP_NAGLE_CORK;\n		} else {\n			tp->nonagle &= ~TCP_NAGLE_CORK;\n			if (tp->nonagle&TCP_NAGLE_OFF)\n				tp->nonagle |= TCP_NAGLE_PUSH;\n			tcp_push_pending_frames(sk, tp);\n		}\n		break;\n\n	case TCP_KEEPIDLE:/* 设置保活探测前TCP空闲时间 */\n		if (val < 1 || val > MAX_TCP_KEEPIDLE)/* 参数检测 */\n			err = -EINVAL;\n		else {\n			tp->keepalive_time = val * HZ;/* 设置保活启动时间 */\n			if (sock_flag(sk, SOCK_KEEPOPEN) &&\n			    !((1 << sk->sk_state) &\n			      (TCPF_CLOSE | TCPF_LISTEN))) {/* 判断当前是否需要启动保活定时器 */\n				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;\n				if (tp->keepalive_time > elapsed)\n					elapsed = tp->keepalive_time - elapsed;\n				else\n					elapsed = 0;\n				tcp_reset_keepalive_timer(sk, elapsed);\n			}\n		}\n		break;\n	case TCP_KEEPINTVL:/* 设置保活探测间隔时间 */\n		if (val < 1 || val > MAX_TCP_KEEPINTVL)\n			err = -EINVAL;\n		else\n			tp->keepalive_intvl = val * HZ;\n		break;\n	case TCP_KEEPCNT:/* 设置保活探测次数，超过此值，则认为连接已经断开 */\n		if (val < 1 || val > MAX_TCP_KEEPCNT)\n			err = -EINVAL;\n		else\n			tp->keepalive_probes = val;\n		break;\n	case TCP_SYNCNT:/* 为建立连接而重发SYN的次数 */\n		if (val < 1 || val > MAX_TCP_SYNCNT)\n			err = -EINVAL;\n		else\n			tp->syn_retries = val;\n		break;\n\n	case TCP_LINGER2:/* 保持在FIN_WAIT_2状态的时间 */\n		if (val < 0)\n			tp->linger2 = -1;\n		else if (val > sysctl_tcp_fin_timeout / HZ)\n			tp->linger2 = 0;\n		else\n			tp->linger2 = val * HZ;\n		break;\n\n	case TCP_DEFER_ACCEPT:/* 延迟accept，这样可以将ack放到数据报文中进行应答。对HTTP来说有用。 */\n		tp->defer_accept = 0;\n		if (val > 0) {\n			/* Translate value in seconds to number of\n			 * retransmits */\n			while (tp->defer_accept < 32 &&\n			       val > ((TCP_TIMEOUT_INIT / HZ) <<\n				       tp->defer_accept))\n				tp->defer_accept++;\n			tp->defer_accept++;\n		}\n		break;\n\n	case TCP_WINDOW_CLAMP:/* 设置滑动窗口上限 */\n		if (!val) {\n			if (sk->sk_state != TCP_CLOSE) {\n				err = -EINVAL;\n				break;\n			}\n			tp->window_clamp = 0;\n		} else\n			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?\n						SOCK_MIN_RCVBUF / 2 : val;\n		break;\n\n	case TCP_QUICKACK:/* 启用或者禁用快速确认模式，该标志是暂时性的。 */\n		if (!val) {\n			tp->ack.pingpong = 1;\n		} else {\n			tp->ack.pingpong = 0;\n			if ((1 << sk->sk_state) &\n			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&\n			    tcp_ack_scheduled(tp)) {\n				tp->ack.pending |= TCP_ACK_PUSHED;\n				cleanup_rbuf(sk, 1);\n				if (!(val & 1))\n					tp->ack.pingpong = 1;\n			}\n		}\n		break;\n\n	default:\n		err = -ENOPROTOOPT;\n		break;\n	};\n	release_sock(sk);\n	return err;\n}\n\n/* Return information about state of tcp endpoint in API format. */\nvoid tcp_get_info(struct sock *sk, struct tcp_info *info)\n{\n	struct tcp_sock *tp = tcp_sk(sk);\n	u32 now = tcp_time_stamp;\n\n	memset(info, 0, sizeof(*info));\n\n	info->tcpi_state = sk->sk_state;\n	info->tcpi_ca_state = tp->ca_state;\n	info->tcpi_retransmits = tp->retransmits;\n	info->tcpi_probes = tp->probes_out;\n	info->tcpi_backoff = tp->backoff;\n\n	if (tp->rx_opt.tstamp_ok)\n		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;\n	if (tp->rx_opt.sack_ok)\n		info->tcpi_options |= TCPI_OPT_SACK;\n	if (tp->rx_opt.wscale_ok) {\n		info->tcpi_options |= TCPI_OPT_WSCALE;\n		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;\n		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;\n	} \n\n	if (tp->ecn_flags&TCP_ECN_OK)\n		info->tcpi_options |= TCPI_OPT_ECN;\n\n	info->tcpi_rto = jiffies_to_usecs(tp->rto);\n	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);\n	info->tcpi_snd_mss = tp->mss_cache_std;\n	info->tcpi_rcv_mss = tp->ack.rcv_mss;\n\n	info->tcpi_unacked = tp->packets_out;\n	info->tcpi_sacked = tp->sacked_out;\n	info->tcpi_lost = tp->lost_out;\n	info->tcpi_retrans = tp->retrans_out;\n	info->tcpi_fackets = tp->fackets_out;\n\n	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);\n	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);\n	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);\n\n	info->tcpi_pmtu = tp->pmtu_cookie;\n	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;\n	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;\n	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;\n	info->tcpi_snd_ssthresh = tp->snd_ssthresh;\n	info->tcpi_snd_cwnd = tp->snd_cwnd;\n	info->tcpi_advmss = tp->advmss;\n	info->tcpi_reordering = tp->reordering;\n\n	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;\n	info->tcpi_rcv_space = tp->rcvq_space.space;\n\n	info->tcpi_total_retrans = tp->total_retrans;\n}\n\nEXPORT_SYMBOL_GPL(tcp_get_info);\n\nint tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,\n		   int __user *optlen)\n{\n	struct tcp_sock *tp = tcp_sk(sk);\n	int val, len;\n\n	if (level != SOL_TCP)\n		return tp->af_specific->getsockopt(sk, level, optname,\n						   optval, optlen);\n\n	if (get_user(len, optlen))\n		return -EFAULT;\n\n	len = min_t(unsigned int, len, sizeof(int));\n\n	if (len < 0)\n		return -EINVAL;\n\n	switch (optname) {\n	case TCP_MAXSEG:\n		val = tp->mss_cache_std;\n		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))\n			val = tp->rx_opt.user_mss;\n		break;\n	case TCP_NODELAY:\n		val = !!(tp->nonagle&TCP_NAGLE_OFF);\n		break;\n	case TCP_CORK:\n		val = !!(tp->nonagle&TCP_NAGLE_CORK);\n		break;\n	case TCP_KEEPIDLE:\n		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;\n		break;\n	case TCP_KEEPINTVL:\n		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;\n		break;\n	case TCP_KEEPCNT:\n		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;\n		break;\n	case TCP_SYNCNT:\n		val = tp->syn_retries ? : sysctl_tcp_syn_retries;\n		break;\n	case TCP_LINGER2:\n		val = tp->linger2;\n		if (val >= 0)\n			val = (val ? : sysctl_tcp_fin_timeout) / HZ;\n		break;\n	case TCP_DEFER_ACCEPT:\n		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<\n					       (tp->defer_accept - 1));\n		break;\n	case TCP_WINDOW_CLAMP:\n		val = tp->window_clamp;\n		break;\n	case TCP_INFO: {\n		struct tcp_info info;\n\n		if (get_user(len, optlen))\n			return -EFAULT;\n\n		tcp_get_info(sk, &info);\n\n		len = min_t(unsigned int, len, sizeof(info));\n		if (put_user(len, optlen))\n			return -EFAULT;\n		if (copy_to_user(optval, &info, len))\n			return -EFAULT;\n		return 0;\n	}\n	case TCP_QUICKACK:\n		val = !tp->ack.pingpong;\n		break;\n	default:\n		return -ENOPROTOOPT;\n	};\n\n	if (put_user(len, optlen))\n		return -EFAULT;\n	if (copy_to_user(optval, &val, len))\n		return -EFAULT;\n	return 0;\n}\n\n\nextern void __skb_cb_too_small_for_tcp(int, int);\nextern void tcpdiag_init(void);\n\nstatic __initdata unsigned long thash_entries;\nstatic int __init set_thash_entries(char *str)\n{\n	if (!str)\n		return 0;\n	thash_entries = simple_strtoul(str, &str, 0);\n	return 1;\n}\n__setup(\"thash_entries=\", set_thash_entries);\n\nvoid __init tcp_init(void)\n{\n	struct sk_buff *skb = NULL;\n	int order, i;\n\n	/* cb结构必须能够容纳tcp_skb_cb，否则报告错误 */\n	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))\n		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),\n					   sizeof(skb->cb));\n\n	tcp_openreq_cachep = kmem_cache_create(\"tcp_open_request\",\n						   sizeof(struct open_request),\n					       0, SLAB_HWCACHE_ALIGN,\n					       NULL, NULL);\n	if (!tcp_openreq_cachep)\n		panic(\"tcp_init: Cannot alloc open_request cache.\");\n\n	/* 分配高速缓存，用于保存已经绑定端口的信息 */\n	tcp_bucket_cachep = kmem_cache_create(\"tcp_bind_bucket\",\n					      sizeof(struct tcp_bind_bucket),\n					      0, SLAB_HWCACHE_ALIGN,\n					      NULL, NULL);\n	if (!tcp_bucket_cachep)\n		panic(\"tcp_init: Cannot alloc tcp_bind_bucket cache.\");\n\n	tcp_timewait_cachep = kmem_cache_create(\"tcp_tw_bucket\",\n						sizeof(struct tcp_tw_bucket),\n						0, SLAB_HWCACHE_ALIGN,\n						NULL, NULL);\n	if (!tcp_timewait_cachep)\n		panic(\"tcp_init: Cannot alloc tcp_tw_bucket cache.\");\n\n	/* Size and allocate the main established and bind bucket\n	 * hash tables.\n	 *\n	 * The methodology is similar to that of the buffer cache.\n	 */\n	/* 分配已经建立的连接项的哈希表内存，thash_entries是内核参数 */\n	tcp_ehash = (struct tcp_ehash_bucket *)\n		alloc_large_system_hash(\"TCP established\",\n					sizeof(struct tcp_ehash_bucket),\n					thash_entries,\n					(num_physpages >= 128 * 1024) ?\n						(25 - PAGE_SHIFT) :\n						(27 - PAGE_SHIFT),\n					HASH_HIGHMEM,\n					&tcp_ehash_size,\n					NULL,\n					0);\n	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;\n	for (i = 0; i < (tcp_ehash_size << 1); i++) {\n		rwlock_init(&tcp_ehash[i].lock);\n		INIT_HLIST_HEAD(&tcp_ehash[i].chain);\n	}\n\n	/* 分配绑定端口的散列表。 */\n	tcp_bhash = (struct tcp_bind_hashbucket *)\n		alloc_large_system_hash(\"TCP bind\",\n					sizeof(struct tcp_bind_hashbucket),\n					tcp_ehash_size,\n					(num_physpages >= 128 * 1024) ?\n						(25 - PAGE_SHIFT) :\n						(27 - PAGE_SHIFT),\n					HASH_HIGHMEM,\n					&tcp_bhash_size,\n					NULL,\n					64 * 1024);\n	tcp_bhash_size = 1 << tcp_bhash_size;\n	for (i = 0; i < tcp_bhash_size; i++) {\n		spin_lock_init(&tcp_bhash[i].lock);\n		INIT_HLIST_HEAD(&tcp_bhash[i].chain);\n	}\n\n	/* Try to be a bit smarter and adjust defaults depending\n	 * on available memory.\n	 */\n	/* 将哈希表的大小折算成order值 */\n	for (order = 0; ((1 << order) << PAGE_SHIFT) <\n			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));\n			order++)\n		;\n	if (order > 4) {/* 根据order值基本上可以决定是服务器还是一般的桌面系统，据此设置相应参数 */\n		sysctl_local_port_range[0] = 32768;\n		sysctl_local_port_range[1] = 61000;\n		sysctl_tcp_max_tw_buckets = 180000;\n		sysctl_tcp_max_orphans = 4096 << (order - 4);\n		sysctl_max_syn_backlog = 1024;\n	} else if (order < 3) {\n		sysctl_local_port_range[0] = 1024 * (3 - order);\n		sysctl_tcp_max_tw_buckets >>= (3 - order);\n		sysctl_tcp_max_orphans >>= (3 - order);\n		sysctl_max_syn_backlog = 128;\n	}\n	tcp_port_rover = sysctl_local_port_range[0] - 1;\n\n	/* 初始化内存控制参数 */\n	sysctl_tcp_mem[0] =  768 << order;\n	sysctl_tcp_mem[1] = 1024 << order;\n	sysctl_tcp_mem[2] = 1536 << order;\n\n	if (order < 3) {\n		sysctl_tcp_wmem[2] = 64 * 1024;\n		sysctl_tcp_rmem[0] = PAGE_SIZE;\n		sysctl_tcp_rmem[1] = 43689;\n		sysctl_tcp_rmem[2] = 2 * 43689;\n	}\n\n	printk(KERN_INFO \"TCP: Hash tables configured \"\n	       \"(established %d bind %d)\\n\",\n	       tcp_ehash_size << 1, tcp_bhash_size);\n}\n\nEXPORT_SYMBOL(tcp_accept);\nEXPORT_SYMBOL(tcp_close);\nEXPORT_SYMBOL(tcp_destroy_sock);\nEXPORT_SYMBOL(tcp_disconnect);\nEXPORT_SYMBOL(tcp_getsockopt);\nEXPORT_SYMBOL(tcp_ioctl);\nEXPORT_SYMBOL(tcp_openreq_cachep);\nEXPORT_SYMBOL(tcp_poll);\nEXPORT_SYMBOL(tcp_read_sock);\nEXPORT_SYMBOL(tcp_recvmsg);\nEXPORT_SYMBOL(tcp_sendmsg);\nEXPORT_SYMBOL(tcp_sendpage);\nEXPORT_SYMBOL(tcp_setsockopt);\nEXPORT_SYMBOL(tcp_shutdown);\nEXPORT_SYMBOL(tcp_statistics);\nEXPORT_SYMBOL(tcp_timewait_cachep);\n",
			"file": "net/ipv4/tcp.c",
			"file_size": 75664,
			"file_write_time": 130392220660000000,
			"settings":
			{
				"buffer_size": 71560,
				"line_ending": "Unix",
				"scratch": true
			}
		}
	],
	"build_system": "",
	"command_palette":
	{
		"height": 67.0,
		"selected_items":
		[
			[
				"install",
				"Package Control: Install Package"
			],
			[
				"pac",
				"Preferences: Browse Packages"
			]
		],
		"width": 521.0
	},
	"console":
	{
		"height": 126.0,
		"history":
		[
			"import urllib.request,os,hashlib; h = '2deb499853c4371624f5a07e27c334aa' + 'bf8c4e67d14fb0525ba4f89698a6d7e1'; pf = 'Package Control.sublime-package'; ipp = sublime.installed_packages_path(); urllib.request.install_opener( urllib.request.build_opener( urllib.request.ProxyHandler()) ); by = urllib.request.urlopen( 'http://packagecontrol.io/' + pf.replace(' ', '%20')).read(); dh = hashlib.sha256(by).hexdigest(); print('Error validating download (got %s instead of %s), please try manual install' % (dh, h)) if dh != h else open(os.path.join( ipp, pf), 'wb' ).write(by)"
		]
	},
	"distraction_free":
	{
		"menu_visible": true,
		"show_minimap": false,
		"show_open_files": false,
		"show_tabs": false,
		"side_bar_visible": false,
		"status_bar_visible": false
	},
	"expanded_folders":
	[
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/net",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/net/ipv4"
	],
	"file_history":
	[
		"/Users/fengjc/.bash_profile",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/net/TUNABLE",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/net/ipv4/tcp.c",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/Documentation/networking/tcp.txt",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/net/ipv4/tcp_ipv4.c",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/drivers/base/transport_class.c",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/include/linux/random.h",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/drivers/char/random.c",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/arch/arm/mm/init.c",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/README",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/include/net/tcp.h",
		"/Users/fengjc/Library/Application Support/Sublime Text 3/Packages/User/Default (OSX).sublime-keymap",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/include/config/nls/codepage/850.h",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/drivers/net/8139cp.c",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/net/ipv4/tcp_input.c",
		"/Users/fengjc/AliDrive/code/linux/linux-2.6.11.12/include/linux/tc_act/tc_pedit.h"
	],
	"find":
	{
		"height": 35.0
	},
	"find_in_files":
	{
		"height": 93.0,
		"where_history":
		[
		]
	},
	"find_state":
	{
		"case_sensitive": false,
		"find_history":
		[
			"tcp",
			"tcp_sendmsg"
		],
		"highlight": true,
		"in_selection": false,
		"preserve_case": false,
		"regex": false,
		"replace_history":
		[
		],
		"reverse": false,
		"show_context": true,
		"use_buffer2": true,
		"whole_word": false,
		"wrap": true
	},
	"groups":
	[
		{
			"selected": 0,
			"sheets":
			[
				{
					"buffer": 0,
					"file": "net/ipv4/tcp.c",
					"semi_transient": false,
					"settings":
					{
						"buffer_size": 71560,
						"regions":
						{
						},
						"selection":
						[
							[
								25947,
								25958
							]
						],
						"settings":
						{
							"BracketHighlighterBusy": false,
							"bh_regions":
							[
								"bh_angle",
								"bh_angle_center",
								"bh_angle_open",
								"bh_angle_close",
								"bh_angle_content",
								"bh_round",
								"bh_round_center",
								"bh_round_open",
								"bh_round_close",
								"bh_round_content",
								"bh_c_define",
								"bh_c_define_center",
								"bh_c_define_open",
								"bh_c_define_close",
								"bh_c_define_content",
								"bh_default",
								"bh_default_center",
								"bh_default_open",
								"bh_default_close",
								"bh_default_content",
								"bh_curly",
								"bh_curly_center",
								"bh_curly_open",
								"bh_curly_close",
								"bh_curly_content",
								"bh_regex",
								"bh_regex_center",
								"bh_regex_open",
								"bh_regex_close",
								"bh_regex_content",
								"bh_square",
								"bh_square_center",
								"bh_square_open",
								"bh_square_close",
								"bh_square_content",
								"bh_double_quote",
								"bh_double_quote_center",
								"bh_double_quote_open",
								"bh_double_quote_close",
								"bh_double_quote_content",
								"bh_tag",
								"bh_tag_center",
								"bh_tag_open",
								"bh_tag_close",
								"bh_tag_content",
								"bh_single_quote",
								"bh_single_quote_center",
								"bh_single_quote_open",
								"bh_single_quote_close",
								"bh_single_quote_content",
								"bh_unmatched",
								"bh_unmatched_center",
								"bh_unmatched_open",
								"bh_unmatched_close",
								"bh_unmatched_content"
							],
							"in_converting": true,
							"origin_encoding": "GB2312",
							"syntax": "Packages/C++/C++.tmLanguage",
							"translate_tabs_to_spaces": false
						},
						"translation.x": 0.0,
						"translation.y": 12033.0,
						"zoom_level": 1.0
					},
					"stack_index": 0,
					"type": "text"
				}
			]
		}
	],
	"incremental_find":
	{
		"height": 23.0
	},
	"input":
	{
		"height": 0.0
	},
	"layout":
	{
		"cells":
		[
			[
				0,
				0,
				1,
				1
			]
		],
		"cols":
		[
			0.0,
			1.0
		],
		"rows":
		[
			0.0,
			1.0
		]
	},
	"menu_visible": true,
	"output.find_results":
	{
		"height": 0.0
	},
	"project": "linux-2.6.11.12.sublime-project",
	"replace":
	{
		"height": 42.0
	},
	"save_all_on_build": true,
	"select_file":
	{
		"height": 0.0,
		"selected_items":
		[
			[
				"tcp.c",
				"net/ipv4/tcp.c"
			],
			[
				"tcp_ipv4",
				"net/ipv4/tcp_ipv4.c"
			],
			[
				"",
				"README"
			],
			[
				"tcp_ip",
				"net/ipv4/tcp_ipv4.c"
			]
		],
		"width": 0.0
	},
	"select_project":
	{
		"height": 0.0,
		"selected_items":
		[
		],
		"width": 0.0
	},
	"select_symbol":
	{
		"height": 0.0,
		"selected_items":
		[
		],
		"width": 0.0
	},
	"selected_group": 0,
	"settings":
	{
	},
	"show_minimap": true,
	"show_open_files": false,
	"show_tabs": true,
	"side_bar_visible": true,
	"side_bar_width": 291.0,
	"status_bar_visible": true,
	"template_settings":
	{
	}
}