tcpcong.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617
  1. /*++
  2. Copyright (c) 2013 Minoca Corp.
  3. This file is licensed under the terms of the GNU General Public License
  4. version 3. Alternative licensing terms are available. Contact
  5. info@minocacorp.com for details. See the LICENSE file at the root of this
  6. project for complete licensing information.
  7. Module Name:
  8. tcpcong.c
  9. Abstract:
  10. This module implements support for TCP congestion control. Specifically
  11. this module implements the New Reno algorithm, however this set of functions
  12. could easily be interfaced to include alternate congestion control
  13. algorithms.
  14. Author:
  15. Evan Green 15-Apr-2013
  16. Environment:
  17. Kernel
  18. --*/
  19. //
  20. // ------------------------------------------------------------------- Includes
  21. //
  22. //
  23. // Protocol drivers are supposed to be able to stand on their own (ie be able to
  24. // be implemented outside the core net library). For the builtin ones, avoid
  25. // including netcore.h, but still redefine those functions that would otherwise
  26. // generate imports.
  27. //
  28. #define NET_API __DLLEXPORT
  29. #include <minoca/kernel/driver.h>
  30. #include <minoca/net/netdrv.h>
  31. #include "tcp.h"
  32. //
  33. // ---------------------------------------------------------------- Definitions
  34. //
  35. //
  36. // ------------------------------------------------------ Data Type Definitions
  37. //
  38. //
  39. // ----------------------------------------------- Internal Function Prototypes
  40. //
  41. //
  42. // -------------------------------------------------------------------- Globals
  43. //
  44. ULONGLONG NetDefaultRoundTripTicks = 0;
  45. //
  46. // ------------------------------------------------------------------ Functions
  47. //
  48. VOID
  49. NetpTcpCongestionInitializeSocket (
  50. PTCP_SOCKET Socket
  51. )
  52. /*++
  53. Routine Description:
  54. This routine initializes the congestion control portion of the TCP socket.
  55. Arguments:
  56. Socket - Supplies a pointer to the socket to initialize.
  57. Return Value:
  58. None.
  59. --*/
  60. {
  61. ULONGLONG Ticks;
  62. if (NetDefaultRoundTripTicks == 0) {
  63. Ticks = TCP_DEFAULT_ROUND_TRIP_TIME * MICROSECONDS_PER_MILLISECOND;
  64. NetDefaultRoundTripTicks = KeConvertMicrosecondsToTimeTicks(Ticks) *
  65. TCP_ROUND_TRIP_SAMPLE_DENOMINATOR;
  66. }
  67. ASSERT((Socket->Flags & TCP_SOCKET_FLAG_IN_FAST_RECOVERY) == 0);
  68. Socket->SlowStartThreshold = MAX_ULONG;
  69. Socket->CongestionWindowSize = 2 * TCP_DEFAULT_MAX_SEGMENT_SIZE;
  70. Socket->FastRecoveryEndSequence = 0;
  71. Socket->RoundTripTime = NetDefaultRoundTripTicks;
  72. return;
  73. }
  74. VOID
  75. NetpTcpCongestionConnectionEstablished (
  76. PTCP_SOCKET Socket
  77. )
  78. /*++
  79. Routine Description:
  80. This routine is called when a socket moves to the Established state.
  81. Arguments:
  82. Socket - Supplies a pointer to the socket to initialize.
  83. Return Value:
  84. None.
  85. --*/
  86. {
  87. Socket->SlowStartThreshold = Socket->SendWindowSize;
  88. if (Socket->SendMaxSegmentSize == 0) {
  89. Socket->SendMaxSegmentSize = TCP_DEFAULT_MAX_SEGMENT_SIZE;
  90. }
  91. Socket->CongestionWindowSize = 2 * Socket->SendMaxSegmentSize;
  92. if (NetTcpDebugPrintCongestionControl != FALSE) {
  93. NetpTcpPrintSocketEndpoints(Socket, FALSE);
  94. RtlDebugPrint(" Initial SlowStartThreshold %d, "
  95. "CongestionWindowSize %d.\n",
  96. Socket->SlowStartThreshold,
  97. Socket->CongestionWindowSize);
  98. }
  99. return;
  100. }
  101. ULONG
  102. NetpTcpGetSendWindowSize (
  103. PTCP_SOCKET Socket
  104. )
  105. /*++
  106. Routine Description:
  107. This routine determines the current available window of data that can be
  108. sent, taking into account both the receiver's window and the congestion
  109. window.
  110. Arguments:
  111. Socket - Supplies a pointer to the socket whose send window should be
  112. returned.
  113. Return Value:
  114. Returns one beyond the highest sequence number that can currently be sent.
  115. --*/
  116. {
  117. ULONGLONG DueTime;
  118. ULONGLONG WaitInMicroseconds;
  119. ULONG WindowSize;
  120. WindowSize = Socket->CongestionWindowSize;
  121. if (Socket->SendWindowSize < WindowSize) {
  122. WindowSize = Socket->SendWindowSize;
  123. if (WindowSize == 0) {
  124. //
  125. // If this is the first time the window is being seen as zero,
  126. // start the probe timer.
  127. //
  128. if (Socket->RetryTime == 0) {
  129. WaitInMicroseconds = Socket->RetryWaitPeriod *
  130. MICROSECONDS_PER_MILLISECOND;
  131. DueTime = KeGetRecentTimeCounter() +
  132. KeConvertMicrosecondsToTimeTicks(WaitInMicroseconds);
  133. Socket->RetryTime = DueTime;
  134. //
  135. // This socket has grown impatient with a zero window size, try
  136. // sending something to see if an ACK comes back with an updated
  137. // window size.
  138. //
  139. } else if (KeGetRecentTimeCounter() > Socket->RetryTime) {
  140. Socket->RetryTime = 0;
  141. WindowSize = Socket->SendMaxSegmentSize;
  142. //
  143. // Double the wait period in case nothing comes back.
  144. //
  145. Socket->RetryWaitPeriod *= 2;
  146. if (Socket->RetryWaitPeriod > TCP_WINDOW_WAIT_PERIOD_MAX) {
  147. Socket->RetryWaitPeriod = TCP_WINDOW_WAIT_PERIOD_MAX;
  148. }
  149. }
  150. }
  151. }
  152. return WindowSize;
  153. }
  154. VOID
  155. NetpTcpCongestionAcknowledgeReceived (
  156. PTCP_SOCKET Socket,
  157. ULONG AcknowledgeNumber
  158. )
  159. /*++
  160. Routine Description:
  161. This routine is called when an acknowledge (duplicate or not) comes in.
  162. This routine assumes the socket lock is already held.
  163. Arguments:
  164. Socket - Supplies a pointer to the socket that just got an acknowledge.
  165. AcknowledgeNumber - Supplies the acknowledge number that came in.
  166. Return Value:
  167. None.
  168. --*/
  169. {
  170. ULONG Flags;
  171. ULONG SegmentSize;
  172. ULONG WindowIncrease;
  173. //
  174. // Process an ACK that made progress.
  175. //
  176. SegmentSize = Socket->SendMaxSegmentSize;
  177. if (Socket->DuplicateAcknowledgeCount == 0) {
  178. //
  179. // The same ACK can come in multiple times and not get counted as a
  180. // duplicate. Really only adjust things when new ACKs come in.
  181. //
  182. if (AcknowledgeNumber != Socket->PreviousAcknowledgeNumber) {
  183. //
  184. // Perform slow start if below the threshold. With slow start,
  185. // the congestion window is increased 1 Maximum Segment Size for
  186. // every new ACK received. Thus it is really exponentially
  187. // increasing.
  188. //
  189. Flags = Socket->Flags;
  190. if (Socket->CongestionWindowSize <= Socket->SlowStartThreshold) {
  191. Socket->CongestionWindowSize += SegmentSize;
  192. if (NetTcpDebugPrintCongestionControl != FALSE) {
  193. NetpTcpPrintSocketEndpoints(Socket, FALSE);
  194. RtlDebugPrint(" SlowStart Window up by %d to %d.\n",
  195. SegmentSize,
  196. Socket->CongestionWindowSize);
  197. }
  198. //
  199. // Perform fast recovery if enabled.
  200. //
  201. } else if ((Flags & TCP_SOCKET_FLAG_IN_FAST_RECOVERY) != 0) {
  202. //
  203. // If the acknowledge number is greater than the highest
  204. // sequence number in flight when the old packet was lost, then
  205. // go back to regular congestion avoidance mode.
  206. //
  207. if ((AcknowledgeNumber == Socket->FastRecoveryEndSequence) ||
  208. (TCP_SEQUENCE_GREATER_THAN(AcknowledgeNumber,
  209. Socket->FastRecoveryEndSequence))) {
  210. Socket->Flags &= ~TCP_SOCKET_FLAG_IN_FAST_RECOVERY;
  211. Socket->CongestionWindowSize = Socket->SlowStartThreshold;
  212. if (NetTcpDebugPrintCongestionControl != FALSE) {
  213. NetpTcpPrintSocketEndpoints(Socket, FALSE);
  214. RtlDebugPrint(" Exit FastRecovery: Window %d\n",
  215. Socket->CongestionWindowSize);
  216. }
  217. }
  218. //
  219. // If the socket is still in fast recovery mode, then only
  220. // partial progress was made. The acknowledge number must point
  221. // to the next hole, so send that off right away.
  222. //
  223. if (((Socket->Flags & TCP_SOCKET_FLAG_IN_FAST_RECOVERY) != 0) &&
  224. (Socket->SendWindowSize != 0)) {
  225. NetpTcpRetransmit(Socket);
  226. }
  227. //
  228. // Perform congestion avoidance.
  229. //
  230. } else {
  231. WindowIncrease = SegmentSize * SegmentSize /
  232. Socket->CongestionWindowSize;
  233. if (WindowIncrease == 0) {
  234. WindowIncrease = 1;
  235. }
  236. Socket->CongestionWindowSize += WindowIncrease;
  237. if (NetTcpDebugPrintCongestionControl != FALSE) {
  238. NetpTcpPrintSocketEndpoints(Socket, FALSE);
  239. RtlDebugPrint(" CongestionAvoid Window up by %d to %d.\n",
  240. WindowIncrease,
  241. Socket->CongestionWindowSize);
  242. }
  243. }
  244. }
  245. //
  246. // Process a duplicate ACK.
  247. //
  248. } else if (Socket->DuplicateAcknowledgeCount >=
  249. TCP_DUPLICATE_ACK_THRESHOLD) {
  250. //
  251. // Cut the window if this just crossed the "packet loss" threshold.
  252. //
  253. if (Socket->DuplicateAcknowledgeCount == TCP_DUPLICATE_ACK_THRESHOLD) {
  254. //
  255. // Set the slow start threshold to half the congestion window. The
  256. // congestion window is also halved, but three segment sizes are
  257. // added to it to represent the packets after the hole that are
  258. // presumably buffered on the other side. This is called "inflating"
  259. // the window.
  260. //
  261. Socket->SlowStartThreshold = Socket->CongestionWindowSize / 2;
  262. Socket->CongestionWindowSize = (Socket->CongestionWindowSize / 2) +
  263. (TCP_DUPLICATE_ACK_THRESHOLD * SegmentSize);
  264. Socket->Flags |= TCP_SOCKET_FLAG_IN_FAST_RECOVERY;
  265. Socket->FastRecoveryEndSequence = Socket->SendNextNetworkSequence;
  266. if (NetTcpDebugPrintCongestionControl != FALSE) {
  267. NetpTcpPrintSocketEndpoints(Socket, FALSE);
  268. RtlDebugPrint(" Entering FastRecovery. SlowStartThreshold %d, "
  269. "Window %d, FastRecoveryEnd %x\n",
  270. Socket->SlowStartThreshold,
  271. Socket->CongestionWindowSize,
  272. Socket->FastRecoveryEndSequence);
  273. }
  274. //
  275. // Process additional duplicate ACKs coming in after the window was cut.
  276. // Inflate the window to represent those packets sequentially after the
  277. // missing packet that are buffered up in the receiver.
  278. //
  279. } else {
  280. Socket->CongestionWindowSize += SegmentSize;
  281. if (NetTcpDebugPrintCongestionControl != FALSE) {
  282. NetpTcpPrintSocketEndpoints(Socket, FALSE);
  283. RtlDebugPrint(" FastRecovery ACK #%d. Window %d\n",
  284. Socket->DuplicateAcknowledgeCount,
  285. Socket->CongestionWindowSize);
  286. }
  287. }
  288. //
  289. // Fast retransmit the packet that's missing.
  290. //
  291. if (Socket->SendWindowSize != 0) {
  292. NetpTcpRetransmit(Socket);
  293. }
  294. }
  295. return;
  296. }
  297. VOID
  298. NetpTcpProcessNewRoundTripTimeSample (
  299. PTCP_SOCKET Socket,
  300. ULONGLONG RoundTripTicks
  301. )
  302. /*++
  303. Routine Description:
  304. This routine is called when a new round trip time sample arrives.
  305. Arguments:
  306. Socket - Supplies a pointer to the socket.
  307. RoundTripTicks - Supplies the most recent sample of round trip time, in
  308. time counter ticks.
  309. Return Value:
  310. None.
  311. --*/
  312. {
  313. ULONGLONG NewMilliseconds;
  314. ULONGLONG NewRoundTripTime;
  315. ULONGLONG SampleMilliseconds;
  316. ULONGLONG TimeCounterFrequency;
  317. //
  318. // The new round trip time is equal to A * NewSample + (1 - A) * OldValue,
  319. // basically a weighted average. The A part is split into a numerator and
  320. // denominator, and the result is stored multiplied by the denominator. So
  321. // the calculation is:
  322. // (Numerator * New) + (((Denominator - Numerator) * Original) /
  323. // Denominator).
  324. //
  325. NewRoundTripTime = (RoundTripTicks * TCP_ROUND_TRIP_SAMPLE_NUMERATOR) +
  326. ((Socket->RoundTripTime *
  327. (TCP_ROUND_TRIP_SAMPLE_DENOMINATOR -
  328. TCP_ROUND_TRIP_SAMPLE_NUMERATOR)) /
  329. TCP_ROUND_TRIP_SAMPLE_DENOMINATOR);
  330. Socket->RoundTripTime = NewRoundTripTime;
  331. if (NetTcpDebugPrintCongestionControl != FALSE) {
  332. TimeCounterFrequency = HlQueryTimeCounterFrequency();
  333. SampleMilliseconds = (RoundTripTicks * MILLISECONDS_PER_SECOND) /
  334. TimeCounterFrequency;
  335. NewMilliseconds = ((NewRoundTripTime *
  336. MILLISECONDS_PER_SECOND) /
  337. TCP_ROUND_TRIP_SAMPLE_DENOMINATOR) /
  338. TimeCounterFrequency;
  339. NetpTcpPrintSocketEndpoints(Socket, TRUE);
  340. RtlDebugPrint(" Round trip sample %I64dms, new estimate %I64dms.\n",
  341. SampleMilliseconds,
  342. NewMilliseconds);
  343. }
  344. return;
  345. }
  346. VOID
  347. NetpTcpGetTransmitTimeoutInterval (
  348. PTCP_SOCKET Socket,
  349. PTCP_SEND_SEGMENT Segment
  350. )
  351. /*++
  352. Routine Description:
  353. This routine sets the timeout duration for a transmitted packet.
  354. Arguments:
  355. Socket - Supplies a pointer to the socket.
  356. Segment - Supplies a pointer to the segment whose timeout interval needs to
  357. be set.
  358. Return Value:
  359. None. Upon completion, the segment's timeout interval is expected to be
  360. filled in by this routine.
  361. --*/
  362. {
  363. ULONGLONG Milliseconds;
  364. ULONGLONG NewTimeoutInterval;
  365. //
  366. // If this is the first time this is being sent, then set the timeout to
  367. // a couple round trip times.
  368. //
  369. if (Segment->SendAttemptCount == 0) {
  370. ASSERT(Segment->TimeoutInterval == 0);
  371. ASSERT(Socket->RoundTripTime != 0);
  372. Segment->TimeoutInterval = (Socket->RoundTripTime *
  373. TCP_ROUND_TRIP_TIMEOUT_FACTOR) /
  374. TCP_ROUND_TRIP_SAMPLE_DENOMINATOR;
  375. //
  376. // This packet is going around again, bump up the previous timeout
  377. // interval.
  378. //
  379. } else {
  380. ASSERT(Segment->TimeoutInterval != 0);
  381. NewTimeoutInterval =
  382. Segment->TimeoutInterval * TCP_ROUND_TRIP_TIMEOUT_FACTOR;
  383. //
  384. // This assert catches both a zero timeout interval (which is not
  385. // valid) and a timeout that just overflowed during the multiply.
  386. //
  387. ASSERT(NewTimeoutInterval > Segment->TimeoutInterval);
  388. Segment->TimeoutInterval = NewTimeoutInterval;
  389. }
  390. if (NetTcpDebugPrintCongestionControl != FALSE) {
  391. Milliseconds = Segment->TimeoutInterval * MILLISECONDS_PER_SECOND /
  392. HlQueryTimeCounterFrequency();
  393. RtlDebugPrint("TCP: Packet timeout %I64dms.\n", Milliseconds);
  394. }
  395. return;
  396. }
  397. VOID
  398. NetpTcpTransmissionTimeout (
  399. PTCP_SOCKET Socket,
  400. PTCP_SEND_SEGMENT Segment
  401. )
  402. /*++
  403. Routine Description:
  404. This routine is called when an acknowledge is not received for a sent
  405. packet in a timely manner (the packet timed out).
  406. Arguments:
  407. Socket - Supplies a pointer to the socket.
  408. Segment - Supplies a pointer to the segment that timed out.
  409. Return Value:
  410. None.
  411. --*/
  412. {
  413. ULONG RelativeSequenceNumber;
  414. ULONGLONG SentTime;
  415. ULONGLONG TimeoutTime;
  416. //
  417. // Set the slow start threshold to half of what the congestion window was
  418. // before the loss. Move all the way back to slow start for a loss.
  419. //
  420. Socket->SlowStartThreshold = Socket->CongestionWindowSize / 2;
  421. Socket->CongestionWindowSize = Socket->SendMaxSegmentSize;
  422. if (NetTcpDebugPrintCongestionControl != FALSE) {
  423. NetpTcpPrintSocketEndpoints(Socket, TRUE);
  424. RelativeSequenceNumber = Segment->SequenceNumber -
  425. Socket->SendInitialSequence;
  426. SentTime = (Segment->LastSendTime * MILLISECONDS_PER_SECOND) /
  427. HlQueryTimeCounterFrequency();
  428. TimeoutTime = (Segment->TimeoutInterval * MILLISECONDS_PER_SECOND) /
  429. HlQueryTimeCounterFrequency();
  430. RtlDebugPrint(" Timeout on Seq %d sent %I64dms timeout %I64dms, New "
  431. "SlowStartThreshold %d, CWindow %d.\n",
  432. RelativeSequenceNumber,
  433. SentTime,
  434. TimeoutTime,
  435. Socket->SlowStartThreshold,
  436. Socket->CongestionWindowSize);
  437. }
  438. return;
  439. }
  440. //
  441. // --------------------------------------------------------- Internal Functions
  442. //