tcpcong.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611
  1. /*++
  2. Copyright (c) 2013 Minoca Corp. All Rights Reserved
  3. Module Name:
  4. tcpcong.c
  5. Abstract:
  6. This module implements support for TCP congestion control. Specifically
  7. this module implements the New Reno algorithm, however this set of functions
  8. could easily be interfaced to include alternate congestion control
  9. algorithms.
  10. Author:
  11. Evan Green 15-Apr-2013
  12. Environment:
  13. Kernel
  14. --*/
  15. //
  16. // ------------------------------------------------------------------- Includes
  17. //
  18. //
  19. // Protocol drivers are supposed to be able to stand on their own (ie be able to
  20. // be implemented outside the core net library). For the builtin ones, avoid
  21. // including netcore.h, but still redefine those functions that would otherwise
  22. // generate imports.
  23. //
  24. #define NET_API __DLLEXPORT
  25. #include <minoca/kernel/driver.h>
  26. #include <minoca/net/netdrv.h>
  27. #include "tcp.h"
  28. //
  29. // ---------------------------------------------------------------- Definitions
  30. //
  31. //
  32. // ------------------------------------------------------ Data Type Definitions
  33. //
  34. //
  35. // ----------------------------------------------- Internal Function Prototypes
  36. //
  37. //
  38. // -------------------------------------------------------------------- Globals
  39. //
  40. ULONGLONG NetDefaultRoundTripTicks = 0;
  41. //
  42. // ------------------------------------------------------------------ Functions
  43. //
  44. VOID
  45. NetpTcpCongestionInitializeSocket (
  46. PTCP_SOCKET Socket
  47. )
  48. /*++
  49. Routine Description:
  50. This routine initializes the congestion control portion of the TCP socket.
  51. Arguments:
  52. Socket - Supplies a pointer to the socket to initialize.
  53. Return Value:
  54. None.
  55. --*/
  56. {
  57. ULONGLONG Ticks;
  58. if (NetDefaultRoundTripTicks == 0) {
  59. Ticks = TCP_DEFAULT_ROUND_TRIP_TIME * MICROSECONDS_PER_MILLISECOND;
  60. NetDefaultRoundTripTicks = KeConvertMicrosecondsToTimeTicks(Ticks) *
  61. TCP_ROUND_TRIP_SAMPLE_DENOMINATOR;
  62. }
  63. ASSERT((Socket->Flags & TCP_SOCKET_FLAG_IN_FAST_RECOVERY) == 0);
  64. Socket->SlowStartThreshold = MAX_ULONG;
  65. Socket->CongestionWindowSize = 2 * TCP_DEFAULT_MAX_SEGMENT_SIZE;
  66. Socket->FastRecoveryEndSequence = 0;
  67. Socket->RoundTripTime = NetDefaultRoundTripTicks;
  68. return;
  69. }
  70. VOID
  71. NetpTcpCongestionConnectionEstablished (
  72. PTCP_SOCKET Socket
  73. )
  74. /*++
  75. Routine Description:
  76. This routine is called when a socket moves to the Established state.
  77. Arguments:
  78. Socket - Supplies a pointer to the socket to initialize.
  79. Return Value:
  80. None.
  81. --*/
  82. {
  83. Socket->SlowStartThreshold = Socket->SendWindowSize;
  84. ASSERT(Socket->SendMaxSegmentSize != 0);
  85. Socket->CongestionWindowSize = 2 * Socket->SendMaxSegmentSize;
  86. if (NetTcpDebugPrintCongestionControl != FALSE) {
  87. NetpTcpPrintSocketEndpoints(Socket, FALSE);
  88. RtlDebugPrint(" Initial SlowStartThreshold %d, "
  89. "CongestionWindowSize %d.\n",
  90. Socket->SlowStartThreshold,
  91. Socket->CongestionWindowSize);
  92. }
  93. return;
  94. }
  95. ULONG
  96. NetpTcpGetSendWindowSize (
  97. PTCP_SOCKET Socket
  98. )
  99. /*++
  100. Routine Description:
  101. This routine determines the current available window of data that can be
  102. sent, taking into account both the receiver's window and the congestion
  103. window.
  104. Arguments:
  105. Socket - Supplies a pointer to the socket whose send window should be
  106. returned.
  107. Return Value:
  108. Returns one beyond the highest sequence number that can currently be sent.
  109. --*/
  110. {
  111. ULONGLONG DueTime;
  112. ULONGLONG WaitInMicroseconds;
  113. ULONG WindowSize;
  114. WindowSize = Socket->CongestionWindowSize;
  115. if (Socket->SendWindowSize < WindowSize) {
  116. WindowSize = Socket->SendWindowSize;
  117. if (WindowSize == 0) {
  118. //
  119. // If this is the first time the window is being seen as zero,
  120. // start the probe timer.
  121. //
  122. if (Socket->RetryTime == 0) {
  123. WaitInMicroseconds = Socket->RetryWaitPeriod *
  124. MICROSECONDS_PER_MILLISECOND;
  125. DueTime = KeGetRecentTimeCounter() +
  126. KeConvertMicrosecondsToTimeTicks(WaitInMicroseconds);
  127. Socket->RetryTime = DueTime;
  128. //
  129. // This socket has grown impatient with a zero window size, try
  130. // sending something to see if an ACK comes back with an updated
  131. // window size.
  132. //
  133. } else if (KeGetRecentTimeCounter() > Socket->RetryTime) {
  134. Socket->RetryTime = 0;
  135. WindowSize = Socket->SendMaxSegmentSize;
  136. //
  137. // Double the wait period in case nothing comes back.
  138. //
  139. Socket->RetryWaitPeriod *= 2;
  140. if (Socket->RetryWaitPeriod > TCP_WINDOW_WAIT_PERIOD_MAX) {
  141. Socket->RetryWaitPeriod = TCP_WINDOW_WAIT_PERIOD_MAX;
  142. }
  143. }
  144. }
  145. }
  146. return WindowSize;
  147. }
  148. VOID
  149. NetpTcpCongestionAcknowledgeReceived (
  150. PTCP_SOCKET Socket,
  151. ULONG AcknowledgeNumber
  152. )
  153. /*++
  154. Routine Description:
  155. This routine is called when an acknowledge (duplicate or not) comes in.
  156. This routine assumes the socket lock is already held.
  157. Arguments:
  158. Socket - Supplies a pointer to the socket that just got an acknowledge.
  159. AcknowledgeNumber - Supplies the acknowledge number that came in.
  160. Return Value:
  161. None.
  162. --*/
  163. {
  164. ULONG Flags;
  165. ULONG SegmentSize;
  166. ULONG WindowIncrease;
  167. //
  168. // Process an ACK that made progress.
  169. //
  170. SegmentSize = Socket->SendMaxSegmentSize;
  171. if (Socket->DuplicateAcknowledgeCount == 0) {
  172. //
  173. // The same ACK can come in multiple times and not get counted as a
  174. // duplicate. Really only adjust things when new ACKs come in.
  175. //
  176. if (AcknowledgeNumber != Socket->PreviousAcknowledgeNumber) {
  177. //
  178. // Perform slow start if below the threshold. With slow start,
  179. // the congestion window is increased 1 Maximum Segment Size for
  180. // every new ACK received. Thus it is really exponentially
  181. // increasing.
  182. //
  183. Flags = Socket->Flags;
  184. if (Socket->CongestionWindowSize <= Socket->SlowStartThreshold) {
  185. Socket->CongestionWindowSize += SegmentSize;
  186. if (NetTcpDebugPrintCongestionControl != FALSE) {
  187. NetpTcpPrintSocketEndpoints(Socket, FALSE);
  188. RtlDebugPrint(" SlowStart Window up by %d to %d.\n",
  189. SegmentSize,
  190. Socket->CongestionWindowSize);
  191. }
  192. //
  193. // Perform fast recovery if enabled.
  194. //
  195. } else if ((Flags & TCP_SOCKET_FLAG_IN_FAST_RECOVERY) != 0) {
  196. //
  197. // If the acknowledge number is greater than the highest
  198. // sequence number in flight when the old packet was lost, then
  199. // go back to regular congestion avoidance mode.
  200. //
  201. if ((AcknowledgeNumber == Socket->FastRecoveryEndSequence) ||
  202. (TCP_SEQUENCE_GREATER_THAN(AcknowledgeNumber,
  203. Socket->FastRecoveryEndSequence))) {
  204. Socket->Flags &= ~TCP_SOCKET_FLAG_IN_FAST_RECOVERY;
  205. Socket->CongestionWindowSize = Socket->SlowStartThreshold;
  206. if (NetTcpDebugPrintCongestionControl != FALSE) {
  207. NetpTcpPrintSocketEndpoints(Socket, FALSE);
  208. RtlDebugPrint(" Exit FastRecovery: Window %d\n",
  209. Socket->CongestionWindowSize);
  210. }
  211. }
  212. //
  213. // If the socket is still in fast recovery mode, then only
  214. // partial progress was made. The acknowledge number must point
  215. // to the next hole, so send that off right away.
  216. //
  217. if (((Socket->Flags & TCP_SOCKET_FLAG_IN_FAST_RECOVERY) != 0) &&
  218. (Socket->SendWindowSize != 0)) {
  219. NetpTcpRetransmit(Socket);
  220. }
  221. //
  222. // Perform congestion avoidance.
  223. //
  224. } else {
  225. WindowIncrease = SegmentSize * SegmentSize /
  226. Socket->CongestionWindowSize;
  227. if (WindowIncrease == 0) {
  228. WindowIncrease = 1;
  229. }
  230. Socket->CongestionWindowSize += WindowIncrease;
  231. if (NetTcpDebugPrintCongestionControl != FALSE) {
  232. NetpTcpPrintSocketEndpoints(Socket, FALSE);
  233. RtlDebugPrint(" CongestionAvoid Window up by %d to %d.\n",
  234. WindowIncrease,
  235. Socket->CongestionWindowSize);
  236. }
  237. }
  238. }
  239. //
  240. // Process a duplicate ACK.
  241. //
  242. } else if (Socket->DuplicateAcknowledgeCount >=
  243. TCP_DUPLICATE_ACK_THRESHOLD) {
  244. //
  245. // Cut the window if this just crossed the "packet loss" threshold.
  246. //
  247. if (Socket->DuplicateAcknowledgeCount == TCP_DUPLICATE_ACK_THRESHOLD) {
  248. //
  249. // Set the slow start threshold to half the congestion window. The
  250. // congestion window is also halved, but three segment sizes are
  251. // added to it to represent the packets after the hole that are
  252. // presumably buffered on the other side. This is called "inflating"
  253. // the window.
  254. //
  255. Socket->SlowStartThreshold = Socket->CongestionWindowSize / 2;
  256. Socket->CongestionWindowSize = (Socket->CongestionWindowSize / 2) +
  257. (TCP_DUPLICATE_ACK_THRESHOLD * SegmentSize);
  258. Socket->Flags |= TCP_SOCKET_FLAG_IN_FAST_RECOVERY;
  259. Socket->FastRecoveryEndSequence = Socket->SendNextNetworkSequence;
  260. if (NetTcpDebugPrintCongestionControl != FALSE) {
  261. NetpTcpPrintSocketEndpoints(Socket, FALSE);
  262. RtlDebugPrint(" Entering FastRecovery. SlowStartThreshold %d, "
  263. "Window %d, FastRecoveryEnd %x\n",
  264. Socket->SlowStartThreshold,
  265. Socket->CongestionWindowSize,
  266. Socket->FastRecoveryEndSequence);
  267. }
  268. //
  269. // Process additional duplicate ACKs coming in after the window was cut.
  270. // Inflate the window to represent those packets sequentially after the
  271. // missing packet that are buffered up in the receiver.
  272. //
  273. } else {
  274. Socket->CongestionWindowSize += SegmentSize;
  275. if (NetTcpDebugPrintCongestionControl != FALSE) {
  276. NetpTcpPrintSocketEndpoints(Socket, FALSE);
  277. RtlDebugPrint(" FastRecovery ACK #%d. Window %d\n",
  278. Socket->DuplicateAcknowledgeCount,
  279. Socket->CongestionWindowSize);
  280. }
  281. }
  282. //
  283. // Fast retransmit the packet that's missing.
  284. //
  285. if (Socket->SendWindowSize != 0) {
  286. NetpTcpRetransmit(Socket);
  287. }
  288. }
  289. return;
  290. }
  291. VOID
  292. NetpTcpProcessNewRoundTripTimeSample (
  293. PTCP_SOCKET Socket,
  294. ULONGLONG RoundTripTicks
  295. )
  296. /*++
  297. Routine Description:
  298. This routine is called when a new round trip time sample arrives.
  299. Arguments:
  300. Socket - Supplies a pointer to the socket.
  301. RoundTripTicks - Supplies the most recent sample of round trip time, in
  302. time counter ticks.
  303. Return Value:
  304. None.
  305. --*/
  306. {
  307. ULONGLONG NewMilliseconds;
  308. ULONGLONG NewRoundTripTime;
  309. ULONGLONG SampleMilliseconds;
  310. ULONGLONG TimeCounterFrequency;
  311. //
  312. // The new round trip time is equal to A * NewSample + (1 - A) * OldValue,
  313. // basically a weighted average. The A part is split into a numerator and
  314. // denominator, and the result is stored multiplied by the denominator. So
  315. // the calculation is:
  316. // (Numerator * New) + (((Denominator - Numerator) * Original) /
  317. // Denominator).
  318. //
  319. NewRoundTripTime = (RoundTripTicks * TCP_ROUND_TRIP_SAMPLE_NUMERATOR) +
  320. ((Socket->RoundTripTime *
  321. (TCP_ROUND_TRIP_SAMPLE_DENOMINATOR -
  322. TCP_ROUND_TRIP_SAMPLE_NUMERATOR)) /
  323. TCP_ROUND_TRIP_SAMPLE_DENOMINATOR);
  324. Socket->RoundTripTime = NewRoundTripTime;
  325. if (NetTcpDebugPrintCongestionControl != FALSE) {
  326. TimeCounterFrequency = HlQueryTimeCounterFrequency();
  327. SampleMilliseconds = (RoundTripTicks * MILLISECONDS_PER_SECOND) /
  328. TimeCounterFrequency;
  329. NewMilliseconds = ((NewRoundTripTime *
  330. MILLISECONDS_PER_SECOND) /
  331. TCP_ROUND_TRIP_SAMPLE_DENOMINATOR) /
  332. TimeCounterFrequency;
  333. NetpTcpPrintSocketEndpoints(Socket, TRUE);
  334. RtlDebugPrint(" Round trip sample %I64dms, new estimate %I64dms.\n",
  335. SampleMilliseconds,
  336. NewMilliseconds);
  337. }
  338. return;
  339. }
  340. VOID
  341. NetpTcpGetTransmitTimeoutInterval (
  342. PTCP_SOCKET Socket,
  343. PTCP_SEND_SEGMENT Segment
  344. )
  345. /*++
  346. Routine Description:
  347. This routine sets the timeout duration for a transmitted packet.
  348. Arguments:
  349. Socket - Supplies a pointer to the socket.
  350. Segment - Supplies a pointer to the segment whose timeout interval needs to
  351. be set.
  352. Return Value:
  353. None. Upon completion, the segment's timeout interval is expected to be
  354. filled in by this routine.
  355. --*/
  356. {
  357. ULONGLONG Milliseconds;
  358. ULONGLONG NewTimeoutInterval;
  359. //
  360. // If this is the first time this is being sent, then set the timeout to
  361. // a couple round trip times.
  362. //
  363. if (Segment->SendAttemptCount == 0) {
  364. ASSERT(Segment->TimeoutInterval == 0);
  365. ASSERT(Socket->RoundTripTime != 0);
  366. Segment->TimeoutInterval = (Socket->RoundTripTime *
  367. TCP_ROUND_TRIP_TIMEOUT_FACTOR) /
  368. TCP_ROUND_TRIP_SAMPLE_DENOMINATOR;
  369. //
  370. // This packet is going around again, bump up the previous timeout
  371. // interval.
  372. //
  373. } else {
  374. ASSERT(Segment->TimeoutInterval != 0);
  375. NewTimeoutInterval =
  376. Segment->TimeoutInterval * TCP_ROUND_TRIP_TIMEOUT_FACTOR;
  377. //
  378. // This assert catches both a zero timeout interval (which is not
  379. // valid) and a timeout that just overflowed during the multiply.
  380. //
  381. ASSERT(NewTimeoutInterval > Segment->TimeoutInterval);
  382. Segment->TimeoutInterval = NewTimeoutInterval;
  383. }
  384. if (NetTcpDebugPrintCongestionControl != FALSE) {
  385. Milliseconds = Segment->TimeoutInterval * MILLISECONDS_PER_SECOND /
  386. HlQueryTimeCounterFrequency();
  387. RtlDebugPrint("TCP: Packet timeout %I64dms.\n", Milliseconds);
  388. }
  389. return;
  390. }
  391. VOID
  392. NetpTcpTransmissionTimeout (
  393. PTCP_SOCKET Socket,
  394. PTCP_SEND_SEGMENT Segment
  395. )
  396. /*++
  397. Routine Description:
  398. This routine is called when an acknowledge is not recieved for a sent
  399. packet in a timely manner (the packet timed out).
  400. Arguments:
  401. Socket - Supplies a pointer to the socket.
  402. Segment - Supplies a pointer to the segment that timed out.
  403. Return Value:
  404. None.
  405. --*/
  406. {
  407. ULONG RelativeSequenceNumber;
  408. ULONGLONG SentTime;
  409. ULONGLONG TimeoutTime;
  410. //
  411. // Set the slow start threshold to half of what the congestion window was
  412. // before the loss. Move all the way back to slow start for a loss.
  413. //
  414. Socket->SlowStartThreshold = Socket->CongestionWindowSize / 2;
  415. Socket->CongestionWindowSize = Socket->SendMaxSegmentSize;
  416. if (NetTcpDebugPrintCongestionControl != FALSE) {
  417. NetpTcpPrintSocketEndpoints(Socket, TRUE);
  418. RelativeSequenceNumber = Segment->SequenceNumber -
  419. Socket->SendInitialSequence;
  420. SentTime = (Segment->LastSendTime * MILLISECONDS_PER_SECOND) /
  421. HlQueryTimeCounterFrequency();
  422. TimeoutTime = (Segment->TimeoutInterval * MILLISECONDS_PER_SECOND) /
  423. HlQueryTimeCounterFrequency();
  424. RtlDebugPrint(" Timeout on Seq %d sent %I64dms timeout %I64dms, New "
  425. "SlowStartThreshold %d, CWindow %d.\n",
  426. RelativeSequenceNumber,
  427. SentTime,
  428. TimeoutTime,
  429. Socket->SlowStartThreshold,
  430. Socket->CongestionWindowSize);
  431. }
  432. return;
  433. }
  434. //
  435. // --------------------------------------------------------- Internal Functions
  436. //