tcp.c 65 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235
  1. #include "u.h"
  2. #include "../port/lib.h"
  3. #include "mem.h"
  4. #include "dat.h"
  5. #include "fns.h"
  6. #include "../port/error.h"
  7. #include "ip.h"
  8. enum
  9. {
  10. QMAX = 64*1024-1,
  11. IP_TCPPROTO = 6,
  12. TCP4_IPLEN = 8,
  13. TCP4_PHDRSIZE = 12,
  14. TCP4_HDRSIZE = 20,
  15. TCP4_TCBPHDRSZ = 40,
  16. TCP4_PKT = TCP4_IPLEN+TCP4_PHDRSIZE,
  17. TCP6_IPLEN = 0,
  18. TCP6_PHDRSIZE = 40,
  19. TCP6_HDRSIZE = 20,
  20. TCP6_TCBPHDRSZ = 60,
  21. TCP6_PKT = TCP6_IPLEN+TCP6_PHDRSIZE,
  22. TcptimerOFF = 0,
  23. TcptimerON = 1,
  24. TcptimerDONE = 2,
  25. MAX_TIME = (1<<20), /* Forever */
  26. TCP_ACK = 50, /* Timed ack sequence in ms */
  27. MAXBACKMS = 9*60*1000, /* longest backoff time (ms) before hangup */
  28. URG = 0x20, /* Data marked urgent */
  29. ACK = 0x10, /* Acknowledge is valid */
  30. PSH = 0x08, /* Whole data pipe is pushed */
  31. RST = 0x04, /* Reset connection */
  32. SYN = 0x02, /* Pkt. is synchronise */
  33. FIN = 0x01, /* Start close down */
  34. EOLOPT = 0,
  35. NOOPOPT = 1,
  36. MSSOPT = 2,
  37. MSS_LENGTH = 4, /* Maximum segment size */
  38. WSOPT = 3,
  39. WS_LENGTH = 3, /* Bits to scale window size by */
  40. MSL2 = 10,
  41. MSPTICK = 50, /* Milliseconds per timer tick */
  42. DEF_MSS = 1460, /* Default maximum segment */
  43. DEF_MSS6 = 1280, /* Default maximum segment (min) for v6 */
  44. DEF_RTT = 500, /* Default round trip */
  45. DEF_KAT = 120000, /* Default time (ms) between keep alives */
  46. TCP_LISTEN = 0, /* Listen connection */
  47. TCP_CONNECT = 1, /* Outgoing connection */
  48. SYNACK_RXTIMER = 250, /* ms between SYNACK retransmits */
  49. TCPREXMTTHRESH = 3, /* dupack threshhold for rxt */
  50. FORCE = 1,
  51. CLONE = 2,
  52. RETRAN = 4,
  53. ACTIVE = 8,
  54. SYNACK = 16,
  55. LOGAGAIN = 3,
  56. LOGDGAIN = 2,
  57. Closed = 0, /* Connection states */
  58. Listen,
  59. Syn_sent,
  60. Syn_received,
  61. Established,
  62. Finwait1,
  63. Finwait2,
  64. Close_wait,
  65. Closing,
  66. Last_ack,
  67. Time_wait,
  68. Maxlimbo = 1000, /* maximum procs waiting for response to SYN ACK */
  69. NLHT = 256, /* hash table size, must be a power of 2 */
  70. LHTMASK = NLHT-1,
  71. HaveWS = 1<<8,
  72. };
  73. /* Must correspond to the enumeration above */
  74. char *tcpstates[] =
  75. {
  76. "Closed", "Listen", "Syn_sent", "Syn_received",
  77. "Established", "Finwait1", "Finwait2", "Close_wait",
  78. "Closing", "Last_ack", "Time_wait"
  79. };
  80. typedef struct Tcptimer Tcptimer;
  81. struct Tcptimer
  82. {
  83. Tcptimer *next;
  84. Tcptimer *prev;
  85. Tcptimer *readynext;
  86. int state;
  87. int start;
  88. int count;
  89. void (*func)(void*);
  90. void *arg;
  91. };
  92. /*
  93. * v4 and v6 pseudo headers used for
  94. * checksuming tcp
  95. */
  96. typedef struct Tcp4hdr Tcp4hdr;
  97. struct Tcp4hdr
  98. {
  99. uchar vihl; /* Version and header length */
  100. uchar tos; /* Type of service */
  101. uchar length[2]; /* packet length */
  102. uchar id[2]; /* Identification */
  103. uchar frag[2]; /* Fragment information */
  104. uchar Unused;
  105. uchar proto;
  106. uchar tcplen[2];
  107. uchar tcpsrc[4];
  108. uchar tcpdst[4];
  109. uchar tcpsport[2];
  110. uchar tcpdport[2];
  111. uchar tcpseq[4];
  112. uchar tcpack[4];
  113. uchar tcpflag[2];
  114. uchar tcpwin[2];
  115. uchar tcpcksum[2];
  116. uchar tcpurg[2];
  117. /* Options segment */
  118. uchar tcpopt[1];
  119. };
  120. typedef struct Tcp6hdr Tcp6hdr;
  121. struct Tcp6hdr
  122. {
  123. uchar vcf[4];
  124. uchar ploadlen[2];
  125. uchar proto;
  126. uchar ttl;
  127. uchar tcpsrc[IPaddrlen];
  128. uchar tcpdst[IPaddrlen];
  129. uchar tcpsport[2];
  130. uchar tcpdport[2];
  131. uchar tcpseq[4];
  132. uchar tcpack[4];
  133. uchar tcpflag[2];
  134. uchar tcpwin[2];
  135. uchar tcpcksum[2];
  136. uchar tcpurg[2];
  137. /* Options segment */
  138. uchar tcpopt[1];
  139. };
  140. /*
  141. * this represents the control info
  142. * for a single packet. It is derived from
  143. * a packet in ntohtcp{4,6}() and stuck into
  144. * a packet in htontcp{4,6}().
  145. */
  146. typedef struct Tcp Tcp;
  147. struct Tcp
  148. {
  149. ushort source;
  150. ushort dest;
  151. ulong seq;
  152. ulong ack;
  153. uchar flags;
  154. ushort ws; /* window scale option (if not zero) */
  155. ulong wnd;
  156. ushort urg;
  157. ushort mss; /* max segment size option (if not zero) */
  158. ushort len; /* size of data */
  159. };
  160. /*
  161. * this header is malloc'd to thread together fragments
  162. * waiting to be coalesced
  163. */
  164. typedef struct Reseq Reseq;
  165. struct Reseq
  166. {
  167. Reseq *next;
  168. Tcp seg;
  169. Block *bp;
  170. ushort length;
  171. };
  172. /*
  173. * the qlock in the Conv locks this structure
  174. */
  175. typedef struct Tcpctl Tcpctl;
  176. struct Tcpctl
  177. {
  178. uchar state; /* Connection state */
  179. uchar type; /* Listening or active connection */
  180. uchar code; /* Icmp code */
  181. struct {
  182. ulong una; /* Unacked data pointer */
  183. ulong nxt; /* Next sequence expected */
  184. ulong ptr; /* Data pointer */
  185. ulong wnd; /* Tcp send window */
  186. ulong urg; /* Urgent data pointer */
  187. ulong wl2;
  188. int scale; /* how much to right shift window in xmitted packets */
  189. /* to implement tahoe and reno TCP */
  190. ulong dupacks; /* number of duplicate acks rcvd */
  191. int recovery; /* loss recovery flag */
  192. ulong rxt; /* right window marker for recovery */
  193. } snd;
  194. struct {
  195. ulong nxt; /* Receive pointer to next uchar slot */
  196. ulong wnd; /* Receive window incoming */
  197. ulong urg; /* Urgent pointer */
  198. int blocked;
  199. int una; /* unacked data segs */
  200. int scale; /* how much to left shift window in rcved packets */
  201. } rcv;
  202. ulong iss; /* Initial sequence number */
  203. int sawwsopt; /* true if we saw a wsopt on the incoming SYN */
  204. ulong cwind; /* Congestion window */
  205. int scale; /* desired snd.scale */
  206. ushort ssthresh; /* Slow start threshold */
  207. int resent; /* Bytes just resent */
  208. int irs; /* Initial received squence */
  209. ushort mss; /* Maximum segment size */
  210. int rerecv; /* Overlap of data rerecevived */
  211. ulong window; /* Receive window */
  212. uchar backoff; /* Exponential backoff counter */
  213. int backedoff; /* ms we've backed off for rexmits */
  214. uchar flags; /* State flags */
  215. Reseq *reseq; /* Resequencing queue */
  216. Tcptimer timer; /* Activity timer */
  217. Tcptimer acktimer; /* Acknowledge timer */
  218. Tcptimer rtt_timer; /* Round trip timer */
  219. Tcptimer katimer; /* keep alive timer */
  220. ulong rttseq; /* Round trip sequence */
  221. int srtt; /* Shortened round trip */
  222. int mdev; /* Mean deviation of round trip */
  223. int kacounter; /* count down for keep alive */
  224. uint sndsyntime; /* time syn sent */
  225. ulong time; /* time Finwait2 or Syn_received was sent */
  226. int nochecksum; /* non-zero means don't send checksums */
  227. int flgcnt; /* number of flags in the sequence (FIN,SEQ) */
  228. union {
  229. Tcp4hdr tcp4hdr;
  230. Tcp6hdr tcp6hdr;
  231. } protohdr; /* prototype header */
  232. };
  233. /*
  234. * New calls are put in limbo rather than having a conversation structure
  235. * allocated. Thus, a SYN attack results in lots of limbo'd calls but not
  236. * any real Conv structures mucking things up. Calls in limbo rexmit their
  237. * SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
  238. *
  239. * In particular they aren't on a listener's queue so that they don't figure
  240. * in the input queue limit.
  241. *
  242. * If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
  243. * of 70000 limbo'd calls. Not great for a linear list but doable. Therefore
  244. * there is no hashing of this list.
  245. */
  246. typedef struct Limbo Limbo;
  247. struct Limbo
  248. {
  249. Limbo *next;
  250. uchar laddr[IPaddrlen];
  251. uchar raddr[IPaddrlen];
  252. ushort lport;
  253. ushort rport;
  254. ulong irs; /* initial received sequence */
  255. ulong iss; /* initial sent sequence */
  256. ushort mss; /* mss from the other end */
  257. ushort rcvscale; /* how much to scale rcvd windows */
  258. ushort sndscale; /* how much to scale sent windows */
  259. ulong lastsend; /* last time we sent a synack */
  260. uchar version; /* v4 or v6 */
  261. uchar rexmits; /* number of retransmissions */
  262. };
  263. int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */
  264. ushort tcp_mss = DEF_MSS; /* Maximum segment size to be sent */
  265. enum {
  266. /* MIB stats */
  267. MaxConn,
  268. Mss,
  269. ActiveOpens,
  270. PassiveOpens,
  271. EstabResets,
  272. CurrEstab,
  273. InSegs,
  274. OutSegs,
  275. RetransSegs,
  276. RetransTimeouts,
  277. InErrs,
  278. OutRsts,
  279. /* non-MIB stats */
  280. CsumErrs,
  281. HlenErrs,
  282. LenErrs,
  283. OutOfOrder,
  284. Nstats
  285. };
  286. static char *statnames[] =
  287. {
  288. [MaxConn] "MaxConn",
  289. [Mss] "MaxSegment",
  290. [ActiveOpens] "ActiveOpens",
  291. [PassiveOpens] "PassiveOpens",
  292. [EstabResets] "EstabResets",
  293. [CurrEstab] "CurrEstab",
  294. [InSegs] "InSegs",
  295. [OutSegs] "OutSegs",
  296. [RetransSegs] "RetransSegs",
  297. [RetransTimeouts] "RetransTimeouts",
  298. [InErrs] "InErrs",
  299. [OutRsts] "OutRsts",
  300. [CsumErrs] "CsumErrs",
  301. [HlenErrs] "HlenErrs",
  302. [LenErrs] "LenErrs",
  303. [OutOfOrder] "OutOfOrder",
  304. };
  305. typedef struct Tcppriv Tcppriv;
  306. struct Tcppriv
  307. {
  308. /* List of active timers */
  309. QLock tl;
  310. Tcptimer *timers;
  311. /* hash table for matching conversations */
  312. Ipht ht;
  313. /* calls in limbo waiting for an ACK to our SYN ACK */
  314. int nlimbo;
  315. Limbo *lht[NLHT];
  316. /* for keeping track of tcpackproc */
  317. QLock apl;
  318. int ackprocstarted;
  319. uvlong stats[Nstats];
  320. };
  321. /*
  322. * Setting tcpporthogdefense to non-zero enables Dong Lin's
  323. * solution to hijacked systems staking out port's as a form
  324. * of DoS attack.
  325. *
  326. * To avoid stateless Conv hogs, we pick a sequence number at random. If
  327. * that number gets acked by the other end, we shut down the connection.
  328. * Look for tcpporthogdefense in the code.
  329. */
  330. int tcpporthogdefense = 0;
  331. int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
  332. void getreseq(Tcpctl*, Tcp*, Block**, ushort*);
  333. void localclose(Conv*, char*);
  334. void procsyn(Conv*, Tcp*);
  335. void tcpiput(Proto*, Ipifc*, Block*);
  336. void tcpoutput(Conv*);
  337. int tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
  338. void tcpstart(Conv*, int);
  339. void tcptimeout(void*);
  340. void tcpsndsyn(Conv*, Tcpctl*);
  341. void tcprcvwin(Conv*);
  342. void tcpacktimer(void*);
  343. void tcpkeepalive(void*);
  344. void tcpsetkacounter(Tcpctl*);
  345. void tcprxmit(Conv*);
  346. void tcpsettimer(Tcpctl*);
  347. void tcpsynackrtt(Conv*);
  348. void tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
  349. static void limborexmit(Proto*);
  350. static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
  351. void
  352. tcpsetstate(Conv *s, uchar newstate)
  353. {
  354. Tcpctl *tcb;
  355. uchar oldstate;
  356. Tcppriv *tpriv;
  357. tpriv = s->p->priv;
  358. tcb = (Tcpctl*)s->ptcl;
  359. oldstate = tcb->state;
  360. if(oldstate == newstate)
  361. return;
  362. if(oldstate == Established)
  363. tpriv->stats[CurrEstab]--;
  364. if(newstate == Established)
  365. tpriv->stats[CurrEstab]++;
  366. /**
  367. print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
  368. tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
  369. **/
  370. switch(newstate) {
  371. case Closed:
  372. qclose(s->rq);
  373. qclose(s->wq);
  374. qclose(s->eq);
  375. break;
  376. case Close_wait: /* Remote closes */
  377. qhangup(s->rq, nil);
  378. break;
  379. }
  380. tcb->state = newstate;
  381. if(oldstate == Syn_sent && newstate != Closed)
  382. Fsconnected(s, nil);
  383. }
  384. static char*
  385. tcpconnect(Conv *c, char **argv, int argc)
  386. {
  387. char *e;
  388. Tcpctl *tcb;
  389. tcb = (Tcpctl*)(c->ptcl);
  390. if(tcb->state != Closed)
  391. return Econinuse;
  392. e = Fsstdconnect(c, argv, argc);
  393. if(e != nil)
  394. return e;
  395. tcpstart(c, TCP_CONNECT);
  396. return nil;
  397. }
  398. static int
  399. tcpstate(Conv *c, char *state, int n)
  400. {
  401. Tcpctl *s;
  402. s = (Tcpctl*)(c->ptcl);
  403. return snprint(state, n,
  404. "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
  405. tcpstates[s->state],
  406. c->rq ? qlen(c->rq) : 0,
  407. c->wq ? qlen(c->wq) : 0,
  408. s->srtt, s->mdev,
  409. s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
  410. s->timer.start, s->timer.count, s->rerecv,
  411. s->katimer.start, s->katimer.count);
  412. }
  413. static int
  414. tcpinuse(Conv *c)
  415. {
  416. Tcpctl *s;
  417. s = (Tcpctl*)(c->ptcl);
  418. return s->state != Closed;
  419. }
  420. static char*
  421. tcpannounce(Conv *c, char **argv, int argc)
  422. {
  423. char *e;
  424. Tcpctl *tcb;
  425. tcb = (Tcpctl*)(c->ptcl);
  426. if(tcb->state != Closed)
  427. return Econinuse;
  428. e = Fsstdannounce(c, argv, argc);
  429. if(e != nil)
  430. return e;
  431. tcpstart(c, TCP_LISTEN);
  432. Fsconnected(c, nil);
  433. return nil;
  434. }
  435. /*
  436. * tcpclose is always called with the q locked
  437. */
  438. static void
  439. tcpclose(Conv *c)
  440. {
  441. Tcpctl *tcb;
  442. tcb = (Tcpctl*)c->ptcl;
  443. qhangup(c->rq, nil);
  444. qhangup(c->wq, nil);
  445. qhangup(c->eq, nil);
  446. qflush(c->rq);
  447. switch(tcb->state) {
  448. case Listen:
  449. /*
  450. * reset any incoming calls to this listener
  451. */
  452. Fsconnected(c, "Hangup");
  453. localclose(c, nil);
  454. break;
  455. case Closed:
  456. case Syn_sent:
  457. localclose(c, nil);
  458. break;
  459. case Syn_received:
  460. case Established:
  461. tcb->flgcnt++;
  462. tcb->snd.nxt++;
  463. tcpsetstate(c, Finwait1);
  464. tcpoutput(c);
  465. break;
  466. case Close_wait:
  467. tcb->flgcnt++;
  468. tcb->snd.nxt++;
  469. tcpsetstate(c, Last_ack);
  470. tcpoutput(c);
  471. break;
  472. }
  473. }
  474. void
  475. tcpkick(void *x)
  476. {
  477. Conv *s = x;
  478. Tcpctl *tcb;
  479. tcb = (Tcpctl*)s->ptcl;
  480. if(waserror()){
  481. qunlock(s);
  482. nexterror();
  483. }
  484. qlock(s);
  485. switch(tcb->state) {
  486. case Syn_sent:
  487. case Syn_received:
  488. case Established:
  489. case Close_wait:
  490. /*
  491. * Push data
  492. */
  493. tcprcvwin(s);
  494. tcpoutput(s);
  495. break;
  496. default:
  497. localclose(s, "Hangup");
  498. break;
  499. }
  500. qunlock(s);
  501. poperror();
  502. }
  503. void
  504. tcprcvwin(Conv *s) /* Call with tcb locked */
  505. {
  506. int w;
  507. Tcpctl *tcb;
  508. tcb = (Tcpctl*)s->ptcl;
  509. w = tcb->window - qlen(s->rq);
  510. if(w < 0)
  511. w = 0;
  512. if(w == 0)
  513. netlog(s->p->f, Logtcp, "tcprcvwim: window %lud qlen %d\n", tcb->window, qlen(s->rq));
  514. tcb->rcv.wnd = w;
  515. if(w == 0)
  516. tcb->rcv.blocked = 1;
  517. }
  518. void
  519. tcpacktimer(void *v)
  520. {
  521. Tcpctl *tcb;
  522. Conv *s;
  523. s = v;
  524. tcb = (Tcpctl*)s->ptcl;
  525. if(waserror()){
  526. qunlock(s);
  527. nexterror();
  528. }
  529. qlock(s);
  530. if(tcb->state != Closed){
  531. tcb->flags |= FORCE;
  532. tcprcvwin(s);
  533. tcpoutput(s);
  534. }
  535. qunlock(s);
  536. poperror();
  537. }
  538. static void
  539. tcpcreate(Conv *c)
  540. {
  541. c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
  542. c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
  543. }
  544. static void
  545. timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
  546. {
  547. if(newstate != TcptimerON){
  548. if(t->state == TcptimerON){
  549. /* unchain */
  550. if(priv->timers == t){
  551. priv->timers = t->next;
  552. if(t->prev != nil)
  553. panic("timerstate1");
  554. }
  555. if(t->next)
  556. t->next->prev = t->prev;
  557. if(t->prev)
  558. t->prev->next = t->next;
  559. t->next = t->prev = nil;
  560. }
  561. } else {
  562. if(t->state != TcptimerON){
  563. /* chain */
  564. if(t->prev != nil || t->next != nil)
  565. panic("timerstate2");
  566. t->prev = nil;
  567. t->next = priv->timers;
  568. if(t->next)
  569. t->next->prev = t;
  570. priv->timers = t;
  571. }
  572. }
  573. t->state = newstate;
  574. }
  575. void
  576. tcpackproc(void *a)
  577. {
  578. Tcptimer *t, *tp, *timeo;
  579. Proto *tcp;
  580. Tcppriv *priv;
  581. int loop;
  582. tcp = a;
  583. priv = tcp->priv;
  584. for(;;) {
  585. tsleep(&up->sleep, return0, 0, MSPTICK);
  586. qlock(&priv->tl);
  587. timeo = nil;
  588. loop = 0;
  589. for(t = priv->timers; t != nil; t = tp) {
  590. if(loop++ > 10000)
  591. panic("tcpackproc1");
  592. tp = t->next;
  593. if(t->state == TcptimerON) {
  594. t->count--;
  595. if(t->count == 0) {
  596. timerstate(priv, t, TcptimerDONE);
  597. t->readynext = timeo;
  598. timeo = t;
  599. }
  600. }
  601. }
  602. qunlock(&priv->tl);
  603. loop = 0;
  604. for(t = timeo; t != nil; t = t->readynext) {
  605. if(loop++ > 10000)
  606. panic("tcpackproc2");
  607. if(t->state == TcptimerDONE && t->func != nil && !waserror()){
  608. (*t->func)(t->arg);
  609. poperror();
  610. }
  611. }
  612. limborexmit(tcp);
  613. }
  614. }
  615. void
  616. tcpgo(Tcppriv *priv, Tcptimer *t)
  617. {
  618. if(t == nil || t->start == 0)
  619. return;
  620. qlock(&priv->tl);
  621. t->count = t->start;
  622. timerstate(priv, t, TcptimerON);
  623. qunlock(&priv->tl);
  624. }
  625. void
  626. tcphalt(Tcppriv *priv, Tcptimer *t)
  627. {
  628. if(t == nil)
  629. return;
  630. qlock(&priv->tl);
  631. timerstate(priv, t, TcptimerOFF);
  632. qunlock(&priv->tl);
  633. }
  634. int
  635. backoff(int n)
  636. {
  637. return 1 << n;
  638. }
  639. void
  640. localclose(Conv *s, char *reason) /* called with tcb locked */
  641. {
  642. Tcpctl *tcb;
  643. Reseq *rp,*rp1;
  644. Tcppriv *tpriv;
  645. tpriv = s->p->priv;
  646. tcb = (Tcpctl*)s->ptcl;
  647. iphtrem(&tpriv->ht, s);
  648. tcphalt(tpriv, &tcb->timer);
  649. tcphalt(tpriv, &tcb->rtt_timer);
  650. tcphalt(tpriv, &tcb->acktimer);
  651. tcphalt(tpriv, &tcb->katimer);
  652. /* Flush reassembly queue; nothing more can arrive */
  653. for(rp = tcb->reseq; rp != nil; rp = rp1) {
  654. rp1 = rp->next;
  655. freeblist(rp->bp);
  656. free(rp);
  657. }
  658. tcb->reseq = nil;
  659. if(tcb->state == Syn_sent)
  660. Fsconnected(s, reason);
  661. if(s->state == Announced)
  662. wakeup(&s->listenr);
  663. qhangup(s->rq, reason);
  664. qhangup(s->wq, reason);
  665. tcpsetstate(s, Closed);
  666. }
  667. /* mtu (- TCP + IP hdr len) of 1st hop */
  668. int
  669. tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
  670. {
  671. Ipifc *ifc;
  672. int mtu;
  673. ifc = findipifc(tcp->f, addr, 0);
  674. switch(version){
  675. default:
  676. case V4:
  677. mtu = DEF_MSS;
  678. if(ifc != nil)
  679. mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
  680. break;
  681. case V6:
  682. mtu = DEF_MSS6;
  683. if(ifc != nil)
  684. mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
  685. break;
  686. }
  687. if(ifc != nil){
  688. if(ifc->mbps > 1000)
  689. *scale = HaveWS | 4;
  690. else if(ifc->mbps > 100)
  691. *scale = HaveWS | 3;
  692. else if(ifc->mbps > 10)
  693. *scale = HaveWS | 1;
  694. else
  695. *scale = HaveWS | 0;
  696. } else
  697. *scale = HaveWS | 0;
  698. return mtu;
  699. }
  700. void
  701. inittcpctl(Conv *s, int mode)
  702. {
  703. Tcpctl *tcb;
  704. Tcp4hdr* h4;
  705. Tcp6hdr* h6;
  706. Tcppriv *tpriv;
  707. int mss;
  708. tcb = (Tcpctl*)s->ptcl;
  709. memset(tcb, 0, sizeof(Tcpctl));
  710. tcb->ssthresh = 65535;
  711. tcb->srtt = tcp_irtt<<LOGAGAIN;
  712. tcb->mdev = 0;
  713. /* setup timers */
  714. tcb->timer.start = tcp_irtt / MSPTICK;
  715. tcb->timer.func = tcptimeout;
  716. tcb->timer.arg = s;
  717. tcb->rtt_timer.start = MAX_TIME;
  718. tcb->acktimer.start = TCP_ACK / MSPTICK;
  719. tcb->acktimer.func = tcpacktimer;
  720. tcb->acktimer.arg = s;
  721. tcb->katimer.start = DEF_KAT / MSPTICK;
  722. tcb->katimer.func = tcpkeepalive;
  723. tcb->katimer.arg = s;
  724. mss = DEF_MSS;
  725. /* create a prototype(pseudo) header */
  726. if(mode != TCP_LISTEN){
  727. if(ipcmp(s->laddr, IPnoaddr) == 0)
  728. findlocalip(s->p->f, s->laddr, s->raddr);
  729. switch(s->ipversion){
  730. case V4:
  731. h4 = &tcb->protohdr.tcp4hdr;
  732. memset(h4, 0, sizeof(*h4));
  733. h4->proto = IP_TCPPROTO;
  734. hnputs(h4->tcpsport, s->lport);
  735. hnputs(h4->tcpdport, s->rport);
  736. v6tov4(h4->tcpsrc, s->laddr);
  737. v6tov4(h4->tcpdst, s->raddr);
  738. break;
  739. case V6:
  740. h6 = &tcb->protohdr.tcp6hdr;
  741. memset(h6, 0, sizeof(*h6));
  742. h6->proto = IP_TCPPROTO;
  743. hnputs(h6->tcpsport, s->lport);
  744. hnputs(h6->tcpdport, s->rport);
  745. ipmove(h6->tcpsrc, s->laddr);
  746. ipmove(h6->tcpdst, s->raddr);
  747. mss = DEF_MSS6;
  748. break;
  749. default:
  750. panic("inittcpctl: version %d", s->ipversion);
  751. }
  752. }
  753. tcb->mss = tcb->cwind = mss;
  754. tpriv = s->p->priv;
  755. tpriv->stats[Mss] = tcb->mss;
  756. /* default is no window scaling */
  757. tcb->window = QMAX;
  758. tcb->rcv.wnd = QMAX;
  759. tcb->rcv.scale = 0;
  760. tcb->snd.scale = 0;
  761. qsetlimit(s->rq, QMAX);
  762. }
  763. /*
  764. * called with s qlocked
  765. */
  766. void
  767. tcpstart(Conv *s, int mode)
  768. {
  769. Tcpctl *tcb;
  770. Tcppriv *tpriv;
  771. char kpname[KNAMELEN];
  772. tpriv = s->p->priv;
  773. if(tpriv->ackprocstarted == 0){
  774. qlock(&tpriv->apl);
  775. if(tpriv->ackprocstarted == 0){
  776. sprint(kpname, "#I%dtcpack", s->p->f->dev);
  777. kproc(kpname, tcpackproc, s->p);
  778. tpriv->ackprocstarted = 1;
  779. }
  780. qunlock(&tpriv->apl);
  781. }
  782. tcb = (Tcpctl*)s->ptcl;
  783. inittcpctl(s, mode);
  784. iphtadd(&tpriv->ht, s);
  785. switch(mode) {
  786. case TCP_LISTEN:
  787. tpriv->stats[PassiveOpens]++;
  788. tcb->flags |= CLONE;
  789. tcpsetstate(s, Listen);
  790. break;
  791. case TCP_CONNECT:
  792. tpriv->stats[ActiveOpens]++;
  793. tcb->flags |= ACTIVE;
  794. tcpsndsyn(s, tcb);
  795. tcpsetstate(s, Syn_sent);
  796. tcpoutput(s);
  797. break;
  798. }
  799. }
  800. static char*
  801. tcpflag(ushort flag)
  802. {
  803. static char buf[128];
  804. sprint(buf, "%d", flag>>10); /* Head len */
  805. if(flag & URG)
  806. strcat(buf, " URG");
  807. if(flag & ACK)
  808. strcat(buf, " ACK");
  809. if(flag & PSH)
  810. strcat(buf, " PSH");
  811. if(flag & RST)
  812. strcat(buf, " RST");
  813. if(flag & SYN)
  814. strcat(buf, " SYN");
  815. if(flag & FIN)
  816. strcat(buf, " FIN");
  817. return buf;
  818. }
/*
 * Build an outgoing IPv6 TCP packet from the host-order header tcph.
 *
 *	tcph	host-order TCP header fields to marshal
 *	data	payload block list, or nil for a header-only segment
 *	ph	prototype header: addresses, ports and protocol
 *	tcb	control block; may be nil.  When present it supplies the
 *		window shift (snd.scale) and the nochecksum switch
 *
 * Returns the block holding header plus payload, or nil if room for
 * the header could not be obtained.  MSS and window-scale options are
 * emitted only on SYN segments, padded with NOOPOPT to a multiple of
 * 4 bytes.
 */
Block *
htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
{
	int dlen;
	Tcp6hdr *h;
	ushort csum;
	ushort hdrlen, optpad = 0;
	uchar *opt;

	/* header length: fixed part plus, on SYN only, options and padding */
	hdrlen = TCP6_HDRSIZE;
	if(tcph->flags & SYN){
		if(tcph->mss)
			hdrlen += MSS_LENGTH;
		if(tcph->ws)
			hdrlen += WS_LENGTH;
		optpad = hdrlen & 3;
		if(optpad)
			optpad = 4 - optpad;
		hdrlen += optpad;
	}

	/* make room for the headers in front of the payload, or allocate */
	if(data) {
		dlen = blocklen(data);
		data = padblock(data, hdrlen + TCP6_PKT);
		if(data == nil)
			return nil;
	}
	else {
		dlen = 0;
		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
		if(data == nil)
			return nil;
		data->wp += hdrlen + TCP6_PKT;
	}

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *)(data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/*
	 * compose pseudo tcp header, do cksum calculation:
	 * the length goes temporarily in vcf and the protocol in ttl;
	 * both are put right again once the checksum is done.
	 */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* hdrlen is a multiple of 4, so hdrlen<<10 puts hdrlen/4 in the top 4 bits */
	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
	/* window is shifted right by snd.scale when a control block is given */
	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	if(tcph->flags & SYN){
		opt = h->tcpopt;
		if(tcph->mss != 0){
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
//			print("our outgoing mss %d\n", tcph->mss);
			opt += 2;
		}
		if(tcph->ws != 0){
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		/* pad the options out to the 4-byte boundary computed above */
		while(optpad-- > 0)
			*opt++ = NOOPOPT;
	}

	if(tcb != nil && tcb->nochecksum){
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen+dlen);
	h->proto = ph->proto;

	return data;
}
/*
 * Build an outgoing IPv4 TCP packet from the host-order header tcph.
 *
 *	tcph	host-order TCP header fields to marshal
 *	data	payload block list, or nil for a header-only segment
 *	ph	prototype header: addresses, ports and protocol
 *	tcb	control block; may be nil.  When present it supplies the
 *		window shift (snd.scale) and the nochecksum switch
 *
 * Returns the block holding header plus payload, or nil if room for
 * the header could not be obtained.  MSS and window-scale options are
 * emitted only on SYN segments, padded with NOOPOPT to a multiple of
 * 4 bytes.
 */
Block *
htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
{
	int dlen;
	Tcp4hdr *h;
	ushort csum;
	ushort hdrlen, optpad = 0;
	uchar *opt;

	/* header length: fixed part plus, on SYN only, options and padding */
	hdrlen = TCP4_HDRSIZE;
	if(tcph->flags & SYN){
		if(tcph->mss)
			hdrlen += MSS_LENGTH;
		if(tcph->ws)
			hdrlen += WS_LENGTH;
		optpad = hdrlen & 3;
		if(optpad)
			optpad = 4 - optpad;
		hdrlen += optpad;
	}

	/* make room for the headers in front of the payload, or allocate */
	if(data) {
		dlen = blocklen(data);
		data = padblock(data, hdrlen + TCP4_PKT);
		if(data == nil)
			return nil;
	}
	else {
		dlen = 0;
		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
		if(data == nil)
			return nil;
		data->wp += hdrlen + TCP4_PKT;
	}

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *)(data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* hdrlen is a multiple of 4, so hdrlen<<10 puts hdrlen/4 in the top 4 bits */
	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
	/* window is shifted right by snd.scale when a control block is given */
	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	if(tcph->flags & SYN){
		opt = h->tcpopt;
		if(tcph->mss != 0){
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if(tcph->ws != 0){
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		/* pad the options out to the 4-byte boundary computed above */
		while(optpad-- > 0)
			*opt++ = NOOPOPT;
	}

	if(tcb != nil && tcb->nochecksum){
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	return data;
}
  960. int
  961. ntohtcp6(Tcp *tcph, Block **bpp)
  962. {
  963. Tcp6hdr *h;
  964. uchar *optr;
  965. ushort hdrlen;
  966. ushort optlen;
  967. int n;
  968. *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
  969. if(*bpp == nil)
  970. return -1;
  971. h = (Tcp6hdr *)((*bpp)->rp);
  972. tcph->source = nhgets(h->tcpsport);
  973. tcph->dest = nhgets(h->tcpdport);
  974. tcph->seq = nhgetl(h->tcpseq);
  975. tcph->ack = nhgetl(h->tcpack);
  976. hdrlen = (h->tcpflag[0]>>2) & ~3;
  977. if(hdrlen < TCP6_HDRSIZE) {
  978. freeblist(*bpp);
  979. return -1;
  980. }
  981. tcph->flags = h->tcpflag[1];
  982. tcph->wnd = nhgets(h->tcpwin);
  983. tcph->urg = nhgets(h->tcpurg);
  984. tcph->mss = 0;
  985. tcph->ws = 0;
  986. tcph->len = nhgets(h->ploadlen) - hdrlen;
  987. *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
  988. if(*bpp == nil)
  989. return -1;
  990. optr = h->tcpopt;
  991. n = hdrlen - TCP6_HDRSIZE;
  992. while(n > 0 && *optr != EOLOPT) {
  993. if(*optr == NOOPOPT) {
  994. n--;
  995. optr++;
  996. continue;
  997. }
  998. optlen = optr[1];
  999. if(optlen < 2 || optlen > n)
  1000. break;
  1001. switch(*optr) {
  1002. case MSSOPT:
  1003. if(optlen == MSS_LENGTH)
  1004. tcph->mss = nhgets(optr+2);
  1005. break;
  1006. case WSOPT:
  1007. if(optlen == WS_LENGTH && *(optr+2) <= 14)
  1008. tcph->ws = HaveWS | *(optr+2);
  1009. break;
  1010. }
  1011. n -= optlen;
  1012. optr += optlen;
  1013. }
  1014. return hdrlen;
  1015. }
  1016. int
  1017. ntohtcp4(Tcp *tcph, Block **bpp)
  1018. {
  1019. Tcp4hdr *h;
  1020. uchar *optr;
  1021. ushort hdrlen;
  1022. ushort optlen;
  1023. int n;
  1024. *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
  1025. if(*bpp == nil)
  1026. return -1;
  1027. h = (Tcp4hdr *)((*bpp)->rp);
  1028. tcph->source = nhgets(h->tcpsport);
  1029. tcph->dest = nhgets(h->tcpdport);
  1030. tcph->seq = nhgetl(h->tcpseq);
  1031. tcph->ack = nhgetl(h->tcpack);
  1032. hdrlen = (h->tcpflag[0]>>2) & ~3;
  1033. if(hdrlen < TCP4_HDRSIZE) {
  1034. freeblist(*bpp);
  1035. return -1;
  1036. }
  1037. tcph->flags = h->tcpflag[1];
  1038. tcph->wnd = nhgets(h->tcpwin);
  1039. tcph->urg = nhgets(h->tcpurg);
  1040. tcph->mss = 0;
  1041. tcph->ws = 0;
  1042. tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
  1043. *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
  1044. if(*bpp == nil)
  1045. return -1;
  1046. optr = h->tcpopt;
  1047. n = hdrlen - TCP4_HDRSIZE;
  1048. while(n > 0 && *optr != EOLOPT) {
  1049. if(*optr == NOOPOPT) {
  1050. n--;
  1051. optr++;
  1052. continue;
  1053. }
  1054. optlen = optr[1];
  1055. if(optlen < 2 || optlen > n)
  1056. break;
  1057. switch(*optr) {
  1058. case MSSOPT:
  1059. if(optlen == MSS_LENGTH) {
  1060. tcph->mss = nhgets(optr+2);
  1061. // print("new incoming mss %d\n", tcph->mss);
  1062. }
  1063. break;
  1064. case WSOPT:
  1065. if(optlen == WS_LENGTH && *(optr+2) <= 14)
  1066. tcph->ws = HaveWS | *(optr+2);
  1067. break;
  1068. }
  1069. n -= optlen;
  1070. optr += optlen;
  1071. }
  1072. return hdrlen;
  1073. }
  1074. /*
  1075. * For outgiing calls, generate an initial sequence
  1076. * number and put a SYN on the send queue
  1077. */
  1078. void
  1079. tcpsndsyn(Conv *s, Tcpctl *tcb)
  1080. {
  1081. Tcppriv *tpriv;
  1082. tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
  1083. tcb->rttseq = tcb->iss;
  1084. tcb->snd.wl2 = tcb->iss;
  1085. tcb->snd.una = tcb->iss;
  1086. tcb->snd.ptr = tcb->rttseq;
  1087. tcb->snd.nxt = tcb->rttseq;
  1088. tcb->flgcnt++;
  1089. tcb->flags |= FORCE;
  1090. tcb->sndsyntime = NOW;
  1091. /* set desired mss and scale */
  1092. tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
  1093. tpriv = s->p->priv;
  1094. tpriv->stats[Mss] = tcb->mss;
  1095. }
  1096. void
  1097. sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
  1098. {
  1099. Block *hbp;
  1100. uchar rflags;
  1101. Tcppriv *tpriv;
  1102. Tcp4hdr ph4;
  1103. Tcp6hdr ph6;
  1104. netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
  1105. tpriv = tcp->priv;
  1106. if(seg->flags & RST)
  1107. return;
  1108. /* make pseudo header */
  1109. switch(version) {
  1110. case V4:
  1111. memset(&ph4, 0, sizeof(ph4));
  1112. ph4.vihl = IP_VER4;
  1113. v6tov4(ph4.tcpsrc, dest);
  1114. v6tov4(ph4.tcpdst, source);
  1115. ph4.proto = IP_TCPPROTO;
  1116. hnputs(ph4.tcplen, TCP4_HDRSIZE);
  1117. hnputs(ph4.tcpsport, seg->dest);
  1118. hnputs(ph4.tcpdport, seg->source);
  1119. break;
  1120. case V6:
  1121. memset(&ph6, 0, sizeof(ph6));
  1122. ph6.vcf[0] = IP_VER6;
  1123. ipmove(ph6.tcpsrc, dest);
  1124. ipmove(ph6.tcpdst, source);
  1125. ph6.proto = IP_TCPPROTO;
  1126. hnputs(ph6.ploadlen, TCP6_HDRSIZE);
  1127. hnputs(ph6.tcpsport, seg->dest);
  1128. hnputs(ph6.tcpdport, seg->source);
  1129. break;
  1130. default:
  1131. panic("sndrst: version %d", version);
  1132. }
  1133. tpriv->stats[OutRsts]++;
  1134. rflags = RST;
  1135. /* convince the other end that this reset is in band */
  1136. if(seg->flags & ACK) {
  1137. seg->seq = seg->ack;
  1138. seg->ack = 0;
  1139. }
  1140. else {
  1141. rflags |= ACK;
  1142. seg->ack = seg->seq;
  1143. seg->seq = 0;
  1144. if(seg->flags & SYN)
  1145. seg->ack++;
  1146. seg->ack += length;
  1147. if(seg->flags & FIN)
  1148. seg->ack++;
  1149. }
  1150. seg->flags = rflags;
  1151. seg->wnd = 0;
  1152. seg->urg = 0;
  1153. seg->mss = 0;
  1154. seg->ws = 0;
  1155. switch(version) {
  1156. case V4:
  1157. hbp = htontcp4(seg, nil, &ph4, nil);
  1158. if(hbp == nil)
  1159. return;
  1160. ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
  1161. break;
  1162. case V6:
  1163. hbp = htontcp6(seg, nil, &ph6, nil);
  1164. if(hbp == nil)
  1165. return;
  1166. ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
  1167. break;
  1168. default:
  1169. panic("sndrst2: version %d", version);
  1170. }
  1171. }
  1172. /*
  1173. * send a reset to the remote side and close the conversation
  1174. * called with s qlocked
  1175. */
  1176. char*
  1177. tcphangup(Conv *s)
  1178. {
  1179. Tcp seg;
  1180. Tcpctl *tcb;
  1181. Block *hbp;
  1182. tcb = (Tcpctl*)s->ptcl;
  1183. if(waserror())
  1184. return commonerror();
  1185. if(ipcmp(s->raddr, IPnoaddr) != 0) {
  1186. if(!waserror()){
  1187. seg.flags = RST | ACK;
  1188. seg.ack = tcb->rcv.nxt;
  1189. tcb->rcv.una = 0;
  1190. seg.seq = tcb->snd.ptr;
  1191. seg.wnd = 0;
  1192. seg.urg = 0;
  1193. seg.mss = 0;
  1194. seg.ws = 0;
  1195. switch(s->ipversion) {
  1196. case V4:
  1197. tcb->protohdr.tcp4hdr.vihl = IP_VER4;
  1198. hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
  1199. ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
  1200. break;
  1201. case V6:
  1202. tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
  1203. hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
  1204. ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
  1205. break;
  1206. default:
  1207. panic("tcphangup: version %d", s->ipversion);
  1208. }
  1209. poperror();
  1210. }
  1211. }
  1212. localclose(s, nil);
  1213. poperror();
  1214. return nil;
  1215. }
  1216. /*
  1217. * (re)send a SYN ACK
  1218. */
  1219. int
  1220. sndsynack(Proto *tcp, Limbo *lp)
  1221. {
  1222. Block *hbp;
  1223. Tcp4hdr ph4;
  1224. Tcp6hdr ph6;
  1225. Tcp seg;
  1226. int scale;
  1227. /* make pseudo header */
  1228. switch(lp->version) {
  1229. case V4:
  1230. memset(&ph4, 0, sizeof(ph4));
  1231. ph4.vihl = IP_VER4;
  1232. v6tov4(ph4.tcpsrc, lp->laddr);
  1233. v6tov4(ph4.tcpdst, lp->raddr);
  1234. ph4.proto = IP_TCPPROTO;
  1235. hnputs(ph4.tcplen, TCP4_HDRSIZE);
  1236. hnputs(ph4.tcpsport, lp->lport);
  1237. hnputs(ph4.tcpdport, lp->rport);
  1238. break;
  1239. case V6:
  1240. memset(&ph6, 0, sizeof(ph6));
  1241. ph6.vcf[0] = IP_VER6;
  1242. ipmove(ph6.tcpsrc, lp->laddr);
  1243. ipmove(ph6.tcpdst, lp->raddr);
  1244. ph6.proto = IP_TCPPROTO;
  1245. hnputs(ph6.ploadlen, TCP6_HDRSIZE);
  1246. hnputs(ph6.tcpsport, lp->lport);
  1247. hnputs(ph6.tcpdport, lp->rport);
  1248. break;
  1249. default:
  1250. panic("sndrst: version %d", lp->version);
  1251. }
  1252. seg.seq = lp->iss;
  1253. seg.ack = lp->irs+1;
  1254. seg.flags = SYN|ACK;
  1255. seg.urg = 0;
  1256. seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
  1257. // if (seg.mss > lp->mss && lp->mss >= 512)
  1258. // seg.mss = lp->mss;
  1259. seg.wnd = QMAX;
  1260. /* if the other side set scale, we should too */
  1261. if(lp->rcvscale){
  1262. seg.ws = scale;
  1263. lp->sndscale = scale;
  1264. } else {
  1265. seg.ws = 0;
  1266. lp->sndscale = 0;
  1267. }
  1268. switch(lp->version) {
  1269. case V4:
  1270. hbp = htontcp4(&seg, nil, &ph4, nil);
  1271. if(hbp == nil)
  1272. return -1;
  1273. ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
  1274. break;
  1275. case V6:
  1276. hbp = htontcp6(&seg, nil, &ph6, nil);
  1277. if(hbp == nil)
  1278. return -1;
  1279. ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
  1280. break;
  1281. default:
  1282. panic("sndsnack: version %d", lp->version);
  1283. }
  1284. lp->lastsend = NOW;
  1285. return 0;
  1286. }
  1287. #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
  1288. /*
  1289. * put a call into limbo and respond with a SYN ACK
  1290. *
  1291. * called with proto locked
  1292. */
  1293. static void
  1294. limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
  1295. {
  1296. Limbo *lp, **l;
  1297. Tcppriv *tpriv;
  1298. int h;
  1299. tpriv = s->p->priv;
  1300. h = hashipa(source, seg->source);
  1301. for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
  1302. lp = *l;
  1303. if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
  1304. continue;
  1305. if(ipcmp(lp->raddr, source) != 0)
  1306. continue;
  1307. if(ipcmp(lp->laddr, dest) != 0)
  1308. continue;
  1309. /* each new SYN restarts the retransmits */
  1310. lp->irs = seg->seq;
  1311. break;
  1312. }
  1313. lp = *l;
  1314. if(lp == nil){
  1315. if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
  1316. lp = tpriv->lht[h];
  1317. tpriv->lht[h] = lp->next;
  1318. lp->next = nil;
  1319. } else {
  1320. lp = malloc(sizeof(*lp));
  1321. if(lp == nil)
  1322. return;
  1323. tpriv->nlimbo++;
  1324. }
  1325. *l = lp;
  1326. lp->version = version;
  1327. ipmove(lp->laddr, dest);
  1328. ipmove(lp->raddr, source);
  1329. lp->lport = seg->dest;
  1330. lp->rport = seg->source;
  1331. lp->mss = seg->mss;
  1332. lp->rcvscale = seg->ws;
  1333. lp->irs = seg->seq;
  1334. lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
  1335. }
  1336. if(sndsynack(s->p, lp) < 0){
  1337. *l = lp->next;
  1338. tpriv->nlimbo--;
  1339. free(lp);
  1340. }
  1341. }
  1342. /*
  1343. * resend SYN ACK's once every SYNACK_RXTIMER ms.
  1344. */
  1345. static void
  1346. limborexmit(Proto *tcp)
  1347. {
  1348. Tcppriv *tpriv;
  1349. Limbo **l, *lp;
  1350. int h;
  1351. int seen;
  1352. ulong now;
  1353. tpriv = tcp->priv;
  1354. if(!canqlock(tcp))
  1355. return;
  1356. seen = 0;
  1357. now = NOW;
  1358. for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
  1359. for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
  1360. lp = *l;
  1361. seen++;
  1362. if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
  1363. continue;
  1364. /* time it out after 1 second */
  1365. if(++(lp->rexmits) > 5){
  1366. tpriv->nlimbo--;
  1367. *l = lp->next;
  1368. free(lp);
  1369. continue;
  1370. }
  1371. /* if we're being attacked, don't bother resending SYN ACK's */
  1372. if(tpriv->nlimbo > 100)
  1373. continue;
  1374. if(sndsynack(tcp, lp) < 0){
  1375. tpriv->nlimbo--;
  1376. *l = lp->next;
  1377. free(lp);
  1378. continue;
  1379. }
  1380. l = &lp->next;
  1381. }
  1382. }
  1383. qunlock(tcp);
  1384. }
  1385. /*
  1386. * lookup call in limbo. if found, throw it out.
  1387. *
  1388. * called with proto locked
  1389. */
  1390. static void
  1391. limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
  1392. {
  1393. Limbo *lp, **l;
  1394. int h;
  1395. Tcppriv *tpriv;
  1396. tpriv = s->p->priv;
  1397. /* find a call in limbo */
  1398. h = hashipa(src, segp->source);
  1399. for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
  1400. lp = *l;
  1401. if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
  1402. continue;
  1403. if(ipcmp(lp->laddr, dst) != 0)
  1404. continue;
  1405. if(ipcmp(lp->raddr, src) != 0)
  1406. continue;
  1407. /* RST can only follow the SYN */
  1408. if(segp->seq == lp->irs+1){
  1409. tpriv->nlimbo--;
  1410. *l = lp->next;
  1411. free(lp);
  1412. }
  1413. break;
  1414. }
  1415. }
  1416. /*
  1417. * come here when we finally get an ACK to our SYN-ACK.
  1418. * lookup call in limbo. if found, create a new conversation
  1419. *
  1420. * called with proto locked
  1421. */
  1422. static Conv*
  1423. tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
  1424. {
  1425. Conv *new;
  1426. Tcpctl *tcb;
  1427. Tcppriv *tpriv;
  1428. Tcp4hdr *h4;
  1429. Tcp6hdr *h6;
  1430. Limbo *lp, **l;
  1431. int h;
  1432. /* unless it's just an ack, it can't be someone coming out of limbo */
  1433. if((segp->flags & SYN) || (segp->flags & ACK) == 0)
  1434. return nil;
  1435. tpriv = s->p->priv;
  1436. /* find a call in limbo */
  1437. h = hashipa(src, segp->source);
  1438. for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
  1439. netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
  1440. src, segp->source, lp->raddr, lp->rport,
  1441. dst, segp->dest, lp->laddr, lp->lport,
  1442. version, lp->version
  1443. );
  1444. if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
  1445. continue;
  1446. if(ipcmp(lp->laddr, dst) != 0)
  1447. continue;
  1448. if(ipcmp(lp->raddr, src) != 0)
  1449. continue;
  1450. /* we're assuming no data with the initial SYN */
  1451. if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
  1452. netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
  1453. segp->seq, lp->irs+1, segp->ack, lp->iss+1);
  1454. lp = nil;
  1455. } else {
  1456. tpriv->nlimbo--;
  1457. *l = lp->next;
  1458. }
  1459. break;
  1460. }
  1461. if(lp == nil)
  1462. return nil;
  1463. new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
  1464. if(new == nil)
  1465. return nil;
  1466. memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
  1467. tcb = (Tcpctl*)new->ptcl;
  1468. tcb->flags &= ~CLONE;
  1469. tcb->timer.arg = new;
  1470. tcb->timer.state = TcptimerOFF;
  1471. tcb->acktimer.arg = new;
  1472. tcb->acktimer.state = TcptimerOFF;
  1473. tcb->katimer.arg = new;
  1474. tcb->katimer.state = TcptimerOFF;
  1475. tcb->rtt_timer.arg = new;
  1476. tcb->rtt_timer.state = TcptimerOFF;
  1477. tcb->irs = lp->irs;
  1478. tcb->rcv.nxt = tcb->irs+1;
  1479. tcb->rcv.urg = tcb->rcv.nxt;
  1480. tcb->iss = lp->iss;
  1481. tcb->rttseq = tcb->iss;
  1482. tcb->snd.wl2 = tcb->iss;
  1483. tcb->snd.una = tcb->iss+1;
  1484. tcb->snd.ptr = tcb->iss+1;
  1485. tcb->snd.nxt = tcb->iss+1;
  1486. tcb->flgcnt = 0;
  1487. tcb->flags |= SYNACK;
  1488. /* our sending max segment size cannot be bigger than what he asked for */
  1489. if(lp->mss != 0 && lp->mss < tcb->mss) {
  1490. tcb->mss = lp->mss;
  1491. tpriv->stats[Mss] = tcb->mss;
  1492. }
  1493. /* window scaling */
  1494. tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
  1495. /* the congestion window always starts out as a single segment */
  1496. tcb->snd.wnd = segp->wnd;
  1497. tcb->cwind = tcb->mss;
  1498. /* set initial round trip time */
  1499. tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
  1500. tcpsynackrtt(new);
  1501. free(lp);
  1502. /* set up proto header */
  1503. switch(version){
  1504. case V4:
  1505. h4 = &tcb->protohdr.tcp4hdr;
  1506. memset(h4, 0, sizeof(*h4));
  1507. h4->proto = IP_TCPPROTO;
  1508. hnputs(h4->tcpsport, new->lport);
  1509. hnputs(h4->tcpdport, new->rport);
  1510. v6tov4(h4->tcpsrc, dst);
  1511. v6tov4(h4->tcpdst, src);
  1512. break;
  1513. case V6:
  1514. h6 = &tcb->protohdr.tcp6hdr;
  1515. memset(h6, 0, sizeof(*h6));
  1516. h6->proto = IP_TCPPROTO;
  1517. hnputs(h6->tcpsport, new->lport);
  1518. hnputs(h6->tcpdport, new->rport);
  1519. ipmove(h6->tcpsrc, dst);
  1520. ipmove(h6->tcpdst, src);
  1521. break;
  1522. default:
  1523. panic("tcpincoming: version %d", new->ipversion);
  1524. }
  1525. tcpsetstate(new, Established);
  1526. iphtadd(&tpriv->ht, new);
  1527. return new;
  1528. }
  1529. int
  1530. seq_within(ulong x, ulong low, ulong high)
  1531. {
  1532. if(low <= high){
  1533. if(low <= x && x <= high)
  1534. return 1;
  1535. }
  1536. else {
  1537. if(x >= low || x <= high)
  1538. return 1;
  1539. }
  1540. return 0;
  1541. }
  1542. int
  1543. seq_lt(ulong x, ulong y)
  1544. {
  1545. return (int)(x-y) < 0;
  1546. }
  1547. int
  1548. seq_le(ulong x, ulong y)
  1549. {
  1550. return (int)(x-y) <= 0;
  1551. }
  1552. int
  1553. seq_gt(ulong x, ulong y)
  1554. {
  1555. return (int)(x-y) > 0;
  1556. }
  1557. int
  1558. seq_ge(ulong x, ulong y)
  1559. {
  1560. return (int)(x-y) >= 0;
  1561. }
  1562. /*
  1563. * use the time between the first SYN and it's ack as the
  1564. * initial round trip time
  1565. */
  1566. void
  1567. tcpsynackrtt(Conv *s)
  1568. {
  1569. Tcpctl *tcb;
  1570. int delta;
  1571. Tcppriv *tpriv;
  1572. tcb = (Tcpctl*)s->ptcl;
  1573. tpriv = s->p->priv;
  1574. delta = NOW - tcb->sndsyntime;
  1575. tcb->srtt = delta<<LOGAGAIN;
  1576. tcb->mdev = delta<<LOGDGAIN;
  1577. /* halt round trip timer */
  1578. tcphalt(tpriv, &tcb->rtt_timer);
  1579. }
/*
 * Process the acknowledgement carried by incoming segment seg for
 * conversation s: count duplicate acks and trigger fast retransmit,
 * update the send window, open the congestion window, take a round
 * trip sample, discard acknowledged data from the write queue and
 * restart or stop the retransmit timer.
 */
void
update(Conv *s, Tcp *seg)
{
	int rtt, delta;
	Tcpctl *tcb;
	ulong acked;
	ulong expand;
	Tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl*)s->ptcl;

	/* the ack is beyond snd.nxt: set FORCE and otherwise ignore it */
	if(seq_gt(seg->ack, tcb->snd.nxt)) {
		tcb->flags |= FORCE;
		return;
	}

	/* added by Dong Lin for fast retransmission */
	if(seg->ack == tcb->snd.una
	&& tcb->snd.una != tcb->snd.nxt
	&& seg->len == 0
	&& seg->wnd == tcb->snd.wnd) {

		/* this is a pure ack w/o window update */
		netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %lud advwin %lud\n",
			tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);

		if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
			/*
			 *  tahoe tcp rxt the packet, half ssthresh,
			 *  and set cwnd to one packet
			 *  (presumably done inside tcprxmit; only the
			 *  recovery marks are set here)
			 */
			tcb->snd.recovery = 1;
			tcb->snd.rxt = tcb->snd.nxt;
			netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
			tcprxmit(s);
		} else {
			/* do reno tcp here. */
		}
	}

	/*
	 *  update window: take the advertised window from a newer ack,
	 *  or from the same ack if it opens the window further
	 */
	if(seq_gt(seg->ack, tcb->snd.wl2)
	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	/* nothing new acknowledged */
	if(!seq_gt(seg->ack, tcb->snd.una)){
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
			tcb->backedoff = MAXBACKMS/4;
		}
		return;
	}

	/*
	 *  any positive ack turns off fast rxt,
	 *  (should we do new-reno on partial acks?)
	 */
	if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
		tcb->snd.dupacks = 0;
		tcb->snd.recovery = 0;
	} else
		netlog(s->p->f, Logtcp, "rxt next %lud, cwin %lud\n", seg->ack, tcb->cwind);

	/* Compute the new send window size */
	acked = seg->ack - tcb->snd.una;

	/*
	 *  avoid slow start and timers for SYN acks:
	 *  the first positive ack covers our SYN, which occupies one
	 *  sequence number but no queue data
	 */
	if((tcb->flags & SYNACK) == 0) {
		tcb->flags |= SYNACK;
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/* slow start as long as we're not recovering from lost packets */
	if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
		if(tcb->cwind < tcb->ssthresh) {
			/* below ssthresh: grow by the bytes acked, at most one mss */
			expand = tcb->mss;
			if(acked < expand)
				expand = acked;
		}
		else
			/* at or above ssthresh: grow by mss*mss/cwind */
			expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;

		/* clamp on arithmetic wrap, and never exceed the peer's window */
		if(tcb->cwind + expand < tcb->cwind)
			expand = tcb->snd.wnd - tcb->cwind;
		if(tcb->cwind + expand > tcb->snd.wnd)
			expand = tcb->snd.wnd - tcb->cwind;
		tcb->cwind += expand;
	}

	/* Adjust the timers according to the round trip time */
	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
		tcphalt(tpriv, &tcb->rtt_timer);
		/* no sample is taken while RETRAN is set */
		if((tcb->flags&RETRAN) == 0) {
			tcb->backoff = 0;
			tcb->backedoff = 0;
			/* elapsed ticks = start value less what is left on the timer */
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if(rtt == 0)
				rtt = 1;	/* otherwise all close systems will rexmit in 0 time */
			rtt *= MSPTICK;
			if(tcb->srtt == 0) {
				tcb->srtt = rtt << LOGAGAIN;
				tcb->mdev = rtt << LOGDGAIN;
			} else {
				/* srtt and mdev are kept scaled by LOGAGAIN and LOGDGAIN bits */
				delta = rtt - (tcb->srtt>>LOGAGAIN);
				tcb->srtt += delta;
				if(tcb->srtt <= 0)
					tcb->srtt = 1;

				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
				tcb->mdev += delta;
				if(tcb->mdev <= 0)
					tcb->mdev = 1;
			}
			tcpsettimer(tcb);
		}
	}

done:
	/* if the queue held less than was acked, a flag was acked too */
	if(qdiscard(s->wq, acked) < acked)
		tcb->flgcnt--;

	tcb->snd.una = seg->ack;
	if(seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	/* keep the retransmit timer running only while data is outstanding */
	if(tcb->snd.una != tcb->snd.nxt)
		tcpgo(tpriv, &tcb->timer);
	else
		tcphalt(tpriv, &tcb->timer);

	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
		tcb->snd.ptr = tcb->snd.una;

	tcb->flags &= ~RETRAN;
	tcb->backoff = 0;
	tcb->backedoff = 0;
}
  1709. void
  1710. tcpiput(Proto *tcp, Ipifc*, Block *bp)
  1711. {
  1712. Tcp seg;
  1713. Tcp4hdr *h4;
  1714. Tcp6hdr *h6;
  1715. int hdrlen;
  1716. Tcpctl *tcb;
  1717. ushort length, csum;
  1718. uchar source[IPaddrlen], dest[IPaddrlen];
  1719. Conv *s;
  1720. Fs *f;
  1721. Tcppriv *tpriv;
  1722. uchar version;
  1723. f = tcp->f;
  1724. tpriv = tcp->priv;
  1725. tpriv->stats[InSegs]++;
  1726. h4 = (Tcp4hdr*)(bp->rp);
  1727. h6 = (Tcp6hdr*)(bp->rp);
  1728. if((h4->vihl&0xF0)==IP_VER4) {
  1729. version = V4;
  1730. length = nhgets(h4->length);
  1731. v4tov6(dest, h4->tcpdst);
  1732. v4tov6(source, h4->tcpsrc);
  1733. h4->Unused = 0;
  1734. hnputs(h4->tcplen, length-TCP4_PKT);
  1735. if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
  1736. ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
  1737. tpriv->stats[CsumErrs]++;
  1738. tpriv->stats[InErrs]++;
  1739. netlog(f, Logtcp, "bad tcp proto cksum\n");
  1740. freeblist(bp);
  1741. return;
  1742. }
  1743. hdrlen = ntohtcp4(&seg, &bp);
  1744. if(hdrlen < 0){
  1745. tpriv->stats[HlenErrs]++;
  1746. tpriv->stats[InErrs]++;
  1747. netlog(f, Logtcp, "bad tcp hdr len\n");
  1748. return;
  1749. }
  1750. /* trim the packet to the size claimed by the datagram */
  1751. length -= hdrlen+TCP4_PKT;
  1752. bp = trimblock(bp, hdrlen+TCP4_PKT, length);
  1753. if(bp == nil){
  1754. tpriv->stats[LenErrs]++;
  1755. tpriv->stats[InErrs]++;
  1756. netlog(f, Logtcp, "tcp len < 0 after trim\n");
  1757. return;
  1758. }
  1759. }
  1760. else {
  1761. int ttl = h6->ttl;
  1762. int proto = h6->proto;
  1763. version = V6;
  1764. length = nhgets(h6->ploadlen);
  1765. ipmove(dest, h6->tcpdst);
  1766. ipmove(source, h6->tcpsrc);
  1767. h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
  1768. h6->ttl = proto;
  1769. hnputl(h6->vcf, length);
  1770. if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
  1771. (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
  1772. tpriv->stats[CsumErrs]++;
  1773. tpriv->stats[InErrs]++;
  1774. netlog(f, Logtcp,
  1775. "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
  1776. h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
  1777. freeblist(bp);
  1778. return;
  1779. }
  1780. h6->ttl = ttl;
  1781. h6->proto = proto;
  1782. hnputs(h6->ploadlen, length);
  1783. hdrlen = ntohtcp6(&seg, &bp);
  1784. if(hdrlen < 0){
  1785. tpriv->stats[HlenErrs]++;
  1786. tpriv->stats[InErrs]++;
  1787. netlog(f, Logtcp, "bad tcpv6 hdr len\n");
  1788. return;
  1789. }
  1790. /* trim the packet to the size claimed by the datagram */
  1791. length -= hdrlen;
  1792. bp = trimblock(bp, hdrlen+TCP6_PKT, length);
  1793. if(bp == nil){
  1794. tpriv->stats[LenErrs]++;
  1795. tpriv->stats[InErrs]++;
  1796. netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
  1797. return;
  1798. }
  1799. }
  1800. /* lock protocol while searching for a conversation */
  1801. qlock(tcp);
  1802. /* Look for a matching conversation */
  1803. s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
  1804. if(s == nil){
  1805. netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
  1806. source, seg.source, dest, seg.dest);
  1807. reset:
  1808. qunlock(tcp);
  1809. sndrst(tcp, source, dest, length, &seg, version, "no conversation");
  1810. freeblist(bp);
  1811. return;
  1812. }
  1813. /* if it's a listener, look for the right flags and get a new conv */
  1814. tcb = (Tcpctl*)s->ptcl;
  1815. if(tcb->state == Listen){
  1816. if(seg.flags & RST){
  1817. limborst(s, &seg, source, dest, version);
  1818. qunlock(tcp);
  1819. freeblist(bp);
  1820. return;
  1821. }
  1822. /* if this is a new SYN, put the call into limbo */
  1823. if((seg.flags & SYN) && (seg.flags & ACK) == 0){
  1824. limbo(s, source, dest, &seg, version);
  1825. qunlock(tcp);
  1826. freeblist(bp);
  1827. return;
  1828. }
  1829. /*
  1830. * if there's a matching call in limbo, tcpincoming will
  1831. * return it in state Syn_received
  1832. */
  1833. s = tcpincoming(s, &seg, source, dest, version);
  1834. if(s == nil)
  1835. goto reset;
  1836. }
  1837. /* The rest of the input state machine is run with the control block
  1838. * locked and implements the state machine directly out of the RFC.
  1839. * Out-of-band data is ignored - it was always a bad idea.
  1840. */
  1841. tcb = (Tcpctl*)s->ptcl;
  1842. if(waserror()){
  1843. qunlock(s);
  1844. nexterror();
  1845. }
  1846. qlock(s);
  1847. qunlock(tcp);
  1848. /* fix up window */
  1849. seg.wnd <<= tcb->rcv.scale;
  1850. /* every input packet in puts off the keep alive time out */
  1851. tcpsetkacounter(tcb);
  1852. switch(tcb->state) {
  1853. case Closed:
  1854. sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
  1855. goto raise;
  1856. case Syn_sent:
  1857. if(seg.flags & ACK) {
  1858. if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
  1859. sndrst(tcp, source, dest, length, &seg, version,
  1860. "bad seq in Syn_sent");
  1861. goto raise;
  1862. }
  1863. }
  1864. if(seg.flags & RST) {
  1865. if(seg.flags & ACK)
  1866. localclose(s, Econrefused);
  1867. goto raise;
  1868. }
  1869. if(seg.flags & SYN) {
  1870. procsyn(s, &seg);
  1871. if(seg.flags & ACK){
  1872. update(s, &seg);
  1873. tcpsynackrtt(s);
  1874. tcpsetstate(s, Established);
  1875. tcpsetscale(s, tcb, seg.ws, tcb->scale);
  1876. }
  1877. else {
  1878. tcb->time = NOW;
  1879. tcpsetstate(s, Syn_received); /* DLP - shouldn't this be a reset? */
  1880. }
  1881. if(length != 0 || (seg.flags & FIN))
  1882. break;
  1883. freeblist(bp);
  1884. goto output;
  1885. }
  1886. else
  1887. freeblist(bp);
  1888. qunlock(s);
  1889. poperror();
  1890. return;
  1891. case Syn_received:
  1892. /* doesn't matter if it's the correct ack, we're just trying to set timing */
  1893. if(seg.flags & ACK)
  1894. tcpsynackrtt(s);
  1895. break;
  1896. }
  1897. /*
  1898. * One DOS attack is to open connections to us and then forget about them,
  1899. * thereby tying up a conv at no long term cost to the attacker.
  1900. * This is an attempt to defeat these stateless DOS attacks. See
  1901. * corresponding code in tcpsendka().
  1902. */
  1903. if(tcb->state != Syn_received && (seg.flags & RST) == 0){
  1904. if(tcpporthogdefense
  1905. && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
  1906. print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
  1907. source, seg.source, dest, seg.dest, seg.flags,
  1908. tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
  1909. localclose(s, "stateless hog");
  1910. }
  1911. }
  1912. /* Cut the data to fit the receive window */
  1913. if(tcptrim(tcb, &seg, &bp, &length) == -1) {
  1914. netlog(f, Logtcp, "tcptrim, not accept, seq %lud-%lud win %lud-%lud\n",
  1915. seg.seq, seg.seq + length - 1,
  1916. tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1);
  1917. netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
  1918. update(s, &seg);
  1919. if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
  1920. tcphalt(tpriv, &tcb->rtt_timer);
  1921. tcphalt(tpriv, &tcb->acktimer);
  1922. tcphalt(tpriv, &tcb->katimer);
  1923. tcpsetstate(s, Time_wait);
  1924. tcb->timer.start = MSL2*(1000 / MSPTICK);
  1925. tcpgo(tpriv, &tcb->timer);
  1926. }
  1927. if(!(seg.flags & RST)) {
  1928. tcb->flags |= FORCE;
  1929. goto output;
  1930. }
  1931. qunlock(s);
  1932. poperror();
  1933. return;
  1934. }
  1935. /* Cannot accept so answer with a rst */
  1936. if(length && tcb->state == Closed) {
  1937. sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
  1938. goto raise;
  1939. }
  1940. /* The segment is beyond the current receive pointer so
  1941. * queue the data in the resequence queue
  1942. */
  1943. if(seg.seq != tcb->rcv.nxt)
  1944. if(length != 0 || (seg.flags & (SYN|FIN))) {
  1945. update(s, &seg);
  1946. if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
  1947. print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
  1948. tcb->flags |= FORCE;
  1949. goto output;
  1950. }
  1951. /*
  1952. * keep looping till we've processed this packet plus any
  1953. * adjacent packets in the resequence queue
  1954. */
  1955. for(;;) {
  1956. if(seg.flags & RST) {
  1957. if(tcb->state == Established) {
  1958. tpriv->stats[EstabResets]++;
  1959. if(tcb->rcv.nxt != seg.seq)
  1960. print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
  1961. }
  1962. localclose(s, Econrefused);
  1963. goto raise;
  1964. }
  1965. if((seg.flags&ACK) == 0)
  1966. goto raise;
  1967. switch(tcb->state) {
  1968. case Syn_received:
  1969. if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
  1970. sndrst(tcp, source, dest, length, &seg, version,
  1971. "bad seq in Syn_received");
  1972. goto raise;
  1973. }
  1974. update(s, &seg);
  1975. tcpsetstate(s, Established);
  1976. case Established:
  1977. case Close_wait:
  1978. update(s, &seg);
  1979. break;
  1980. case Finwait1:
  1981. update(s, &seg);
  1982. if(qlen(s->wq)+tcb->flgcnt == 0){
  1983. tcphalt(tpriv, &tcb->rtt_timer);
  1984. tcphalt(tpriv, &tcb->acktimer);
  1985. tcpsetkacounter(tcb);
  1986. tcb->time = NOW;
  1987. tcpsetstate(s, Finwait2);
  1988. tcb->katimer.start = MSL2 * (1000 / MSPTICK);
  1989. tcpgo(tpriv, &tcb->katimer);
  1990. }
  1991. break;
  1992. case Finwait2:
  1993. update(s, &seg);
  1994. break;
  1995. case Closing:
  1996. update(s, &seg);
  1997. if(qlen(s->wq)+tcb->flgcnt == 0) {
  1998. tcphalt(tpriv, &tcb->rtt_timer);
  1999. tcphalt(tpriv, &tcb->acktimer);
  2000. tcphalt(tpriv, &tcb->katimer);
  2001. tcpsetstate(s, Time_wait);
  2002. tcb->timer.start = MSL2*(1000 / MSPTICK);
  2003. tcpgo(tpriv, &tcb->timer);
  2004. }
  2005. break;
  2006. case Last_ack:
  2007. update(s, &seg);
  2008. if(qlen(s->wq)+tcb->flgcnt == 0) {
  2009. localclose(s, nil);
  2010. goto raise;
  2011. }
  2012. case Time_wait:
  2013. tcb->flags |= FORCE;
  2014. if(tcb->timer.state != TcptimerON)
  2015. tcpgo(tpriv, &tcb->timer);
  2016. }
  2017. if((seg.flags&URG) && seg.urg) {
  2018. if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
  2019. tcb->rcv.urg = seg.urg + seg.seq;
  2020. pullblock(&bp, seg.urg);
  2021. }
  2022. }
  2023. else
  2024. if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
  2025. tcb->rcv.urg = tcb->rcv.nxt;
  2026. if(length == 0) {
  2027. if(bp != nil)
  2028. freeblist(bp);
  2029. }
  2030. else {
  2031. switch(tcb->state){
  2032. default:
  2033. /* Ignore segment text */
  2034. if(bp != nil)
  2035. freeblist(bp);
  2036. break;
  2037. case Syn_received:
  2038. case Established:
  2039. case Finwait1:
  2040. /* If we still have some data place on
  2041. * receive queue
  2042. */
  2043. if(bp) {
  2044. bp = packblock(bp);
  2045. if(bp == nil)
  2046. panic("tcp packblock");
  2047. qpassnolim(s->rq, bp);
  2048. bp = nil;
  2049. /*
  2050. * Force an ack every 2 data messages. This is
  2051. * a hack for rob to make his home system run
  2052. * faster.
  2053. *
  2054. * this also keeps the standard TCP congestion
  2055. * control working since it needs an ack every
  2056. * 2 max segs worth. This is not quite that,
  2057. * but under a real stream is equivalent since
  2058. * every packet has a max seg in it.
  2059. */
  2060. if(++(tcb->rcv.una) >= 2)
  2061. tcb->flags |= FORCE;
  2062. }
  2063. tcb->rcv.nxt += length;
  2064. /*
  2065. * update our rcv window
  2066. */
  2067. tcprcvwin(s);
  2068. /*
  2069. * turn on the acktimer if there's something
  2070. * to ack
  2071. */
  2072. if(tcb->acktimer.state != TcptimerON)
  2073. tcpgo(tpriv, &tcb->acktimer);
  2074. break;
  2075. case Finwait2:
  2076. /* no process to read the data, send a reset */
  2077. if(bp != nil)
  2078. freeblist(bp);
  2079. sndrst(tcp, source, dest, length, &seg, version,
  2080. "send to Finwait2");
  2081. qunlock(s);
  2082. poperror();
  2083. return;
  2084. }
  2085. }
  2086. if(seg.flags & FIN) {
  2087. tcb->flags |= FORCE;
  2088. switch(tcb->state) {
  2089. case Syn_received:
  2090. case Established:
  2091. tcb->rcv.nxt++;
  2092. tcpsetstate(s, Close_wait);
  2093. break;
  2094. case Finwait1:
  2095. tcb->rcv.nxt++;
  2096. if(qlen(s->wq)+tcb->flgcnt == 0) {
  2097. tcphalt(tpriv, &tcb->rtt_timer);
  2098. tcphalt(tpriv, &tcb->acktimer);
  2099. tcphalt(tpriv, &tcb->katimer);
  2100. tcpsetstate(s, Time_wait);
  2101. tcb->timer.start = MSL2*(1000/MSPTICK);
  2102. tcpgo(tpriv, &tcb->timer);
  2103. }
  2104. else
  2105. tcpsetstate(s, Closing);
  2106. break;
  2107. case Finwait2:
  2108. tcb->rcv.nxt++;
  2109. tcphalt(tpriv, &tcb->rtt_timer);
  2110. tcphalt(tpriv, &tcb->acktimer);
  2111. tcphalt(tpriv, &tcb->katimer);
  2112. tcpsetstate(s, Time_wait);
  2113. tcb->timer.start = MSL2 * (1000/MSPTICK);
  2114. tcpgo(tpriv, &tcb->timer);
  2115. break;
  2116. case Close_wait:
  2117. case Closing:
  2118. case Last_ack:
  2119. break;
  2120. case Time_wait:
  2121. tcpgo(tpriv, &tcb->timer);
  2122. break;
  2123. }
  2124. }
  2125. /*
  2126. * get next adjacent segment from the resequence queue.
  2127. * dump/trim any overlapping segments
  2128. */
  2129. for(;;) {
  2130. if(tcb->reseq == nil)
  2131. goto output;
  2132. if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
  2133. goto output;
  2134. getreseq(tcb, &seg, &bp, &length);
  2135. if(tcptrim(tcb, &seg, &bp, &length) == 0)
  2136. break;
  2137. }
  2138. }
  2139. output:
  2140. tcpoutput(s);
  2141. qunlock(s);
  2142. poperror();
  2143. return;
  2144. raise:
  2145. qunlock(s);
  2146. poperror();
  2147. freeblist(bp);
  2148. tcpkick(s);
  2149. }
  2150. /*
  2151. * always enters and exits with the s locked. We drop
  2152. * the lock to ipoput the packet so some care has to be
  2153. * taken by callers.
  2154. */
  2155. void
  2156. tcpoutput(Conv *s)
  2157. {
  2158. Tcp seg;
  2159. int msgs;
  2160. Tcpctl *tcb;
  2161. Block *hbp, *bp;
  2162. int sndcnt, n;
  2163. ulong ssize, dsize, usable, sent;
  2164. Fs *f;
  2165. Tcppriv *tpriv;
  2166. uchar version;
  2167. f = s->p->f;
  2168. tpriv = s->p->priv;
  2169. version = s->ipversion;
  2170. for(msgs = 0; msgs < 100; msgs++) {
  2171. tcb = (Tcpctl*)s->ptcl;
  2172. switch(tcb->state) {
  2173. case Listen:
  2174. case Closed:
  2175. case Finwait2:
  2176. return;
  2177. }
  2178. /* force an ack when a window has opened up */
  2179. if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
  2180. tcb->rcv.blocked = 0;
  2181. tcb->flags |= FORCE;
  2182. }
  2183. sndcnt = qlen(s->wq)+tcb->flgcnt;
  2184. sent = tcb->snd.ptr - tcb->snd.una;
  2185. /* Don't send anything else until our SYN has been acked */
  2186. if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
  2187. break;
  2188. /* Compute usable segment based on offered window and limit
  2189. * window probes to one
  2190. */
  2191. if(tcb->snd.wnd == 0){
  2192. if(sent != 0) {
  2193. if((tcb->flags&FORCE) == 0)
  2194. break;
  2195. // tcb->snd.ptr = tcb->snd.una;
  2196. }
  2197. usable = 1;
  2198. }
  2199. else {
  2200. usable = tcb->cwind;
  2201. if(tcb->snd.wnd < usable)
  2202. usable = tcb->snd.wnd;
  2203. usable -= sent;
  2204. }
  2205. ssize = sndcnt-sent;
  2206. if(ssize && usable < 2)
  2207. netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
  2208. tcb->snd.wnd, tcb->cwind);
  2209. if(usable < ssize)
  2210. ssize = usable;
  2211. if(tcb->mss < ssize)
  2212. ssize = tcb->mss;
  2213. dsize = ssize;
  2214. seg.urg = 0;
  2215. if(ssize == 0)
  2216. if((tcb->flags&FORCE) == 0)
  2217. break;
  2218. tcb->flags &= ~FORCE;
  2219. tcprcvwin(s);
  2220. /* By default we will generate an ack */
  2221. tcphalt(tpriv, &tcb->acktimer);
  2222. tcb->rcv.una = 0;
  2223. seg.source = s->lport;
  2224. seg.dest = s->rport;
  2225. seg.flags = ACK;
  2226. seg.mss = 0;
  2227. seg.ws = 0;
  2228. switch(tcb->state){
  2229. case Syn_sent:
  2230. seg.flags = 0;
  2231. if(tcb->snd.ptr == tcb->iss){
  2232. seg.flags |= SYN;
  2233. dsize--;
  2234. seg.mss = tcb->mss;
  2235. seg.ws = tcb->scale;
  2236. }
  2237. break;
  2238. case Syn_received:
  2239. /*
  2240. * don't send any data with a SYN/ACK packet
  2241. * because Linux rejects the packet in its
  2242. * attempt to solve the SYN attack problem
  2243. */
  2244. if(tcb->snd.ptr == tcb->iss){
  2245. seg.flags |= SYN;
  2246. dsize = 0;
  2247. ssize = 1;
  2248. seg.mss = tcb->mss;
  2249. seg.ws = tcb->scale;
  2250. }
  2251. break;
  2252. }
  2253. seg.seq = tcb->snd.ptr;
  2254. seg.ack = tcb->rcv.nxt;
  2255. seg.wnd = tcb->rcv.wnd;
  2256. /* Pull out data to send */
  2257. bp = nil;
  2258. if(dsize != 0) {
  2259. bp = qcopy(s->wq, dsize, sent);
  2260. if(BLEN(bp) != dsize) {
  2261. seg.flags |= FIN;
  2262. dsize--;
  2263. }
  2264. }
  2265. if(sent+dsize == sndcnt)
  2266. seg.flags |= PSH;
  2267. /* keep track of balance of resent data */
  2268. if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
  2269. n = tcb->snd.nxt - tcb->snd.ptr;
  2270. if(ssize < n)
  2271. n = ssize;
  2272. tcb->resent += n;
  2273. netlog(f, Logtcp, "rexmit: %I!%d -> %I!%d ptr %lux nxt %lux\n",
  2274. s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
  2275. tpriv->stats[RetransSegs]++;
  2276. }
  2277. tcb->snd.ptr += ssize;
  2278. /* Pull up the send pointer so we can accept acks
  2279. * for this window
  2280. */
  2281. if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
  2282. tcb->snd.nxt = tcb->snd.ptr;
  2283. /* Build header, link data and compute cksum */
  2284. switch(version){
  2285. case V4:
  2286. tcb->protohdr.tcp4hdr.vihl = IP_VER4;
  2287. hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
  2288. if(hbp == nil) {
  2289. freeblist(bp);
  2290. return;
  2291. }
  2292. break;
  2293. case V6:
  2294. tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
  2295. hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
  2296. if(hbp == nil) {
  2297. freeblist(bp);
  2298. return;
  2299. }
  2300. break;
  2301. default:
  2302. hbp = nil; /* to suppress a warning */
  2303. panic("tcpoutput: version %d", version);
  2304. }
  2305. /* Start the transmission timers if there is new data and we
  2306. * expect acknowledges
  2307. */
  2308. if(ssize != 0){
  2309. if(tcb->timer.state != TcptimerON)
  2310. tcpgo(tpriv, &tcb->timer);
  2311. /* If round trip timer isn't running, start it.
  2312. * measure the longest packet only in case the
  2313. * transmission time dominates RTT
  2314. */
  2315. if(tcb->rtt_timer.state != TcptimerON)
  2316. if(ssize == tcb->mss) {
  2317. tcpgo(tpriv, &tcb->rtt_timer);
  2318. tcb->rttseq = tcb->snd.ptr;
  2319. }
  2320. }
  2321. tpriv->stats[OutSegs]++;
  2322. /* put off the next keep alive */
  2323. tcpgo(tpriv, &tcb->katimer);
  2324. switch(version){
  2325. case V4:
  2326. if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
  2327. /* a negative return means no route */
  2328. localclose(s, "no route");
  2329. }
  2330. break;
  2331. case V6:
  2332. if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
  2333. /* a negative return means no route */
  2334. localclose(s, "no route");
  2335. }
  2336. break;
  2337. default:
  2338. panic("tcpoutput2: version %d", version);
  2339. }
  2340. if((msgs%4) == 1){
  2341. qunlock(s);
  2342. sched();
  2343. qlock(s);
  2344. }
  2345. }
  2346. }
  2347. /*
  2348. * the BSD convention (hack?) for keep alives. resend last uchar acked.
  2349. */
  2350. void
  2351. tcpsendka(Conv *s)
  2352. {
  2353. Tcp seg;
  2354. Tcpctl *tcb;
  2355. Block *hbp,*dbp;
  2356. tcb = (Tcpctl*)s->ptcl;
  2357. dbp = nil;
  2358. seg.urg = 0;
  2359. seg.source = s->lport;
  2360. seg.dest = s->rport;
  2361. seg.flags = ACK|PSH;
  2362. seg.mss = 0;
  2363. seg.ws = 0;
  2364. if(tcpporthogdefense)
  2365. seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
  2366. else
  2367. seg.seq = tcb->snd.una-1;
  2368. seg.ack = tcb->rcv.nxt;
  2369. tcb->rcv.una = 0;
  2370. seg.wnd = tcb->rcv.wnd;
  2371. if(tcb->state == Finwait2){
  2372. seg.flags |= FIN;
  2373. } else {
  2374. dbp = allocb(1);
  2375. dbp->wp++;
  2376. }
  2377. if(isv4(s->raddr)) {
  2378. /* Build header, link data and compute cksum */
  2379. tcb->protohdr.tcp4hdr.vihl = IP_VER4;
  2380. hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
  2381. if(hbp == nil) {
  2382. freeblist(dbp);
  2383. return;
  2384. }
  2385. ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
  2386. }
  2387. else {
  2388. /* Build header, link data and compute cksum */
  2389. tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
  2390. hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
  2391. if(hbp == nil) {
  2392. freeblist(dbp);
  2393. return;
  2394. }
  2395. ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
  2396. }
  2397. }
  2398. /*
  2399. * set connection to time out after 12 minutes
  2400. */
  2401. void
  2402. tcpsetkacounter(Tcpctl *tcb)
  2403. {
  2404. tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
  2405. if(tcb->kacounter < 3)
  2406. tcb->kacounter = 3;
  2407. }
  2408. /*
  2409. * if we've timed out, close the connection
  2410. * otherwise, send a keepalive and restart the timer
  2411. */
  2412. void
  2413. tcpkeepalive(void *v)
  2414. {
  2415. Tcpctl *tcb;
  2416. Conv *s;
  2417. s = v;
  2418. tcb = (Tcpctl*)s->ptcl;
  2419. if(waserror()){
  2420. qunlock(s);
  2421. nexterror();
  2422. }
  2423. qlock(s);
  2424. if(tcb->state != Closed){
  2425. if(--(tcb->kacounter) <= 0) {
  2426. localclose(s, Etimedout);
  2427. } else {
  2428. tcpsendka(s);
  2429. tcpgo(s->p->priv, &tcb->katimer);
  2430. }
  2431. }
  2432. qunlock(s);
  2433. poperror();
  2434. }
  2435. /*
  2436. * start keepalive timer
  2437. */
  2438. char*
  2439. tcpstartka(Conv *s, char **f, int n)
  2440. {
  2441. Tcpctl *tcb;
  2442. int x;
  2443. tcb = (Tcpctl*)s->ptcl;
  2444. if(tcb->state != Established)
  2445. return "connection must be in Establised state";
  2446. if(n > 1){
  2447. x = atoi(f[1]);
  2448. if(x >= MSPTICK)
  2449. tcb->katimer.start = x/MSPTICK;
  2450. }
  2451. tcpsetkacounter(tcb);
  2452. tcpgo(s->p->priv, &tcb->katimer);
  2453. return nil;
  2454. }
  2455. /*
  2456. * turn checksums on/off
  2457. */
  2458. char*
  2459. tcpsetchecksum(Conv *s, char **f, int)
  2460. {
  2461. Tcpctl *tcb;
  2462. tcb = (Tcpctl*)s->ptcl;
  2463. tcb->nochecksum = !atoi(f[1]);
  2464. return nil;
  2465. }
  2466. void
  2467. tcprxmit(Conv *s)
  2468. {
  2469. Tcpctl *tcb;
  2470. tcb = (Tcpctl*)s->ptcl;
  2471. tcb->flags |= RETRAN|FORCE;
  2472. tcb->snd.ptr = tcb->snd.una;
  2473. /*
  2474. * We should be halving the slow start threshhold (down to one
  2475. * mss) but leaving it at mss seems to work well enough
  2476. */
  2477. tcb->ssthresh = tcb->mss;
  2478. /*
  2479. * pull window down to a single packet
  2480. */
  2481. tcb->cwind = tcb->mss;
  2482. tcpoutput(s);
  2483. }
  2484. void
  2485. tcptimeout(void *arg)
  2486. {
  2487. Conv *s;
  2488. Tcpctl *tcb;
  2489. int maxback;
  2490. Tcppriv *tpriv;
  2491. s = (Conv*)arg;
  2492. tpriv = s->p->priv;
  2493. tcb = (Tcpctl*)s->ptcl;
  2494. if(waserror()){
  2495. qunlock(s);
  2496. nexterror();
  2497. }
  2498. qlock(s);
  2499. switch(tcb->state){
  2500. default:
  2501. tcb->backoff++;
  2502. if(tcb->state == Syn_sent)
  2503. maxback = MAXBACKMS/2;
  2504. else
  2505. maxback = MAXBACKMS;
  2506. tcb->backedoff += tcb->timer.start * MSPTICK;
  2507. if(tcb->backedoff >= maxback) {
  2508. localclose(s, Etimedout);
  2509. break;
  2510. }
  2511. netlog(s->p->f, Logtcprxmt, "timeout rexmit %#lux %d/%lud\n", tcb->snd.una, tcb->timer.start, NOW);
  2512. tcpsettimer(tcb);
  2513. tcprxmit(s);
  2514. tpriv->stats[RetransTimeouts]++;
  2515. tcb->snd.dupacks = 0;
  2516. break;
  2517. case Time_wait:
  2518. localclose(s, nil);
  2519. break;
  2520. case Closed:
  2521. break;
  2522. }
  2523. qunlock(s);
  2524. poperror();
  2525. }
  2526. int
  2527. inwindow(Tcpctl *tcb, int seq)
  2528. {
  2529. return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
  2530. }
  2531. /*
  2532. * set up state for a received SYN (or SYN ACK) packet
  2533. */
  2534. void
  2535. procsyn(Conv *s, Tcp *seg)
  2536. {
  2537. Tcpctl *tcb;
  2538. Tcppriv *tpriv;
  2539. tcb = (Tcpctl*)s->ptcl;
  2540. tcb->flags |= FORCE;
  2541. tcb->rcv.nxt = seg->seq + 1;
  2542. tcb->rcv.urg = tcb->rcv.nxt;
  2543. tcb->irs = seg->seq;
  2544. /* our sending max segment size cannot be bigger than what he asked for */
  2545. if(seg->mss != 0 && seg->mss < tcb->mss) {
  2546. tcb->mss = seg->mss;
  2547. tpriv = s->p->priv;
  2548. tpriv->stats[Mss] = tcb->mss;
  2549. }
  2550. /* the congestion window always starts out as a single segment */
  2551. tcb->snd.wnd = seg->wnd;
  2552. tcb->cwind = tcb->mss;
  2553. }
  2554. int
  2555. addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
  2556. {
  2557. Reseq *rp, *rp1;
  2558. int i, rqlen, qmax;
  2559. rp = malloc(sizeof(Reseq));
  2560. if(rp == nil){
  2561. freeblist(bp); /* bp always consumed by add_reseq */
  2562. return 0;
  2563. }
  2564. rp->seg = *seg;
  2565. rp->bp = bp;
  2566. rp->length = length;
  2567. /* Place on reassembly list sorting by starting seq number */
  2568. rp1 = tcb->reseq;
  2569. if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
  2570. rp->next = rp1;
  2571. tcb->reseq = rp;
  2572. if(rp->next != nil)
  2573. tpriv->stats[OutOfOrder]++;
  2574. return 0;
  2575. }
  2576. rqlen = 0;
  2577. for(i = 0;; i++) {
  2578. rqlen += rp1->length;
  2579. if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
  2580. rp->next = rp1->next;
  2581. rp1->next = rp;
  2582. if(rp->next != nil)
  2583. tpriv->stats[OutOfOrder]++;
  2584. break;
  2585. }
  2586. rp1 = rp1->next;
  2587. }
  2588. qmax = QMAX<<tcb->rcv.scale;
  2589. if(rqlen > qmax){
  2590. print("resequence queue > window: %d > %d\n", rqlen, qmax);
  2591. i = 0;
  2592. for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
  2593. print("%#lux %#lux %#ux\n", rp1->seg.seq,
  2594. rp1->seg.ack, rp1->seg.flags);
  2595. if(i++ > 10){
  2596. print("...\n");
  2597. break;
  2598. }
  2599. }
  2600. /*
  2601. * delete entire reassembly queue; wait for retransmit.
  2602. * - should we be smarter and only delete the tail?
  2603. */
  2604. for(rp = tcb->reseq; rp != nil; rp = rp1){
  2605. rp1 = rp->next;
  2606. freeblist(rp->bp);
  2607. free(rp);
  2608. }
  2609. tcb->reseq = nil;
  2610. return -1;
  2611. }
  2612. return 0;
  2613. }
  2614. void
  2615. getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
  2616. {
  2617. Reseq *rp;
  2618. rp = tcb->reseq;
  2619. if(rp == nil)
  2620. return;
  2621. tcb->reseq = rp->next;
  2622. *seg = rp->seg;
  2623. *bp = rp->bp;
  2624. *length = rp->length;
  2625. free(rp);
  2626. }
  2627. int
  2628. tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
  2629. {
  2630. ushort len;
  2631. uchar accept;
  2632. int dupcnt, excess;
  2633. accept = 0;
  2634. len = *length;
  2635. if(seg->flags & SYN)
  2636. len++;
  2637. if(seg->flags & FIN)
  2638. len++;
  2639. if(tcb->rcv.wnd == 0) {
  2640. if(len == 0 && seg->seq == tcb->rcv.nxt)
  2641. return 0;
  2642. }
  2643. else {
  2644. /* Some part of the segment should be in the window */
  2645. if(inwindow(tcb,seg->seq))
  2646. accept++;
  2647. else
  2648. if(len != 0) {
  2649. if(inwindow(tcb, seg->seq+len-1) ||
  2650. seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
  2651. accept++;
  2652. }
  2653. }
  2654. if(!accept) {
  2655. freeblist(*bp);
  2656. return -1;
  2657. }
  2658. dupcnt = tcb->rcv.nxt - seg->seq;
  2659. if(dupcnt > 0){
  2660. tcb->rerecv += dupcnt;
  2661. if(seg->flags & SYN){
  2662. seg->flags &= ~SYN;
  2663. seg->seq++;
  2664. if(seg->urg > 1)
  2665. seg->urg--;
  2666. else
  2667. seg->flags &= ~URG;
  2668. dupcnt--;
  2669. }
  2670. if(dupcnt > 0){
  2671. pullblock(bp, (ushort)dupcnt);
  2672. seg->seq += dupcnt;
  2673. *length -= dupcnt;
  2674. if(seg->urg > dupcnt)
  2675. seg->urg -= dupcnt;
  2676. else {
  2677. seg->flags &= ~URG;
  2678. seg->urg = 0;
  2679. }
  2680. }
  2681. }
  2682. excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
  2683. if(excess > 0) {
  2684. tcb->rerecv += excess;
  2685. *length -= excess;
  2686. *bp = trimblock(*bp, 0, *length);
  2687. if(*bp == nil)
  2688. panic("presotto is a boofhead");
  2689. seg->flags &= ~FIN;
  2690. }
  2691. return 0;
  2692. }
  2693. void
  2694. tcpadvise(Proto *tcp, Block *bp, char *msg)
  2695. {
  2696. Tcp4hdr *h4;
  2697. Tcp6hdr *h6;
  2698. Tcpctl *tcb;
  2699. uchar source[IPaddrlen];
  2700. uchar dest[IPaddrlen];
  2701. ushort psource, pdest;
  2702. Conv *s, **p;
  2703. h4 = (Tcp4hdr*)(bp->rp);
  2704. h6 = (Tcp6hdr*)(bp->rp);
  2705. if((h4->vihl&0xF0)==IP_VER4) {
  2706. v4tov6(dest, h4->tcpdst);
  2707. v4tov6(source, h4->tcpsrc);
  2708. psource = nhgets(h4->tcpsport);
  2709. pdest = nhgets(h4->tcpdport);
  2710. }
  2711. else {
  2712. ipmove(dest, h6->tcpdst);
  2713. ipmove(source, h6->tcpsrc);
  2714. psource = nhgets(h6->tcpsport);
  2715. pdest = nhgets(h6->tcpdport);
  2716. }
  2717. /* Look for a connection */
  2718. qlock(tcp);
  2719. for(p = tcp->conv; *p; p++) {
  2720. s = *p;
  2721. tcb = (Tcpctl*)s->ptcl;
  2722. if(s->rport == pdest)
  2723. if(s->lport == psource)
  2724. if(tcb->state != Closed)
  2725. if(ipcmp(s->raddr, dest) == 0)
  2726. if(ipcmp(s->laddr, source) == 0){
  2727. qlock(s);
  2728. qunlock(tcp);
  2729. switch(tcb->state){
  2730. case Syn_sent:
  2731. localclose(s, msg);
  2732. break;
  2733. }
  2734. qunlock(s);
  2735. freeblist(bp);
  2736. return;
  2737. }
  2738. }
  2739. qunlock(tcp);
  2740. freeblist(bp);
  2741. }
  2742. static char*
  2743. tcpporthogdefensectl(char *val)
  2744. {
  2745. if(strcmp(val, "on") == 0)
  2746. tcpporthogdefense = 1;
  2747. else if(strcmp(val, "off") == 0)
  2748. tcpporthogdefense = 0;
  2749. else
  2750. return "unknown value for tcpporthogdefense";
  2751. return nil;
  2752. }
  2753. /* called with c qlocked */
  2754. char*
  2755. tcpctl(Conv* c, char** f, int n)
  2756. {
  2757. if(n == 1 && strcmp(f[0], "hangup") == 0)
  2758. return tcphangup(c);
  2759. if(n >= 1 && strcmp(f[0], "keepalive") == 0)
  2760. return tcpstartka(c, f, n);
  2761. if(n >= 1 && strcmp(f[0], "checksum") == 0)
  2762. return tcpsetchecksum(c, f, n);
  2763. if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
  2764. return tcpporthogdefensectl(f[1]);
  2765. return "unknown control request";
  2766. }
  2767. int
  2768. tcpstats(Proto *tcp, char *buf, int len)
  2769. {
  2770. Tcppriv *priv;
  2771. char *p, *e;
  2772. int i;
  2773. priv = tcp->priv;
  2774. p = buf;
  2775. e = p+len;
  2776. for(i = 0; i < Nstats; i++)
  2777. p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
  2778. return p - buf;
  2779. }
  2780. /*
  2781. * garbage collect any stale conversations:
  2782. * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
  2783. * - Finwait2 after 5 minutes
  2784. *
  2785. * this is called whenever we run out of channels. Both checks are
  2786. * of questionable validity so we try to use them only when we're
  2787. * up against the wall.
  2788. */
  2789. int
  2790. tcpgc(Proto *tcp)
  2791. {
  2792. Conv *c, **pp, **ep;
  2793. int n;
  2794. Tcpctl *tcb;
  2795. n = 0;
  2796. ep = &tcp->conv[tcp->nc];
  2797. for(pp = tcp->conv; pp < ep; pp++) {
  2798. c = *pp;
  2799. if(c == nil)
  2800. break;
  2801. if(!canqlock(c))
  2802. continue;
  2803. tcb = (Tcpctl*)c->ptcl;
  2804. switch(tcb->state){
  2805. case Syn_received:
  2806. if(NOW - tcb->time > 5000){
  2807. localclose(c, "timed out");
  2808. n++;
  2809. }
  2810. break;
  2811. case Finwait2:
  2812. if(NOW - tcb->time > 5*60*1000){
  2813. localclose(c, "timed out");
  2814. n++;
  2815. }
  2816. break;
  2817. }
  2818. qunlock(c);
  2819. }
  2820. return n;
  2821. }
  2822. void
  2823. tcpsettimer(Tcpctl *tcb)
  2824. {
  2825. int x;
  2826. /* round trip dependency */
  2827. x = backoff(tcb->backoff) *
  2828. (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
  2829. /* bounded twixt 1/2 and 64 seconds */
  2830. if(x < 500/MSPTICK)
  2831. x = 500/MSPTICK;
  2832. else if(x > (64000/MSPTICK))
  2833. x = 64000/MSPTICK;
  2834. tcb->timer.start = x;
  2835. }
  2836. void
  2837. tcpinit(Fs *fs)
  2838. {
  2839. Proto *tcp;
  2840. Tcppriv *tpriv;
  2841. tcp = smalloc(sizeof(Proto));
  2842. tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
  2843. tcp->name = "tcp";
  2844. tcp->connect = tcpconnect;
  2845. tcp->announce = tcpannounce;
  2846. tcp->ctl = tcpctl;
  2847. tcp->state = tcpstate;
  2848. tcp->create = tcpcreate;
  2849. tcp->close = tcpclose;
  2850. tcp->rcv = tcpiput;
  2851. tcp->advise = tcpadvise;
  2852. tcp->stats = tcpstats;
  2853. tcp->inuse = tcpinuse;
  2854. tcp->gc = tcpgc;
  2855. tcp->ipproto = IP_TCPPROTO;
  2856. tcp->nc = scalednconv();
  2857. tcp->ptclsize = sizeof(Tcpctl);
  2858. tpriv->stats[MaxConn] = tcp->nc;
  2859. Fsproto(fs, tcp);
  2860. }
  2861. void
  2862. tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
  2863. {
  2864. if(rcvscale){
  2865. tcb->rcv.scale = rcvscale & 0xff;
  2866. tcb->snd.scale = sndscale & 0xff;
  2867. tcb->window = QMAX<<tcb->snd.scale;
  2868. qsetlimit(s->rq, tcb->window);
  2869. } else {
  2870. tcb->rcv.scale = 0;
  2871. tcb->snd.scale = 0;
  2872. tcb->window = QMAX;
  2873. qsetlimit(s->rq, tcb->window);
  2874. }
  2875. }