url.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085
  1. /*
  2. * This is a URL parser, written to parse "Common Internet Scheme" URL
  3. * syntax as described in RFC1738 and updated by RFC2396. Only absolute URLs
  4. * are supported, using "server-based" naming authorities in the schemes.
  5. * Support for literal IPv6 addresses is included, per RFC2732.
  6. *
   * Current "known" schemes: http, https, ftp, file.
  8. *
  9. * We can do all the parsing operations without Runes since URLs are
  10. * defined to be composed of US-ASCII printable characters.
  11. * See RFC1738, RFC2396.
  12. */
  13. #include <u.h>
  14. #include <libc.h>
  15. #include <ctype.h>
  16. #include <regexp.h>
  17. #include <plumb.h>
  18. #include <thread.h>
  19. #include <fcall.h>
  20. #include <9p.h>
  21. #include "dat.h"
  22. #include "fns.h"
  23. int urldebug;
  24. /* If set, relative paths with leading ".." segments will have them trimmed */
  25. #define RemoveExtraRelDotDots 0
  26. #define ExpandCurrentDocUrls 1
  27. static char*
  28. schemestrtab[] =
  29. {
  30. nil,
  31. "http",
  32. "https",
  33. "ftp",
  34. "file",
  35. };
  36. static int
  37. ischeme(char *s)
  38. {
  39. int i;
  40. for(i=0; i<nelem(schemestrtab); i++)
  41. if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0)
  42. return i;
  43. return USunknown;
  44. }
  45. /*
  46. * URI splitting regexp is from RFC2396, Appendix B:
   *   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
   *    12            3  4          5       6  7        8 9
  49. *
  50. * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
  51. * $2 = scheme "http"
  52. * $4 = authority "www.ics.uci.edu"
  53. * $5 = path "/pub/ietf/uri/"
  54. * $7 = query <undefined>
  55. * $9 = fragment "Related"
  56. */
  57. /*
  58. * RFC2396, Sec 3.1, contains:
  59. *
  60. * Scheme names consist of a sequence of characters beginning with a
  61. * lower case letter and followed by any combination of lower case
  62. * letters, digits, plus ("+"), period ("."), or hyphen ("-"). For
  63. * resiliency, programs interpreting URI should treat upper case letters
  64. * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
  65. * well as "http").
  66. */
  67. /*
  68. * For server-based naming authorities (RFC2396 Sec 3.2.2):
  69. * server = [ [ userinfo "@" ] hostport ]
  70. * userinfo = *( unreserved | escaped |
  71. * ";" | ":" | "&" | "=" | "+" | "$" | "," )
  72. * hostport = host [ ":" port ]
  73. * host = hostname | IPv4address
  74. * hostname = *( domainlabel "." ) toplabel [ "." ]
  75. * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
  76. * toplabel = alpha | alpha *( alphanum | "-" ) alphanum
  77. * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit
  78. * port = *digit
  79. *
  80. * The host is a domain name of a network host, or its IPv4 address as a
  81. * set of four decimal digit groups separated by ".". Literal IPv6
  82. * addresses are not supported.
  83. *
  84. * Note that literal IPv6 address support is outlined in RFC2732:
  85. * host = hostname | IPv4address | IPv6reference
  86. * ipv6reference = "[" IPv6address "]" (RFC2373)
  87. *
  88. * Since hostnames and numbers will have to be resolved by the OS anyway,
  89. * we don't have to parse them too pedantically (counting '.'s, checking
  90. * for well-formed literal IP addresses, etc.).
  91. *
  92. * In FTP/file paths, we reject most ";param"s and querys. In HTTP paths,
  93. * we just pass them through.
  94. *
  95. * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests,
  96. * we'll say it's 1-or-more characters, 0-or-1 times. This way, an absent
  97. * path yields a nil substring match, instead of an empty one.
  98. *
  99. * We're more restrictive than RFC2396 indicates with "userinfo" strings,
  100. * insisting they have the form "[user[:password]]". This may need to
  101. * change at some point, however.
  102. */
  /*
   * RE character-class components -- these go in brackets.
   * Note that PUNCT contains a literal '(' which countleftparen()
   * also counts, so Retab.size can exceed the real group count.
   */
  #define PUNCT   "\\-_.!~*'()"
  #define RES     ";/?:@&=+$,"
  #define ALNUM   "a-zA-Z0-9"
  #define HEX     "0-9a-fA-F"
  #define UNRES   ALNUM PUNCT

  /* RE components; _N => has N parenthesized subexpressions when expanded */
  #define ESCAPED_1       "(%[" HEX "][" HEX "])"
  #define URIC_2          "([" RES UNRES "]|" ESCAPED_1 ")"
  #define URICNOSLASH_2   "([" UNRES ";?:@&=+$,]|" ESCAPED_1 ")"
  #define USERINFO_2      "([" UNRES ";:&=+$,]|" ESCAPED_1 ")"
  #define PCHAR_2         "([" UNRES ":@&=+$,]|" ESCAPED_1 ")"
  #define PSEGCHAR_3      "([/;]|" PCHAR_2 ")"
  116. typedef struct Retab Retab;
  117. struct Retab
  118. {
  119. char *str;
  120. Reprog *prog;
  121. int size;
  122. int ind[5];
  123. };
  /* Indices into retab[], plus the size of the on-stack match arrays. */
  enum
  {
      REsplit = 0,
      REscheme,
      REunknowndata,
      REauthority,
      REhost,
      REuserinfo,
      REabspath,
      REquery,
      REfragment,
      REhttppath,
      REftppath,
      REfilepath,

      MaxResub = 20,    /* initurl() aborts if any Retab.size exceeds this */
  };
  140. Retab retab[] = /* view in constant width Font */
  141. {
  142. [REsplit]
  143. "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,
  144. /* |-scheme-| |-auth.-| |path--| |query| |--|frag */
  145. { 2, 4, 5, 7, 9},
  146. [REscheme]
  147. "^[a-z][a-z0-9+-.]*$", nil, 0,
  148. { 0, },
  149. [REunknowndata]
  150. "^" URICNOSLASH_2 URIC_2 "*$", nil, 0,
  151. { 0, },
  152. [REauthority]
  153. "^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
  154. /* |----user info-----| |--------host----------------| |-port-| */
  155. { 2, 7, 12, },
  156. [REhost]
  157. "^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,
  158. /* |--regular host--| |-IPv6 literal-| */
  159. { 2, 4, },
  160. [REuserinfo]
  161. "^(([^:]*)(:([^:]*))?)$", nil, 0,
  162. /* |user-| |pass-| */
  163. { 2, 4, },
  164. [REabspath]
  165. "^/" PSEGCHAR_3 "*$", nil, 0,
  166. { 0, },
  167. [REquery]
  168. "^" URIC_2 "*$", nil, 0,
  169. { 0, },
  170. [REfragment]
  171. "^" URIC_2 "*$", nil, 0,
  172. { 0, },
  173. [REhttppath]
  174. "^.*$", nil, 0,
  175. { 0, },
  176. [REftppath]
  177. "^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
  178. /*|--|-path |ftptype-| */
  179. { 1, 3, },
  180. [REfilepath]
  181. "^.*$", nil, 0,
  182. { 0, },
  183. };
  /*
   * Count the '(' characters in s.  Every '(' is counted, including
   * ones that appear inside a bracketed character class.
   */
  static int
  countleftparen(char *s)
  {
      int count;
      char *p;

      count = 0;
      p = s;
      while(*p != '\0'){
          if(*p == '(')
              count++;
          p++;
      }
      return count;
  }
  194. void
  195. initurl(void)
  196. {
  197. int i, j;
  198. for(i=0; i<nelem(retab); i++){
  199. retab[i].prog = regcomp(retab[i].str);
  200. if(retab[i].prog == nil)
  201. sysfatal("recomp(%s): %r", retab[i].str);
  202. retab[i].size = countleftparen(retab[i].str)+1;
  203. for(j=0; j<nelem(retab[i].ind); j++)
  204. if(retab[i].ind[j] >= retab[i].size)
  205. sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",
  206. i, j, retab[i].ind[j], retab[i].size);
  207. if(MaxResub < retab[i].size)
  208. sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);
  209. }
  210. }
  /*
   * The pieces of a URL as found by spliturl().  Each member brackets
   * a substring of the URL text: s is its first byte and e is one past
   * its last.  s is nil for a component that did not match.
   */
  typedef struct SplitUrl SplitUrl;
  struct SplitUrl
  {
      struct {
          char *s;
          char *e;
      } url, scheme, authority, path, query, fragment;
  };
  219. /*
  220. * Implements the algorithm in RFC2396 sec 5.2 step 6.
  221. * Returns number of chars written, excluding NUL terminator.
  222. * dest is known to be >= strlen(base)+rel_len.
  223. */
  224. static void
  225. merge_relative_path(char *base, char *rel_st, int rel_len, char *dest)
  226. {
  227. char *s, *p, *e, *pdest;
  228. pdest = dest;
  229. /* 6a: start with base, discard last segment */
  230. if(base){
  231. /* Empty paths don't match in our scheme; 'base' should be nil */
  232. assert(base[0] == '/');
  233. e = strrchr(base, '/');
  234. e++;
  235. memmove(pdest, base, e-base);
  236. pdest += e-base;
  237. }else{
  238. /* Artistic license on my part */
  239. *pdest++ = '/';
  240. }
  241. /* 6b: append relative component */
  242. if(rel_st){
  243. memmove(pdest, rel_st, rel_len);
  244. pdest += rel_len;
  245. }
  246. /* 6c: remove any occurrences of "./" as a complete segment */
  247. s = dest;
  248. *pdest = '\0';
  249. while(e = strstr(s, "./")){
  250. if((e == dest) || (*(e-1) == '/')){
  251. memmove(e, e+2, pdest+1-(e+2)); /* +1 for NUL */
  252. pdest -= 2;
  253. }else
  254. s = e+1;
  255. }
  256. /* 6d: remove a trailing "." as a complete segment */
  257. if(pdest>dest && *(pdest-1)=='.' &&
  258. (pdest==dest+1 || *(pdest-2)=='/'))
  259. *--pdest = '\0';
  260. /* 6e: remove occurences of "seg/../", where seg != "..", left->right */
  261. s = dest+1;
  262. while(e = strstr(s, "/../")){
  263. p = e - 1;
  264. while(p >= dest && *p != '/')
  265. p--;
  266. if(memcmp(p, "/../", 4) != 0){
  267. memmove(p+1, e+4, pdest+1-(e+4));
  268. pdest -= (e+4) - (p+1);
  269. }else
  270. s = e+1;
  271. }
  272. /* 6f: remove a trailing "seg/..", where seg isn't ".." */
  273. if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){
  274. p = pdest-3 - 1;
  275. while(p >= dest && *p != '/')
  276. p--;
  277. if(memcmp(p, "/../", 4) != 0){
  278. pdest = p+1;
  279. *pdest = '\0';
  280. }
  281. }
  282. /* 6g: leading ".." segments are errors -- we'll just blat them out. */
  283. if(RemoveExtraRelDotDots){
  284. p = dest;
  285. if (p[0] == '/')
  286. p++;
  287. s = p;
  288. while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))
  289. s += 3;
  290. if(s > p){
  291. memmove(p, s, pdest+1-s);
  292. pdest -= s-p;
  293. }
  294. }
  295. USED(pdest);
  296. if(urldebug)
  297. fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len,
  298. rel_st, dest);
  299. }
  300. /*
  301. * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form.
  302. *
  303. * If successful, this just ends up freeing and replacing "u->url".
  304. */
  305. static int
  306. resolve_relative(SplitUrl *su, Url *base, Url *u)
  307. {
  308. char *url, *path;
  309. char *purl, *ppath;
  310. int currentdoc, ulen, plen;
  311. if(base == nil){
  312. werrstr("relative URI given without base");
  313. return -1;
  314. }
  315. if(base->scheme == nil){
  316. werrstr("relative URI given with no scheme");
  317. return -1;
  318. }
  319. if(base->ischeme == USunknown){
  320. werrstr("relative URI given with unknown scheme");
  321. return -1;
  322. }
  323. if(base->ischeme == UScurrent){
  324. werrstr("relative URI given with incomplete base");
  325. return -1;
  326. }
  327. assert(su->scheme.s == nil);
  328. /* Sec 5.2 step 2 */
  329. currentdoc = 0;
  330. if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){
  331. /* Reference is to current document */
  332. if(urldebug)
  333. fprint(2, "url %s is relative to current document\n", u->url);
  334. u->ischeme = UScurrent;
  335. if(!ExpandCurrentDocUrls)
  336. return 0;
  337. currentdoc = 1;
  338. }
  339. /* Over-estimate the maximum lengths, for allocation purposes */
  340. /* (constants are for separators) */
  341. plen = 1;
  342. if(base->path)
  343. plen += strlen(base->path);
  344. if(su->path.s)
  345. plen += 1 + (su->path.e - su->path.s);
  346. ulen = 0;
  347. ulen += strlen(base->scheme) + 1;
  348. if(su->authority.s)
  349. ulen += 2 + (su->authority.e - su->authority.s);
  350. else
  351. ulen += 2 + ((base->authority) ? strlen(base->authority) : 0);
  352. ulen += plen;
  353. if(su->query.s)
  354. ulen += 1 + (su->query.e - su->query.s);
  355. else if(currentdoc && base->query)
  356. ulen += 1 + strlen(base->query);
  357. if(su->fragment.s)
  358. ulen += 1 + (su->fragment.e - su->fragment.s);
  359. else if(currentdoc && base->fragment)
  360. ulen += 1 + strlen(base->fragment);
  361. url = emalloc(ulen+1);
  362. path = emalloc(plen+1);
  363. url[0] = '\0';
  364. purl = url;
  365. path[0] = '\0';
  366. ppath = path;
  367. if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){
  368. /* Is a "network-path" or "absolute-path"; don't merge with base path */
  369. /* Sec 5.2 steps 4,5 */
  370. if(su->path.s){
  371. memmove(ppath, su->path.s, su->path.e - su->path.s);
  372. ppath += su->path.e - su->path.s;
  373. *ppath = '\0';
  374. }
  375. }else if(currentdoc){
  376. /* Is a current-doc reference; just copy the path from the base URL */
  377. if(base->path){
  378. strcpy(ppath, base->path);
  379. ppath += strlen(ppath);
  380. }
  381. USED(ppath);
  382. }else{
  383. /* Is a relative-path reference; we have to merge it */
  384. /* Sec 5.2 step 6 */
  385. merge_relative_path(base->path,
  386. su->path.s, su->path.e - su->path.s, ppath);
  387. }
  388. /* Build new URL from pieces, inheriting from base where needed */
  389. strcpy(purl, base->scheme);
  390. purl += strlen(purl);
  391. *purl++ = ':';
  392. if(su->authority.s){
  393. strcpy(purl, "//");
  394. purl += strlen(purl);
  395. memmove(purl, su->authority.s, su->authority.e - su->authority.s);
  396. purl += su->authority.e - su->authority.s;
  397. }else if(base->authority){
  398. strcpy(purl, "//");
  399. purl += strlen(purl);
  400. strcpy(purl, base->authority);
  401. purl += strlen(purl);
  402. }
  403. assert((path[0] == '\0') || (path[0] == '/'));
  404. strcpy(purl, path);
  405. purl += strlen(purl);
  406. /*
  407. * The query and fragment are not inherited from the base,
  408. * except in case of "current document" URLs, which inherit any query
  409. * and may inherit the fragment.
  410. */
  411. if(su->query.s){
  412. *purl++ = '?';
  413. memmove(purl, su->query.s, su->query.e - su->query.s);
  414. purl += su->query.e - su->query.s;
  415. }else if(currentdoc && base->query){
  416. *purl++ = '?';
  417. strcpy(purl, base->query);
  418. purl += strlen(purl);
  419. }
  420. if(su->fragment.s){
  421. *purl++ = '#';
  422. memmove(purl, su->query.s, su->query.e - su->query.s);
  423. purl += su->fragment.e - su->fragment.s;
  424. }else if(currentdoc && base->fragment){
  425. *purl++ = '#';
  426. strcpy(purl, base->fragment);
  427. purl += strlen(purl);
  428. }
  429. USED(purl);
  430. if(urldebug)
  431. fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);
  432. free(u->url);
  433. u->url = url;
  434. free(path);
  435. return 0;
  436. }
  437. int
  438. regx(Reprog *prog, char *s, Resub *m, int nm)
  439. {
  440. int i;
  441. if(s == nil)
  442. s = m[0].sp; /* why is this necessary? */
  443. i = regexec(prog, s, m, nm);
  444. /*
  445. if(i >= 0)
  446. for(j=0; j<nm; j++)
  447. fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);
  448. */
  449. return i;
  450. }
  451. static int
  452. ismatch(int i, char *s, char *desc)
  453. {
  454. Resub m[1];
  455. m[0].sp = m[0].ep = nil;
  456. if(!regx(retab[i].prog, s, m, 1)){
  457. werrstr("malformed %s: %q", desc, s);
  458. return 0;
  459. }
  460. return 1;
  461. }
  462. static int
  463. spliturl(char *url, SplitUrl *su)
  464. {
  465. Resub m[MaxResub];
  466. Retab *t;
  467. /*
  468. * Newlines are not valid in a URI, but regexp(2) treats them specially
  469. * so it's best to make sure there are none before proceeding.
  470. */
  471. if(strchr(url, '\n')){
  472. werrstr("newline in URI");
  473. return -1;
  474. }
  475. /*
  476. * Because we use NUL-terminated strings, as do many client and server
  477. * implementations, an escaped NUL ("%00") will quite likely cause problems
  478. * when unescaped. We can check for such a sequence once before examining
  479. * the components because, per RFC2396 sec. 2.4.1 - 2.4.2, '%' is reserved
  480. * in URIs to _always_ indicate escape sequences. Something like "%2500"
  481. * will still get by, but that's legitimate, and if it ends up causing
  482. * a NUL then someone is unescaping too many times.
  483. */
  484. if(strstr(url, "%00")){
  485. werrstr("escaped NUL in URI");
  486. return -1;
  487. }
  488. m[0].sp = m[0].ep = nil;
  489. t = &retab[REsplit];
  490. if(!regx(t->prog, url, m, t->size)){
  491. werrstr("malformed URI: %q", url);
  492. return -1;
  493. }
  494. su->url.s = m[0].sp;
  495. su->url.e = m[0].ep;
  496. su->scheme.s = m[t->ind[0]].sp;
  497. su->scheme.e = m[t->ind[0]].ep;
  498. su->authority.s = m[t->ind[1]].sp;
  499. su->authority.e = m[t->ind[1]].ep;
  500. su->path.s = m[t->ind[2]].sp;
  501. su->path.e = m[t->ind[2]].ep;
  502. su->query.s = m[t->ind[3]].sp;
  503. su->query.e = m[t->ind[3]].ep;
  504. su->fragment.s = m[t->ind[4]].sp;
  505. su->fragment.e = m[t->ind[4]].ep;
  506. if(urldebug)
  507. fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n",
  508. url,
  509. su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "",
  510. su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "",
  511. su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "",
  512. su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "",
  513. su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "",
  514. su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : "");
  515. return 0;
  516. }
  517. static int
  518. parse_scheme(SplitUrl *su, Url *u)
  519. {
  520. if(su->scheme.s == nil){
  521. werrstr("missing scheme");
  522. return -1;
  523. }
  524. u->scheme = estredup(su->scheme.s, su->scheme.e);
  525. strlower(u->scheme);
  526. if(!ismatch(REscheme, u->scheme, "scheme"))
  527. return -1;
  528. u->ischeme = ischeme(u->scheme);
  529. if(urldebug)
  530. fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme);
  531. return 0;
  532. }
  533. static int
  534. parse_unknown_part(SplitUrl *su, Url *u)
  535. {
  536. char *s, *e;
  537. assert(u->ischeme == USunknown);
  538. assert(su->scheme.e[0] == ':');
  539. s = su->scheme.e+1;
  540. if(su->fragment.s){
  541. e = su->fragment.s-1;
  542. assert(*e == '#');
  543. }else
  544. e = s+strlen(s);
  545. u->schemedata = estredup(s, e);
  546. if(!ismatch(REunknowndata, u->schemedata, "unknown scheme data"))
  547. return -1;
  548. return 0;
  549. }
  550. static int
  551. parse_userinfo(char *s, char *e, Url *u)
  552. {
  553. Resub m[MaxResub];
  554. Retab *t;
  555. m[0].sp = s;
  556. m[0].ep = e;
  557. t = &retab[REuserinfo];
  558. if(!regx(t->prog, nil, m, t->size)){
  559. werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s);
  560. return -1;
  561. }
  562. if(m[t->ind[0]].sp)
  563. u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
  564. if(m[t->ind[1]].sp)
  565. u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
  566. return 0;
  567. }
  568. static int
  569. parse_host(char *s, char *e, Url *u)
  570. {
  571. Resub m[MaxResub];
  572. Retab *t;
  573. m[0].sp = s;
  574. m[0].ep = e;
  575. t = &retab[REhost];
  576. if(!regx(t->prog, nil, m, t->size)){
  577. werrstr("malformed host: %.*q", utfnlen(s, e-s), s);
  578. return -1;
  579. }
  580. assert(m[t->ind[0]].sp || m[t->ind[1]].sp);
  581. if(m[t->ind[0]].sp) /* regular */
  582. u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
  583. else
  584. u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
  585. return 0;
  586. }
  587. static int
  588. parse_authority(SplitUrl *su, Url *u)
  589. {
  590. Resub m[MaxResub];
  591. Retab *t;
  592. if(su->authority.s == nil)
  593. return 0;
  594. u->authority = estredup(su->authority.s, su->authority.e);
  595. m[0].sp = m[0].ep = nil;
  596. t = &retab[REauthority];
  597. if(!regx(t->prog, u->authority, m, t->size)){
  598. werrstr("malformed authority: %q", u->authority);
  599. return -1;
  600. }
  601. if(m[t->ind[0]].sp)
  602. if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0)
  603. return -1;
  604. if(m[t->ind[1]].sp)
  605. if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0)
  606. return -1;
  607. if(m[t->ind[2]].sp)
  608. u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep);
  609. return 0;
  610. }
  611. static int
  612. parse_abspath(SplitUrl *su, Url *u)
  613. {
  614. if(su->path.s == nil)
  615. return 0;
  616. u->path = estredup(su->path.s, su->path.e);
  617. if(!ismatch(REabspath, u->path, "absolute path"))
  618. return -1;
  619. return 0;
  620. }
  621. static int
  622. parse_query(SplitUrl *su, Url *u)
  623. {
  624. if(su->query.s == nil)
  625. return 0;
  626. u->query = estredup(su->query.s, su->query.e);
  627. if(!ismatch(REquery, u->query, "query"))
  628. return -1;
  629. return 0;
  630. }
  631. static int
  632. parse_fragment(SplitUrl *su, Url *u)
  633. {
  634. if(su->fragment.s == nil)
  635. return 0;
  636. u->fragment = estredup(su->fragment.s, su->fragment.e);
  637. if(!ismatch(REfragment, u->fragment, "fragment"))
  638. return -1;
  639. return 0;
  640. }
  641. static int
  642. postparse_http(Url *u)
  643. {
  644. u->open = httpopen;
  645. u->read = httpread;
  646. u->close = httpclose;
  647. if(u->authority==nil){
  648. werrstr("missing authority (hostname, port, etc.)");
  649. return -1;
  650. }
  651. if(u->user || u->passwd){
  652. werrstr("user information not valid with http");
  653. return -1;
  654. }
  655. if(u->host == nil){
  656. werrstr("missing host specification");
  657. return -1;
  658. }
  659. if(u->path == nil){
  660. u->http.page_spec = estrdup("/");
  661. return 0;
  662. }
  663. if(!ismatch(REhttppath, u->path, "http path"))
  664. return -1;
  665. if(u->query){
  666. u->http.page_spec = emalloc(strlen(u->path)+1+strlen(u->query)+1);
  667. strcpy(u->http.page_spec, u->path);
  668. strcat(u->http.page_spec, "?");
  669. strcat(u->http.page_spec, u->query);
  670. }else
  671. u->http.page_spec = estrdup(u->path);
  672. return 0;
  673. }
  674. static int
  675. postparse_ftp(Url *u)
  676. {
  677. Resub m[MaxResub];
  678. Retab *t;
  679. if(u->authority==nil){
  680. werrstr("missing authority (hostname, port, etc.)");
  681. return -1;
  682. }
  683. if(u->query){
  684. werrstr("unexpected \"?query\" in ftp path");
  685. return -1;
  686. }
  687. if(u->host == nil){
  688. werrstr("missing host specification");
  689. return -1;
  690. }
  691. if(u->path == nil){
  692. u->ftp.path_spec = estrdup("/");
  693. return 0;
  694. }
  695. m[0].sp = m[0].ep = nil;
  696. t = &retab[REftppath];
  697. if(!regx(t->prog, u->path, m, t->size)){
  698. werrstr("malformed ftp path: %q", u->path);
  699. return -1;
  700. }
  701. if(m[t->ind[0]].sp){
  702. u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
  703. if(strchr(u->ftp.path_spec, ';')){
  704. werrstr("unexpected \";param\" in ftp path");
  705. return -1;
  706. }
  707. }else
  708. u->ftp.path_spec = estrdup("/");
  709. if(m[t->ind[1]].sp){
  710. u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
  711. strlower(u->ftp.type);
  712. }
  713. return 0;
  714. }
  715. static int
  716. postparse_file(Url *u)
  717. {
  718. if(u->user || u->passwd){
  719. werrstr("user information not valid with file scheme");
  720. return -1;
  721. }
  722. if(u->query){
  723. werrstr("unexpected \"?query\" in file path");
  724. return -1;
  725. }
  726. if(u->port){
  727. werrstr("port not valid with file scheme");
  728. return -1;
  729. }
  730. if(u->path == nil){
  731. werrstr("missing path in file scheme");
  732. return -1;
  733. }
  734. if(strchr(u->path, ';')){
  735. werrstr("unexpected \";param\" in file path");
  736. return -1;
  737. }
  738. if(!ismatch(REfilepath, u->path, "file path"))
  739. return -1;
  740. /* "localhost" is equivalent to no host spec, we'll chose the latter */
  741. if(u->host && cistrcmp(u->host, "localhost") == 0){
  742. free(u->host);
  743. u->host = nil;
  744. }
  745. return 0;
  746. }
  747. static int (*postparse[])(Url*) = {
  748. nil,
  749. postparse_http,
  750. postparse_http,
  751. postparse_ftp,
  752. postparse_file,
  753. };
  754. Url*
  755. parseurl(char *url, Url *base)
  756. {
  757. Url *u;
  758. SplitUrl su;
  759. if(urldebug)
  760. fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>");
  761. u = emalloc(sizeof(Url));
  762. u->url = estrdup(url);
  763. if(spliturl(u->url, &su) < 0){
  764. Fail:
  765. freeurl(u);
  766. return nil;
  767. }
  768. /* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */
  769. if(su.scheme.s==nil){
  770. if(urldebug)
  771. fprint(2, "parseurl has nil scheme\n");
  772. if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0)
  773. goto Fail;
  774. if(u->ischeme == UScurrent){
  775. /* 'u.url' refers to current document; set fragment and return */
  776. if(parse_fragment(&su, u) < 0)
  777. goto Fail;
  778. return u;
  779. }
  780. }
  781. if(parse_scheme(&su, u) < 0
  782. || parse_fragment(&su, u) < 0)
  783. goto Fail;
  784. if(u->ischeme == USunknown){
  785. if(parse_unknown_part(&su, u) < 0)
  786. goto Fail;
  787. return u;
  788. }
  789. if(parse_query(&su, u) < 0
  790. || parse_authority(&su, u) < 0
  791. || parse_abspath(&su, u) < 0)
  792. goto Fail;
  793. if(u->ischeme < nelem(postparse) && postparse[u->ischeme])
  794. if((*postparse[u->ischeme])(u) < 0)
  795. goto Fail;
  796. setmalloctag(u, getcallerpc(&url));
  797. return u;
  798. }
  799. void
  800. freeurl(Url *u)
  801. {
  802. if(u == nil)
  803. return;
  804. free(u->url);
  805. free(u->scheme);
  806. free(u->schemedata);
  807. free(u->authority);
  808. free(u->user);
  809. free(u->passwd);
  810. free(u->host);
  811. free(u->port);
  812. free(u->path);
  813. free(u->query);
  814. free(u->fragment);
  815. switch(u->ischeme){
  816. case UShttp:
  817. free(u->http.page_spec);
  818. break;
  819. case USftp:
  820. free(u->ftp.path_spec);
  821. free(u->ftp.type);
  822. break;
  823. }
  824. free(u);
  825. }
  826. void
  827. rewriteurl(Url *u)
  828. {
  829. char *s;
  830. if(u->schemedata)
  831. s = estrmanydup(u->scheme, ":", u->schemedata, nil);
  832. else
  833. s = estrmanydup(u->scheme, "://",
  834. u->user ? u->user : "",
  835. u->passwd ? ":" : "", u->passwd ? u->passwd : "",
  836. u->user ? "@" : "", u->host ? u->host : "",
  837. u->port ? ":" : "", u->port ? u->port : "",
  838. u->path,
  839. u->query ? "?" : "", u->query ? u->query : "",
  840. u->fragment ? "#" : "", u->fragment ? u->fragment : "",
  841. nil);
  842. free(u->url);
  843. u->url = s;
  844. }
  845. int
  846. seturlquery(Url *u, char *query)
  847. {
  848. if(query == nil){
  849. free(u->query);
  850. u->query = nil;
  851. return 0;
  852. }
  853. if(!ismatch(REquery, query, "query"))
  854. return -1;
  855. free(u->query);
  856. u->query = estrdup(query);
  857. return 0;
  858. }
  859. static void
  860. dupp(char **p)
  861. {
  862. if(*p)
  863. *p = estrdup(*p);
  864. }
  865. Url*
  866. copyurl(Url *u)
  867. {
  868. Url *v;
  869. v = emalloc(sizeof(Url));
  870. *v = *u;
  871. dupp(&v->url);
  872. dupp(&v->scheme);
  873. dupp(&v->schemedata);
  874. dupp(&v->authority);
  875. dupp(&v->user);
  876. dupp(&v->passwd);
  877. dupp(&v->host);
  878. dupp(&v->port);
  879. dupp(&v->path);
  880. dupp(&v->query);
  881. dupp(&v->fragment);
  882. switch(v->ischeme){
  883. case UShttp:
  884. dupp(&v->http.page_spec);
  885. break;
  886. case USftp:
  887. dupp(&v->ftp.path_spec);
  888. dupp(&v->ftp.type);
  889. break;
  890. }
  891. return v;
  892. }
  /*
   * Value of the hexadecimal digit c, in either case.
   * Returns 0 for any character that is not a hex digit.
   */
  static int
  dhex(char c)
  {
      int value;

      value = 0;
      if(c >= '0' && c <= '9')
          value = c - '0';
      else if(c >= 'a' && c <= 'f')
          value = c - 'a' + 10;
      else if(c >= 'A' && c <= 'F')
          value = c - 'A' + 10;
      return value;
  }
  904. char*
  905. escapeurl(char *s, int (*needesc)(int))
  906. {
  907. int n;
  908. char *t, *u;
  909. Rune r;
  910. static char *hex = "0123456789abcdef";
  911. n = 0;
  912. for(t=s; *t; t++)
  913. if((*needesc)(*t))
  914. n++;
  915. u = emalloc(strlen(s)+2*n+1);
  916. t = u;
  917. for(; *s; s++){
  918. s += chartorune(&r, s);
  919. if(r >= 0xFF){
  920. werrstr("URLs cannot contain Runes > 0xFF");
  921. free(t);
  922. return nil;
  923. }
  924. if((*needesc)(r)){
  925. *u++ = '%';
  926. *u++ = hex[(r>>4)&0xF];
  927. *u++ = hex[r&0xF];
  928. }else
  929. *u++ = r;
  930. }
  931. *u = '\0';
  932. return t;
  933. }
  934. char*
  935. unescapeurl(char *s)
  936. {
  937. char *r, *w;
  938. Rune rune;
  939. s = estrdup(s);
  940. for(r=w=s; *r; r++){
  941. if(*r=='%'){
  942. r++;
  943. if(!isxdigit(r[0]) || !isxdigit(r[1])){
  944. werrstr("bad escape sequence '%.3s' in URL", r);
  945. return nil;
  946. }
  947. if(r[0]=='0' && r[2]=='0'){
  948. werrstr("escaped NUL in URL");
  949. return nil;
  950. }
  951. rune = (dhex(r[0])<<4)|dhex(r[1]); /* latin1 */
  952. w += runetochar(w, &rune);
  953. r += 2;
  954. }else
  955. *w++ = *r;
  956. }
  957. *w = '\0';
  958. return s;
  959. }