url.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101
  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. /*
  10. * This is a URL parser, written to parse "Common Internet Scheme" URL
  11. * syntax as described in RFC1738 and updated by RFC2396. Only absolute URLs
  12. * are supported, using "server-based" naming authorities in the schemes.
  13. * Support for literal IPv6 addresses is included, per RFC2732.
  14. *
  15. * Current "known" schemes: http, https, ftp, file.
  16. *
  17. * We can do all the parsing operations without Runes since URLs are
  18. * defined to be composed of US-ASCII printable characters.
  19. * See RFC1738, RFC2396.
  20. */
  21. #include <u.h>
  22. #include <libc.h>
  23. #include <ctype.h>
  24. #include <regexp.h>
  25. #include <plumb.h>
  26. #include <thread.h>
  27. #include <fcall.h>
  28. #include <9p.h>
  29. #include "dat.h"
  30. #include "fns.h"
  /* Nonzero turns on diagnostic tracing of URL parsing (printed to fd 2). */
  int urldebug;

  /* If set, relative paths with leading ".." segments will have them trimmed. */
  #define RemoveExtraRelDotDots 0

  /*
   * If set, resolve_relative() builds a complete URL for a reference to the
   * current document instead of returning as soon as it recognizes one.
   */
  #define ExpandCurrentDocUrls 1
  35. static char*
  36. schemestrtab[] =
  37. {
  38. nil,
  39. "http",
  40. "https",
  41. "ftp",
  42. "file",
  43. };
  44. static int
  45. ischeme(char *s)
  46. {
  47. int i;
  48. for(i=0; i<nelem(schemestrtab); i++)
  49. if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0)
  50. return i;
  51. return USunknown;
  52. }
  53. /*
  54. * URI splitting regexp is from RFC2396, Appendix B:
  55. * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
  56. * 12 3 4 5 6 7 8 9
  57. *
  58. * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
  59. * $2 = scheme "http"
  60. * $4 = authority "www.ics.uci.edu"
  61. * $5 = path "/pub/ietf/uri/"
  62. * $7 = query <undefined>
  63. * $9 = fragment "Related"
  64. */
  65. /*
  66. * RFC2396, Sec 3.1, contains:
  67. *
  68. * Scheme names consist of a sequence of characters beginning with a
  69. * lower case letter and followed by any combination of lower case
  70. * letters, digits, plus ("+"), period ("."), or hyphen ("-"). For
  71. * resiliency, programs interpreting URI should treat upper case letters
  72. * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
  73. * well as "http").
  74. */
  75. /*
  76. * For server-based naming authorities (RFC2396 Sec 3.2.2):
  77. * server = [ [ userinfo "@" ] hostport ]
  78. * userinfo = *( unreserved | escaped |
  79. * ";" | ":" | "&" | "=" | "+" | "$" | "," )
  80. * hostport = host [ ":" port ]
  81. * host = hostname | IPv4address
  82. * hostname = *( domainlabel "." ) toplabel [ "." ]
  83. * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
  84. * toplabel = alpha | alpha *( alphanum | "-" ) alphanum
  85. * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit
  86. * port = *digit
  87. *
  88. * The host is a domain name of a network host, or its IPv4 address as a
  89. * set of four decimal digit groups separated by ".". Literal IPv6
  90. * addresses are not supported.
  91. *
  92. * Note that literal IPv6 address support is outlined in RFC2732:
  93. * host = hostname | IPv4address | IPv6reference
  94. * ipv6reference = "[" IPv6address "]" (RFC2373)
  95. *
  96. * Since hostnames and numbers will have to be resolved by the OS anyway,
  97. * we don't have to parse them too pedantically (counting '.'s, checking
  98. * for well-formed literal IP addresses, etc.).
  99. *
  100. * In FTP/file paths, we reject most ";param"s and querys. In HTTP paths,
  101. * we just pass them through.
  102. *
  103. * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests,
  104. * we'll say it's 1-or-more characters, 0-or-1 times. This way, an absent
  105. * path yields a nil substring match, instead of an empty one.
  106. *
  107. * We're more restrictive than RFC2396 indicates with "userinfo" strings,
  108. * insisting they have the form "[user[:password]]". This may need to
  109. * change at some point, however.
  110. */
  /*
   * Building blocks for the regular expressions in retab.
   *
   * The first group are character-class fragments: they are only
   * meaningful when pasted between '[' and ']'.
   */
  #define PUNCT "\\-_.!~*'()"
  #define RES ";/?:@&=+$,"
  #define ALNUM "a-zA-Z0-9"
  #define HEX "0-9a-fA-F"
  #define UNRES ALNUM PUNCT

  /*
   * The second group are complete sub-patterns.  The _N suffix records
   * how many parenthesized subexpressions the expansion contains.
   */
  #define ESCAPED_1 "(%[" HEX "][" HEX "])"
  #define URIC_2 "([" RES UNRES "]|" ESCAPED_1 ")"
  #define URICNOSLASH_2 "([" UNRES ";?:@&=+$,]|" ESCAPED_1 ")"
  #define USERINFO_2 "([" UNRES ";:&=+$,]|" ESCAPED_1 ")"
  #define PCHAR_2 "([" UNRES ":@&=+$,]|" ESCAPED_1 ")"
  #define PSEGCHAR_3 "([/;]|" PCHAR_2 ")"
  124. typedef struct Retab Retab;
  125. struct Retab
  126. {
  127. char *str;
  128. Reprog *prog;
  129. int size;
  130. int ind[5];
  131. };
  /*
   * Indexes into retab, one per pattern, plus the size of the Resub
   * arrays the parsing routines declare (initurl() checks that no
   * pattern needs more than MaxResub slots).
   */
  enum
  {
      REsplit = 0,
      REscheme,
      REunknowndata,
      REauthority,
      REhost,
      REuserinfo,
      REabspath,
      REquery,
      REfragment,
      REhttppath,
      REftppath,
      REfilepath,

      MaxResub = 20,
  };
  148. Retab retab[] = /* view in constant width Font */
  149. {
  150. [REsplit]
  151. "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,
  152. /* |-scheme-| |-auth.-| |path--| |query| |--|frag */
  153. { 2, 4, 5, 7, 9},
  154. [REscheme]
  155. "^[a-z][a-z0-9+-.]*$", nil, 0,
  156. { 0, },
  157. [REunknowndata]
  158. "^" URICNOSLASH_2 URIC_2 "*$", nil, 0,
  159. { 0, },
  160. [REauthority]
  161. "^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
  162. /* |----user info-----| |--------host----------------| |-port-| */
  163. { 3, 7, 11, },
  164. [REhost]
  165. "^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,
  166. /* |--regular host--| |-IPv6 literal-| */
  167. { 2, 4, },
  168. [REuserinfo]
  169. "^(([^:]*)(:([^:]*))?)$", nil, 0,
  170. /* |user-| |pass-| */
  171. { 2, 4, },
  172. [REabspath]
  173. "^/" PSEGCHAR_3 "*$", nil, 0,
  174. { 0, },
  175. [REquery]
  176. "^" URIC_2 "*$", nil, 0,
  177. { 0, },
  178. [REfragment]
  179. "^" URIC_2 "*$", nil, 0,
  180. { 0, },
  181. [REhttppath]
  182. "^.*$", nil, 0,
  183. { 0, },
  184. [REftppath]
  185. "^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
  186. /*|--|-path |ftptype-| */
  187. { 1, 3, },
  188. [REfilepath]
  189. "^.*$", nil, 0,
  190. { 0, },
  191. };
  /*
   * Return how many '(' characters occur in the NUL-terminated string s.
   * Every '(' is counted, including any that appear inside a bracketed
   * character class, so for a regular expression the result is an upper
   * bound on its number of subexpressions.
   */
  static int
  countleftparen(char *s)
  {
      int count;
      char *p;

      count = 0;
      p = s;
      while(*p != '\0'){
          if(*p == '(')
              count++;
          p++;
      }
      return count;
  }
  202. void
  203. initurl(void)
  204. {
  205. int i, j;
  206. for(i=0; i<nelem(retab); i++){
  207. retab[i].prog = regcomp(retab[i].str);
  208. if(retab[i].prog == nil)
  209. sysfatal("recomp(%s): %r", retab[i].str);
  210. retab[i].size = countleftparen(retab[i].str)+1;
  211. for(j=0; j<nelem(retab[i].ind); j++)
  212. if(retab[i].ind[j] >= retab[i].size)
  213. sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",
  214. i, j, retab[i].ind[j], retab[i].size);
  215. if(MaxResub < retab[i].size)
  216. sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);
  217. }
  218. }
  /*
   * The pieces of a URL as located by spliturl().  Each piece is a
   * [s, e) pair of pointers into the string that was split; s is nil
   * for a piece that is absent.
   */
  typedef struct SplitUrl SplitUrl;
  struct SplitUrl
  {
      struct {
          char *s;    /* first byte of the piece */
          char *e;    /* one past its last byte */
      } url, scheme, authority, path, query, fragment;
  };
  227. /*
  228. * Implements the algorithm in RFC2396 sec 5.2 step 6.
  229. * Returns number of chars written, excluding NUL terminator.
  230. * dest is known to be >= strlen(base)+rel_len.
  231. */
  232. static void
  233. merge_relative_path(char *base, char *rel_st, int rel_len, char *dest)
  234. {
  235. char *s, *p, *e, *pdest;
  236. pdest = dest;
  237. /* 6a: start with base, discard last segment */
  238. if(base && base[0]){
  239. /* Empty paths don't match in our scheme; 'base' should be nil */
  240. assert(base[0] == '/');
  241. e = strrchr(base, '/');
  242. e++;
  243. memmove(pdest, base, e-base);
  244. pdest += e-base;
  245. }else{
  246. /* Artistic license on my part */
  247. *pdest++ = '/';
  248. }
  249. /* 6b: append relative component */
  250. if(rel_st){
  251. memmove(pdest, rel_st, rel_len);
  252. pdest += rel_len;
  253. }
  254. /* 6c: remove any occurrences of "./" as a complete segment */
  255. s = dest;
  256. *pdest = '\0';
  257. while(e = strstr(s, "./")){
  258. if((e == dest) || (*(e-1) == '/')){
  259. memmove(e, e+2, pdest+1-(e+2)); /* +1 for NUL */
  260. pdest -= 2;
  261. }else
  262. s = e+1;
  263. }
  264. /* 6d: remove a trailing "." as a complete segment */
  265. if(pdest>dest && *(pdest-1)=='.' &&
  266. (pdest==dest+1 || *(pdest-2)=='/'))
  267. *--pdest = '\0';
  268. /* 6e: remove occurences of "seg/../", where seg != "..", left->right */
  269. s = dest+1;
  270. while(e = strstr(s, "/../")){
  271. p = e - 1;
  272. while(p >= dest && *p != '/')
  273. p--;
  274. if(memcmp(p, "/../", 4) != 0){
  275. memmove(p+1, e+4, pdest+1-(e+4));
  276. pdest -= (e+4) - (p+1);
  277. }else
  278. s = e+1;
  279. }
  280. /* 6f: remove a trailing "seg/..", where seg isn't ".." */
  281. if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){
  282. p = pdest-3 - 1;
  283. while(p >= dest && *p != '/')
  284. p--;
  285. if(memcmp(p, "/../", 4) != 0){
  286. pdest = p+1;
  287. *pdest = '\0';
  288. }
  289. }
  290. /* 6g: leading ".." segments are errors -- we'll just blat them out. */
  291. if(RemoveExtraRelDotDots){
  292. p = dest;
  293. if (p[0] == '/')
  294. p++;
  295. s = p;
  296. while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))
  297. s += 3;
  298. if(s > p){
  299. memmove(p, s, pdest+1-s);
  300. pdest -= s-p;
  301. }
  302. }
  303. USED(pdest);
  304. if(urldebug)
  305. fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len,
  306. rel_st, dest);
  307. }
  308. /*
  309. * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form.
  310. *
  311. * If successful, this just ends up freeing and replacing "u->url".
  312. */
  313. static int
  314. resolve_relative(SplitUrl *su, Url *base, Url *u)
  315. {
  316. char *url, *path;
  317. char *purl, *ppath;
  318. int currentdoc, ulen, plen;
  319. if(base == nil){
  320. werrstr("relative URI given without base");
  321. return -1;
  322. }
  323. if(base->scheme == nil){
  324. werrstr("relative URI given with no scheme");
  325. return -1;
  326. }
  327. if(base->ischeme == USunknown){
  328. werrstr("relative URI given with unknown scheme");
  329. return -1;
  330. }
  331. if(base->ischeme == UScurrent){
  332. werrstr("relative URI given with incomplete base");
  333. return -1;
  334. }
  335. assert(su->scheme.s == nil);
  336. /* Sec 5.2 step 2 */
  337. currentdoc = 0;
  338. if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){
  339. /* Reference is to current document */
  340. if(urldebug)
  341. fprint(2, "url %s is relative to current document\n", u->url);
  342. u->ischeme = UScurrent;
  343. if(!ExpandCurrentDocUrls)
  344. return 0;
  345. currentdoc = 1;
  346. }
  347. /* Over-estimate the maximum lengths, for allocation purposes */
  348. /* (constants are for separators) */
  349. plen = 1;
  350. if(base->path)
  351. plen += strlen(base->path);
  352. if(su->path.s)
  353. plen += 1 + (su->path.e - su->path.s);
  354. ulen = 0;
  355. ulen += strlen(base->scheme) + 1;
  356. if(su->authority.s)
  357. ulen += 2 + (su->authority.e - su->authority.s);
  358. else
  359. ulen += 2 + ((base->authority) ? strlen(base->authority) : 0);
  360. ulen += plen;
  361. if(su->query.s)
  362. ulen += 1 + (su->query.e - su->query.s);
  363. else if(currentdoc && base->query)
  364. ulen += 1 + strlen(base->query);
  365. if(su->fragment.s)
  366. ulen += 1 + (su->fragment.e - su->fragment.s);
  367. else if(currentdoc && base->fragment)
  368. ulen += 1 + strlen(base->fragment);
  369. url = emalloc(ulen+1);
  370. path = emalloc(plen+1);
  371. url[0] = '\0';
  372. purl = url;
  373. path[0] = '\0';
  374. ppath = path;
  375. if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){
  376. /* Is a "network-path" or "absolute-path"; don't merge with base path */
  377. /* Sec 5.2 steps 4,5 */
  378. if(su->path.s){
  379. memmove(ppath, su->path.s, su->path.e - su->path.s);
  380. ppath += su->path.e - su->path.s;
  381. *ppath = '\0';
  382. }
  383. }else if(currentdoc){
  384. /* Is a current-doc reference; just copy the path from the base URL */
  385. if(base->path){
  386. strcpy(ppath, base->path);
  387. ppath += strlen(ppath);
  388. }
  389. USED(ppath);
  390. }else{
  391. /* Is a relative-path reference; we have to merge it */
  392. /* Sec 5.2 step 6 */
  393. merge_relative_path(base->path,
  394. su->path.s, su->path.e - su->path.s, ppath);
  395. }
  396. /* Build new URL from pieces, inheriting from base where needed */
  397. strcpy(purl, base->scheme);
  398. purl += strlen(purl);
  399. *purl++ = ':';
  400. if(su->authority.s){
  401. strcpy(purl, "//");
  402. purl += strlen(purl);
  403. memmove(purl, su->authority.s, su->authority.e - su->authority.s);
  404. purl += su->authority.e - su->authority.s;
  405. }else if(base->authority){
  406. strcpy(purl, "//");
  407. purl += strlen(purl);
  408. strcpy(purl, base->authority);
  409. purl += strlen(purl);
  410. }
  411. assert((path[0] == '\0') || (path[0] == '/'));
  412. strcpy(purl, path);
  413. purl += strlen(purl);
  414. /*
  415. * The query and fragment are not inherited from the base,
  416. * except in case of "current document" URLs, which inherit any query
  417. * and may inherit the fragment.
  418. */
  419. if(su->query.s){
  420. *purl++ = '?';
  421. memmove(purl, su->query.s, su->query.e - su->query.s);
  422. purl += su->query.e - su->query.s;
  423. }else if(currentdoc && base->query){
  424. *purl++ = '?';
  425. strcpy(purl, base->query);
  426. purl += strlen(purl);
  427. }
  428. if(su->fragment.s){
  429. *purl++ = '#';
  430. memmove(purl, su->query.s, su->query.e - su->query.s);
  431. purl += su->fragment.e - su->fragment.s;
  432. }else if(currentdoc && base->fragment){
  433. *purl++ = '#';
  434. strcpy(purl, base->fragment);
  435. purl += strlen(purl);
  436. }
  437. USED(purl);
  438. if(urldebug)
  439. fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);
  440. free(u->url);
  441. u->url = url;
  442. free(path);
  443. return 0;
  444. }
  445. int
  446. regx(Reprog *prog, char *s, Resub *m, int nm)
  447. {
  448. int i;
  449. if(s == nil)
  450. s = m[0].sp; /* why is this necessary? */
  451. i = regexec(prog, s, m, nm);
  452. /*
  453. if(i >= 0)
  454. for(j=0; j<nm; j++)
  455. fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);
  456. */
  457. return i;
  458. }
  459. static int
  460. ismatch(int i, char *s, char *desc)
  461. {
  462. Resub m[1];
  463. m[0].sp = m[0].ep = nil;
  464. if(!regx(retab[i].prog, s, m, 1)){
  465. werrstr("malformed %s: %q", desc, s);
  466. return 0;
  467. }
  468. return 1;
  469. }
  470. static int
  471. spliturl(char *url, SplitUrl *su)
  472. {
  473. Resub m[MaxResub];
  474. Retab *t;
  475. /*
  476. * Newlines are not valid in a URI, but regexp(2) treats them specially
  477. * so it's best to make sure there are none before proceeding.
  478. */
  479. if(strchr(url, '\n')){
  480. werrstr("newline in URI");
  481. return -1;
  482. }
  483. /*
  484. * Because we use NUL-terminated strings, as do many client and server
  485. * implementations, an escaped NUL ("%00") will quite likely cause problems
  486. * when unescaped. We can check for such a sequence once before examining
  487. * the components because, per RFC2396 sec. 2.4.1 - 2.4.2, '%' is reserved
  488. * in URIs to _always_ indicate escape sequences. Something like "%2500"
  489. * will still get by, but that's legitimate, and if it ends up causing
  490. * a NUL then someone is unescaping too many times.
  491. */
  492. if(strstr(url, "%00")){
  493. werrstr("escaped NUL in URI");
  494. return -1;
  495. }
  496. m[0].sp = m[0].ep = nil;
  497. t = &retab[REsplit];
  498. if(!regx(t->prog, url, m, t->size)){
  499. werrstr("malformed URI: %q", url);
  500. return -1;
  501. }
  502. su->url.s = m[0].sp;
  503. su->url.e = m[0].ep;
  504. su->scheme.s = m[t->ind[0]].sp;
  505. su->scheme.e = m[t->ind[0]].ep;
  506. su->authority.s = m[t->ind[1]].sp;
  507. su->authority.e = m[t->ind[1]].ep;
  508. su->path.s = m[t->ind[2]].sp;
  509. su->path.e = m[t->ind[2]].ep;
  510. su->query.s = m[t->ind[3]].sp;
  511. su->query.e = m[t->ind[3]].ep;
  512. su->fragment.s = m[t->ind[4]].sp;
  513. su->fragment.e = m[t->ind[4]].ep;
  514. if(urldebug)
  515. fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n",
  516. url,
  517. su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "",
  518. su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "",
  519. su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "",
  520. su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "",
  521. su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "",
  522. su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : "");
  523. return 0;
  524. }
  525. static int
  526. parse_scheme(SplitUrl *su, Url *u)
  527. {
  528. if(su->scheme.s == nil){
  529. werrstr("missing scheme");
  530. return -1;
  531. }
  532. u->scheme = estredup(su->scheme.s, su->scheme.e);
  533. strlower(u->scheme);
  534. if(!ismatch(REscheme, u->scheme, "scheme"))
  535. return -1;
  536. u->ischeme = ischeme(u->scheme);
  537. if(urldebug)
  538. fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme);
  539. return 0;
  540. }
  541. static int
  542. parse_unknown_part(SplitUrl *su, Url *u)
  543. {
  544. char *s, *e;
  545. assert(u->ischeme == USunknown);
  546. assert(su->scheme.e[0] == ':');
  547. s = su->scheme.e+1;
  548. if(su->fragment.s){
  549. e = su->fragment.s-1;
  550. assert(*e == '#');
  551. }else
  552. e = s+strlen(s);
  553. u->schemedata = estredup(s, e);
  554. if(!ismatch(REunknowndata, u->schemedata, "unknown scheme data"))
  555. return -1;
  556. return 0;
  557. }
  558. static int
  559. parse_userinfo(char *s, char *e, Url *u)
  560. {
  561. Resub m[MaxResub];
  562. Retab *t;
  563. m[0].sp = s;
  564. m[0].ep = e;
  565. t = &retab[REuserinfo];
  566. if(!regx(t->prog, nil, m, t->size)){
  567. werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s);
  568. return -1;
  569. }
  570. if(m[t->ind[0]].sp)
  571. u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
  572. if(m[t->ind[1]].sp)
  573. u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
  574. return 0;
  575. }
  576. static int
  577. parse_host(char *s, char *e, Url *u)
  578. {
  579. Resub m[MaxResub];
  580. Retab *t;
  581. m[0].sp = s;
  582. m[0].ep = e;
  583. t = &retab[REhost];
  584. if(!regx(t->prog, nil, m, t->size)){
  585. werrstr("malformed host: %.*q", utfnlen(s, e-s), s);
  586. return -1;
  587. }
  588. assert(m[t->ind[0]].sp || m[t->ind[1]].sp);
  589. if(m[t->ind[0]].sp) /* regular */
  590. u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
  591. else
  592. u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
  593. return 0;
  594. }
  595. static int
  596. parse_authority(SplitUrl *su, Url *u)
  597. {
  598. Resub m[MaxResub];
  599. Retab *t;
  600. char *host;
  601. char *userinfo;
  602. if(su->authority.s == nil)
  603. return 0;
  604. u->authority = estredup(su->authority.s, su->authority.e);
  605. m[0].sp = m[0].ep = nil;
  606. t = &retab[REauthority];
  607. if(!regx(t->prog, u->authority, m, t->size)){
  608. werrstr("malformed authority: %q", u->authority);
  609. return -1;
  610. }
  611. if(m[t->ind[0]].sp)
  612. if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0)
  613. return -1;
  614. if(m[t->ind[1]].sp)
  615. if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0)
  616. return -1;
  617. if(m[t->ind[2]].sp)
  618. u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep);
  619. if(urldebug > 0){
  620. userinfo = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
  621. host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
  622. fprint(2, "port: %q, authority %q\n", u->port, u->authority);
  623. fprint(2, "host %q, userinfo %q\n", host, userinfo);
  624. free(host);
  625. free(userinfo);
  626. }
  627. return 0;
  628. }
  629. static int
  630. parse_abspath(SplitUrl *su, Url *u)
  631. {
  632. if(su->path.s == nil)
  633. return 0;
  634. u->path = estredup(su->path.s, su->path.e);
  635. if(!ismatch(REabspath, u->path, "absolute path"))
  636. return -1;
  637. return 0;
  638. }
  639. static int
  640. parse_query(SplitUrl *su, Url *u)
  641. {
  642. if(su->query.s == nil)
  643. return 0;
  644. u->query = estredup(su->query.s, su->query.e);
  645. if(!ismatch(REquery, u->query, "query"))
  646. return -1;
  647. return 0;
  648. }
  649. static int
  650. parse_fragment(SplitUrl *su, Url *u)
  651. {
  652. if(su->fragment.s == nil)
  653. return 0;
  654. u->fragment = estredup(su->fragment.s, su->fragment.e);
  655. if(!ismatch(REfragment, u->fragment, "fragment"))
  656. return -1;
  657. return 0;
  658. }
  659. static int
  660. postparse_http(Url *u)
  661. {
  662. u->open = httpopen;
  663. u->read = httpread;
  664. u->close = httpclose;
  665. if(u->authority==nil){
  666. werrstr("missing authority (hostname, port, etc.)");
  667. return -1;
  668. }
  669. if(u->host == nil){
  670. werrstr("missing host specification");
  671. return -1;
  672. }
  673. if(u->path == nil){
  674. u->http.page_spec = estrdup("/");
  675. return 0;
  676. }
  677. if(!ismatch(REhttppath, u->path, "http path"))
  678. return -1;
  679. if(u->query){
  680. u->http.page_spec = emalloc(strlen(u->path)+1+strlen(u->query)+1);
  681. strcpy(u->http.page_spec, u->path);
  682. strcat(u->http.page_spec, "?");
  683. strcat(u->http.page_spec, u->query);
  684. }else
  685. u->http.page_spec = estrdup(u->path);
  686. return 0;
  687. }
  688. static int
  689. postparse_ftp(Url *u)
  690. {
  691. Resub m[MaxResub];
  692. Retab *t;
  693. if(u->authority==nil){
  694. werrstr("missing authority (hostname, port, etc.)");
  695. return -1;
  696. }
  697. if(u->query){
  698. werrstr("unexpected \"?query\" in ftp path");
  699. return -1;
  700. }
  701. if(u->host == nil){
  702. werrstr("missing host specification");
  703. return -1;
  704. }
  705. if(u->path == nil){
  706. u->ftp.path_spec = estrdup("/");
  707. return 0;
  708. }
  709. m[0].sp = m[0].ep = nil;
  710. t = &retab[REftppath];
  711. if(!regx(t->prog, u->path, m, t->size)){
  712. werrstr("malformed ftp path: %q", u->path);
  713. return -1;
  714. }
  715. if(m[t->ind[0]].sp){
  716. u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
  717. if(strchr(u->ftp.path_spec, ';')){
  718. werrstr("unexpected \";param\" in ftp path");
  719. return -1;
  720. }
  721. }else
  722. u->ftp.path_spec = estrdup("/");
  723. if(m[t->ind[1]].sp){
  724. u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
  725. strlower(u->ftp.type);
  726. }
  727. return 0;
  728. }
  729. static int
  730. postparse_file(Url *u)
  731. {
  732. if(u->user || u->passwd){
  733. werrstr("user information not valid with file scheme");
  734. return -1;
  735. }
  736. if(u->query){
  737. werrstr("unexpected \"?query\" in file path");
  738. return -1;
  739. }
  740. if(u->port){
  741. werrstr("port not valid with file scheme");
  742. return -1;
  743. }
  744. if(u->path == nil){
  745. werrstr("missing path in file scheme");
  746. return -1;
  747. }
  748. if(strchr(u->path, ';')){
  749. werrstr("unexpected \";param\" in file path");
  750. return -1;
  751. }
  752. if(!ismatch(REfilepath, u->path, "file path"))
  753. return -1;
  754. /* "localhost" is equivalent to no host spec, we'll chose the latter */
  755. if(u->host && cistrcmp(u->host, "localhost") == 0){
  756. free(u->host);
  757. u->host = nil;
  758. }
  759. return 0;
  760. }
  761. static int (*postparse[])(Url*) = {
  762. nil,
  763. postparse_http,
  764. postparse_http,
  765. postparse_ftp,
  766. postparse_file,
  767. };
  768. Url*
  769. parseurl(char *url, Url *base)
  770. {
  771. Url *u;
  772. SplitUrl su;
  773. if(urldebug)
  774. fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>");
  775. u = emalloc(sizeof(Url));
  776. u->url = estrdup(url);
  777. if(spliturl(u->url, &su) < 0){
  778. Fail:
  779. freeurl(u);
  780. return nil;
  781. }
  782. /* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */
  783. if(su.scheme.s==nil){
  784. if(urldebug)
  785. fprint(2, "parseurl has nil scheme\n");
  786. if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0)
  787. goto Fail;
  788. if(u->ischeme == UScurrent){
  789. /* 'u.url' refers to current document; set fragment and return */
  790. if(parse_fragment(&su, u) < 0)
  791. goto Fail;
  792. return u;
  793. }
  794. }
  795. if(parse_scheme(&su, u) < 0
  796. || parse_fragment(&su, u) < 0)
  797. goto Fail;
  798. if(u->ischeme == USunknown){
  799. if(parse_unknown_part(&su, u) < 0)
  800. goto Fail;
  801. return u;
  802. }
  803. if(parse_query(&su, u) < 0
  804. || parse_authority(&su, u) < 0
  805. || parse_abspath(&su, u) < 0)
  806. goto Fail;
  807. if(u->ischeme < nelem(postparse) && postparse[u->ischeme])
  808. if((*postparse[u->ischeme])(u) < 0)
  809. goto Fail;
  810. setmalloctag(u, getcallerpc(&url));
  811. return u;
  812. }
  813. void
  814. freeurl(Url *u)
  815. {
  816. if(u == nil)
  817. return;
  818. free(u->url);
  819. free(u->scheme);
  820. free(u->schemedata);
  821. free(u->authority);
  822. free(u->user);
  823. free(u->passwd);
  824. free(u->host);
  825. free(u->port);
  826. free(u->path);
  827. free(u->query);
  828. free(u->fragment);
  829. switch(u->ischeme){
  830. case UShttp:
  831. free(u->http.page_spec);
  832. break;
  833. case USftp:
  834. free(u->ftp.path_spec);
  835. free(u->ftp.type);
  836. break;
  837. }
  838. free(u);
  839. }
  840. void
  841. rewriteurl(Url *u)
  842. {
  843. char *s;
  844. if(u->schemedata)
  845. s = estrmanydup(u->scheme, ":", u->schemedata, nil);
  846. else
  847. s = estrmanydup(u->scheme, "://",
  848. u->user ? u->user : "",
  849. u->passwd ? ":" : "", u->passwd ? u->passwd : "",
  850. u->user ? "@" : "", u->host ? u->host : "",
  851. u->port ? ":" : "", u->port ? u->port : "",
  852. u->path,
  853. u->query ? "?" : "", u->query ? u->query : "",
  854. u->fragment ? "#" : "", u->fragment ? u->fragment : "",
  855. nil);
  856. free(u->url);
  857. u->url = s;
  858. }
  859. int
  860. seturlquery(Url *u, char *query)
  861. {
  862. if(query == nil){
  863. free(u->query);
  864. u->query = nil;
  865. return 0;
  866. }
  867. if(!ismatch(REquery, query, "query"))
  868. return -1;
  869. free(u->query);
  870. u->query = estrdup(query);
  871. return 0;
  872. }
  873. static void
  874. dupp(char **p)
  875. {
  876. if(*p)
  877. *p = estrdup(*p);
  878. }
  879. Url*
  880. copyurl(Url *u)
  881. {
  882. Url *v;
  883. v = emalloc(sizeof(Url));
  884. *v = *u;
  885. dupp(&v->url);
  886. dupp(&v->scheme);
  887. dupp(&v->schemedata);
  888. dupp(&v->authority);
  889. dupp(&v->user);
  890. dupp(&v->passwd);
  891. dupp(&v->host);
  892. dupp(&v->port);
  893. dupp(&v->path);
  894. dupp(&v->query);
  895. dupp(&v->fragment);
  896. switch(v->ischeme){
  897. case UShttp:
  898. dupp(&v->http.page_spec);
  899. break;
  900. case USftp:
  901. dupp(&v->ftp.path_spec);
  902. dupp(&v->ftp.type);
  903. break;
  904. }
  905. return v;
  906. }
  /*
   * Return the value (0-15) of the hexadecimal digit c, in either case.
   * Any other character yields 0.
   */
  static int
  dhex(char c)
  {
      int folded;

      if(c >= '0' && c <= '9')
          return c - '0';

      /* setting bit 0x20 maps 'A'..'F' onto 'a'..'f' */
      folded = c | 0x20;
      if(folded >= 'a' && folded <= 'f')
          return 10 + (folded - 'a');
      return 0;
  }
  918. char*
  919. escapeurl(char *s, int (*needesc)(int))
  920. {
  921. int n;
  922. char *t, *u;
  923. Rune r;
  924. static char *hex = "0123456789abcdef";
  925. n = 0;
  926. for(t=s; *t; t++)
  927. if((*needesc)(*t))
  928. n++;
  929. u = emalloc(strlen(s)+2*n+1);
  930. t = u;
  931. for(; *s; s++){
  932. s += chartorune(&r, s);
  933. if(r >= 0xFF){
  934. werrstr("URLs cannot contain Runes > 0xFF");
  935. free(t);
  936. return nil;
  937. }
  938. if((*needesc)(r)){
  939. *u++ = '%';
  940. *u++ = hex[(r>>4)&0xF];
  941. *u++ = hex[r&0xF];
  942. }else
  943. *u++ = r;
  944. }
  945. *u = '\0';
  946. return t;
  947. }
  948. char*
  949. unescapeurl(char *s)
  950. {
  951. char *r, *w;
  952. Rune rune;
  953. s = estrdup(s);
  954. for(r=w=s; *r; r++){
  955. if(*r=='%'){
  956. r++;
  957. if(!isxdigit(r[0]) || !isxdigit(r[1])){
  958. werrstr("bad escape sequence '%.3s' in URL", r);
  959. return nil;
  960. }
  961. if(r[0]=='0' && r[2]=='0'){
  962. werrstr("escaped NUL in URL");
  963. return nil;
  964. }
  965. rune = (dhex(r[0])<<4)|dhex(r[1]); /* latin1 */
  966. w += runetochar(w, &rune);
  967. r += 2;
  968. }else
  969. *w++ = *r;
  970. }
  971. *w = '\0';
  972. return s;
  973. }