url.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081
  1. /*
  2. * This is a URL parser, written to parse "Common Internet Scheme" URL
  3. * syntax as described in RFC1738 and updated by RFC2396. Only absolute URLs
  4. * are supported, using "server-based" naming authorities in the schemes.
  5. * Support for literal IPv6 addresses is included, per RFC2732.
  6. *
  7. * Current "known" schemes: http, https, ftp, file.
  8. *
  9. * We can do all the parsing operations without Runes since URLs are
  10. * defined to be composed of US-ASCII printable characters.
  11. * See RFC1738, RFC2396.
  12. */
  13. #include <u.h>
  14. #include <libc.h>
  15. #include <ctype.h>
  16. #include <regexp.h>
  17. #include <plumb.h>
  18. #include <thread.h>
  19. #include <fcall.h>
  20. #include <9p.h>
  21. #include "dat.h"
  22. #include "fns.h"
  /* Nonzero enables the fprint(2, ...) diagnostic traces in this file. */
  23. int urldebug;
  24. /* If set, relative paths with leading ".." segments will have them trimmed */
  25. #define RemoveExtraRelDotDots 0
  /*
   * If zero, resolve_relative returns as soon as it has recognized a
   * current-document reference, without building an absolute URL for it.
   */
  26. #define ExpandCurrentDocUrls 1
  /*
   * Scheme names, indexed by the value ischeme() returns for them.
   * Slot 0 is nil and is skipped by the lookup; the postparse[] table
   * is laid out in the same order.
   */
  27. static char*
  28. schemestrtab[] =
  29. {
  30. nil,
  31. "http",
  32. "https",
  33. "ftp",
  34. "file",
  35. };
  36. static int
  37. ischeme(char *s)
  38. {
  39. int i;
  40. for(i=0; i<nelem(schemestrtab); i++)
  41. if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0)
  42. return i;
  43. return USunknown;
  44. }
  45. /*
  46. * URI splitting regexp is from RFC2396, Appendix B:
  47. * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
  48. * 12 3 4 5 6 7 8 9
  49. *
  50. * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
  51. * $2 = scheme "http"
  52. * $4 = authority "www.ics.uci.edu"
  53. * $5 = path "/pub/ietf/uri/"
  54. * $7 = query <undefined>
  55. * $9 = fragment "Related"
  56. */
  57. /*
  58. * RFC2396, Sec 3.1, contains:
  59. *
  60. * Scheme names consist of a sequence of characters beginning with a
  61. * lower case letter and followed by any combination of lower case
  62. * letters, digits, plus ("+"), period ("."), or hyphen ("-"). For
  63. * resiliency, programs interpreting URI should treat upper case letters
  64. * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
  65. * well as "http").
  66. */
  67. /*
  68. * For server-based naming authorities (RFC2396 Sec 3.2.2):
  69. * server = [ [ userinfo "@" ] hostport ]
  70. * userinfo = *( unreserved | escaped |
  71. * ";" | ":" | "&" | "=" | "+" | "$" | "," )
  72. * hostport = host [ ":" port ]
  73. * host = hostname | IPv4address
  74. * hostname = *( domainlabel "." ) toplabel [ "." ]
  75. * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
  76. * toplabel = alpha | alpha *( alphanum | "-" ) alphanum
  77. * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit
  78. * port = *digit
  79. *
  80. * The host is a domain name of a network host, or its IPv4 address as a
  81. * set of four decimal digit groups separated by ".". Literal IPv6
  82. * addresses are not supported.
  83. *
  84. * Note that literal IPv6 address support is outlined in RFC2732:
  85. * host = hostname | IPv4address | IPv6reference
  86. * ipv6reference = "[" IPv6address "]" (RFC2373)
  87. *
  88. * Since hostnames and numbers will have to be resolved by the OS anyway,
  89. * we don't have to parse them too pedantically (counting '.'s, checking
  90. * for well-formed literal IP addresses, etc.).
  91. *
  92. * In FTP/file paths, we reject most ";param"s and querys. In HTTP paths,
  93. * we just pass them through.
  94. *
  95. * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests,
  96. * we'll say it's 1-or-more characters, 0-or-1 times. This way, an absent
  97. * path yields a nil substring match, instead of an empty one.
  98. *
  99. * We're more restrictive than RFC2396 indicates with "userinfo" strings,
  100. * insisting they have the form "[user[:password]]". This may need to
  101. * change at some point, however.
  102. */
  103. /* RE character-class components -- these go in brackets */
  /* PUNCT contains a literal '(' and ')', which countleftparen also counts. */
  104. #define PUNCT "\\-_.!~*'()"
  105. #define RES ";/?:@&=+$,"
  106. #define ALNUM "a-zA-Z0-9"
  107. #define HEX "0-9a-fA-F"
  108. #define UNRES ALNUM PUNCT
  109. /* RE components; _N => has N parenthesized subexpressions when expanded */
  /* ESCAPED_1 matches one %XX escape; the others match one character or one escape. */
  110. #define ESCAPED_1 "(%[" HEX "][" HEX "])"
  111. #define URIC_2 "([" RES UNRES "]|" ESCAPED_1 ")"
  112. #define URICNOSLASH_2 "([" UNRES ";?:@&=+$,]|" ESCAPED_1 ")"
  113. #define USERINFO_2 "([" UNRES ";:&=+$,]|" ESCAPED_1 ")"
  114. #define PCHAR_2 "([" UNRES ":@&=+$,]|" ESCAPED_1 ")"
  115. #define PSEGCHAR_3 "([/;]|" PCHAR_2 ")"
  /*
   * One entry of the regular-expression table.
   *	str	regexp source text
   *	prog	compiled form, filled in by initurl
   *	size	countleftparen(str)+1, filled in by initurl and passed
   *		to regexec as the number of Resub slots
   *	ind	indices of the subexpressions the parsers extract
   */
  116. typedef struct Retab Retab;
  117. struct Retab
  118. {
  119. char *str;
  120. Reprog *prog;
  121. int size;
  122. int ind[5];
  123. };
  /*
   * Indices into retab[].  MaxResub is the length of the Resub arrays the
   * parsers declare; initurl aborts if any entry's size exceeds it.
   */
  124. enum
  125. {
  126. REsplit = 0,
  127. REscheme,
  128. REunknowndata,
  129. REauthority,
  130. REhost,
  131. REuserinfo,
  132. REabspath,
  133. REquery,
  134. REfragment,
  135. REhttppath,
  136. REftppath,
  137. REfilepath,
  138. MaxResub= 20,
  139. };
  140. Retab retab[] = /* view in constant width Font */
  141. {
  142. [REsplit]
  143. "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,
  144. /* |-scheme-| |-auth.-| |path--| |query| |--|frag */
  145. { 2, 4, 5, 7, 9},
  146. [REscheme]
  147. "^[a-z][a-z0-9+-.]*$", nil, 0,
  148. { 0, },
  149. [REunknowndata]
  150. "^" URICNOSLASH_2 URIC_2 "*$", nil, 0,
  151. { 0, },
  152. [REauthority]
  153. "^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
  154. /* |----user info-----| |--------host----------------| |-port-| */
  155. { 2, 7, 12, },
  156. [REhost]
  157. "^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,
  158. /* |--regular host--| |-IPv6 literal-| */
  159. { 2, 4, },
  160. [REuserinfo]
  161. "^(([^:]*)(:([^:]*))?)$", nil, 0,
  162. /* |user-| |pass-| */
  163. { 2, 4, },
  164. [REabspath]
  165. "^/" PSEGCHAR_3 "*$", nil, 0,
  166. { 0, },
  167. [REquery]
  168. "^" URIC_2 "*$", nil, 0,
  169. { 0, },
  170. [REfragment]
  171. "^" URIC_2 "*$", nil, 0,
  172. { 0, },
  173. [REhttppath]
  174. "^.*$", nil, 0,
  175. { 0, },
  176. [REftppath]
  177. "^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
  178. /*|--|-path |ftptype-| */
  179. { 1, 3, },
  180. [REfilepath]
  181. "^.*$", nil, 0,
  182. { 0, },
  183. };
  /*
   * Return how many '(' characters occur in s.  Every occurrence is
   * counted, including ones that are escaped or inside a character class.
   */
  static int
  countleftparen(char *s)
  {
  	int total;

  	for(total = 0; *s != '\0'; s++)
  		total += (*s == '(');
  	return total;
  }
  194. void
  195. initurl(void)
  196. {
  197. int i, j;
  198. for(i=0; i<nelem(retab); i++){
  199. retab[i].prog = regcomp(retab[i].str);
  200. if(retab[i].prog == nil)
  201. sysfatal("recomp(%s): %r", retab[i].str);
  202. retab[i].size = countleftparen(retab[i].str)+1;
  203. for(j=0; j<nelem(retab[i].ind); j++)
  204. if(retab[i].ind[j] >= retab[i].size)
  205. sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",
  206. i, j, retab[i].ind[j], retab[i].size);
  207. if(MaxResub < retab[i].size)
  208. sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);
  209. }
  210. }
  /*
   * Result of spliturl: for each component, s and e are the start and end
   * pointers of the regexp submatch inside the string that was split.
   * s is nil when the component is absent.
   */
  211. typedef struct SplitUrl SplitUrl;
  212. struct SplitUrl
  213. {
  214. struct {
  215. char *s;
  216. char *e;
  217. } url, scheme, authority, path, query, fragment;
  218. };
  219. /*
  220. * Implements the algorithm in RFC2396 sec 5.2 step 6.
  221. * Returns number of chars written, excluding NUL terminator.
  222. * dest is known to be >= strlen(base)+rel_len.
  223. */
  224. static void
  225. merge_relative_path(char *base, char *rel_st, int rel_len, char *dest)
  226. {
  227. char *s, *p, *e, *pdest;
  228. pdest = dest;
  229. /* 6a: start with base, discard last segment */
  230. if(base){
  231. /* Empty paths don't match in our scheme; 'base' should be nil */
  232. assert(base[0] == '/');
  233. e = strrchr(base, '/');
  234. e++;
  235. memmove(pdest, base, e-base);
  236. pdest += e-base;
  237. }else{
  238. /* Artistic license on my part */
  239. *pdest++ = '/';
  240. }
  241. /* 6b: append relative component */
  242. if(rel_st){
  243. memmove(pdest, rel_st, rel_len);
  244. pdest += rel_len;
  245. }
  246. /* 6c: remove any occurrences of "./" as a complete segment */
  247. s = dest;
  248. *pdest = '\0';
  249. while(e = strstr(s, "./")){
  250. if((e == dest) || (*(e-1) == '/')){
  251. memmove(e, e+2, pdest+1-(e+2)); /* +1 for NUL */
  252. pdest -= 2;
  253. }else
  254. s = e+1;
  255. }
  256. /* 6d: remove a trailing "." as a complete segment */
  257. if(pdest>dest && *(pdest-1)=='.' &&
  258. (pdest==dest+1 || *(pdest-2)=='/'))
  259. *--pdest = '\0';
  260. /* 6e: remove occurences of "seg/../", where seg != "..", left->right */
  261. s = dest+1;
  262. while(e = strstr(s, "/../")){
  263. p = e - 1;
  264. while(p >= dest && *p != '/')
  265. p--;
  266. if(memcmp(p, "/../", 4) != 0){
  267. memmove(p+1, e+4, pdest+1-(e+4));
  268. pdest -= (e+4) - (p+1);
  269. }else
  270. s = e+1;
  271. }
  272. /* 6f: remove a trailing "seg/..", where seg isn't ".." */
  273. if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){
  274. p = pdest-3 - 1;
  275. while(p >= dest && *p != '/')
  276. p--;
  277. if(memcmp(p, "/../", 4) != 0){
  278. pdest = p+1;
  279. *pdest = '\0';
  280. }
  281. }
  282. /* 6g: leading ".." segments are errors -- we'll just blat them out. */
  283. if(RemoveExtraRelDotDots){
  284. p = dest;
  285. if (p[0] == '/')
  286. p++;
  287. s = p;
  288. while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))
  289. s += 3;
  290. if(s > p){
  291. memmove(p, s, pdest+1-s);
  292. pdest -= s-p;
  293. }
  294. }
  295. USED(pdest);
  296. if(urldebug)
  297. fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len,
  298. rel_st, dest);
  299. }
  300. /*
  301. * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form.
  302. *
  303. * If successful, this just ends up freeing and replacing "u->url".
  304. */
  305. static int
  306. resolve_relative(SplitUrl *su, Url *base, Url *u)
  307. {
  308. char *url, *path;
  309. char *purl, *ppath;
  310. int currentdoc, ulen, plen;
  311. if(base == nil){
  312. werrstr("relative URI given without base");
  313. return -1;
  314. }
  315. if(base->scheme == nil){
  316. werrstr("relative URI given with no scheme");
  317. return -1;
  318. }
  319. if(base->ischeme == USunknown){
  320. werrstr("relative URI given with unknown scheme");
  321. return -1;
  322. }
  323. if(base->ischeme == UScurrent){
  324. werrstr("relative URI given with incomplete base");
  325. return -1;
  326. }
  327. assert(su->scheme.s == nil);
  328. /* Sec 5.2 step 2 */
  329. currentdoc = 0;
  330. if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){
  331. /* Reference is to current document */
  332. if(urldebug)
  333. fprint(2, "url %s is relative to current document\n", u->url);
  334. u->ischeme = UScurrent;
  335. if(!ExpandCurrentDocUrls)
  336. return 0;
  337. currentdoc = 1;
  338. }
  339. /* Over-estimate the maximum lengths, for allocation purposes */
  340. /* (constants are for separators) */
  341. plen = 1;
  342. if(base->path)
  343. plen += strlen(base->path);
  344. if(su->path.s)
  345. plen += 1 + (su->path.e - su->path.s);
  346. ulen = 0;
  347. ulen += strlen(base->scheme) + 1;
  348. if(su->authority.s)
  349. ulen += 2 + (su->authority.e - su->authority.s);
  350. else
  351. ulen += 2 + ((base->authority) ? strlen(base->authority) : 0);
  352. ulen += plen;
  353. if(su->query.s)
  354. ulen += 1 + (su->query.e - su->query.s);
  355. else if(currentdoc && base->query)
  356. ulen += 1 + strlen(base->query);
  357. if(su->fragment.s)
  358. ulen += 1 + (su->fragment.e - su->fragment.s);
  359. else if(currentdoc && base->fragment)
  360. ulen += 1 + strlen(base->fragment);
  361. url = emalloc(ulen+1);
  362. path = emalloc(plen+1);
  363. url[0] = '\0';
  364. purl = url;
  365. path[0] = '\0';
  366. ppath = path;
  367. if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){
  368. /* Is a "network-path" or "absolute-path"; don't merge with base path */
  369. /* Sec 5.2 steps 4,5 */
  370. if(su->path.s){
  371. memmove(ppath, su->path.s, su->path.e - su->path.s);
  372. ppath += su->path.e - su->path.s;
  373. *ppath = '\0';
  374. }
  375. }else if(currentdoc){
  376. /* Is a current-doc reference; just copy the path from the base URL */
  377. if(base->path){
  378. strcpy(ppath, base->path);
  379. ppath += strlen(ppath);
  380. }
  381. USED(ppath);
  382. }else{
  383. /* Is a relative-path reference; we have to merge it */
  384. /* Sec 5.2 step 6 */
  385. merge_relative_path(base->path,
  386. su->path.s, su->path.e - su->path.s, ppath);
  387. }
  388. /* Build new URL from pieces, inheriting from base where needed */
  389. strcpy(purl, base->scheme);
  390. purl += strlen(purl);
  391. *purl++ = ':';
  392. if(su->authority.s){
  393. strcpy(purl, "//");
  394. purl += strlen(purl);
  395. memmove(purl, su->authority.s, su->authority.e - su->authority.s);
  396. purl += su->authority.e - su->authority.s;
  397. }else if(base->authority){
  398. strcpy(purl, "//");
  399. purl += strlen(purl);
  400. strcpy(purl, base->authority);
  401. purl += strlen(purl);
  402. }
  403. assert((path[0] == '\0') || (path[0] == '/'));
  404. strcpy(purl, path);
  405. purl += strlen(purl);
  406. /*
  407. * The query and fragment are not inherited from the base,
  408. * except in case of "current document" URLs, which inherit any query
  409. * and may inherit the fragment.
  410. */
  411. if(su->query.s){
  412. *purl++ = '?';
  413. memmove(purl, su->query.s, su->query.e - su->query.s);
  414. purl += su->query.e - su->query.s;
  415. }else if(currentdoc && base->query){
  416. *purl++ = '?';
  417. strcpy(purl, base->query);
  418. purl += strlen(purl);
  419. }
  420. if(su->fragment.s){
  421. *purl++ = '#';
  422. memmove(purl, su->query.s, su->query.e - su->query.s);
  423. purl += su->fragment.e - su->fragment.s;
  424. }else if(currentdoc && base->fragment){
  425. *purl++ = '#';
  426. strcpy(purl, base->fragment);
  427. purl += strlen(purl);
  428. }
  429. USED(purl);
  430. if(urldebug)
  431. fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);
  432. free(u->url);
  433. u->url = url;
  434. free(path);
  435. return 0;
  436. }
  437. int
  438. regx(Reprog *prog, char *s, Resub *m, int nm)
  439. {
  440. int i;
  441. if(s == nil)
  442. s = m[0].sp; /* why is this necessary? */
  443. i = regexec(prog, s, m, nm);
  444. /*
  445. if(i >= 0)
  446. for(j=0; j<nm; j++)
  447. fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);
  448. */
  449. return i;
  450. }
  451. static int
  452. ismatch(int i, char *s, char *desc)
  453. {
  454. Resub m[1];
  455. m[0].sp = m[0].ep = nil;
  456. if(!regx(retab[i].prog, s, m, 1)){
  457. werrstr("malformed %s: %q", desc, s);
  458. return 0;
  459. }
  460. return 1;
  461. }
  462. static int
  463. spliturl(char *url, SplitUrl *su)
  464. {
  465. Resub m[MaxResub];
  466. Retab *t;
  467. /*
  468. * Newlines are not valid in a URI, but regexp(2) treats them specially
  469. * so it's best to make sure there are none before proceeding.
  470. */
  471. if(strchr(url, '\n')){
  472. werrstr("newline in URI");
  473. return -1;
  474. }
  475. /*
  476. * Because we use NUL-terminated strings, as do many client and server
  477. * implementations, an escaped NUL ("%00") will quite likely cause problems
  478. * when unescaped. We can check for such a sequence once before examining
  479. * the components because, per RFC2396 sec. 2.4.1 - 2.4.2, '%' is reserved
  480. * in URIs to _always_ indicate escape sequences. Something like "%2500"
  481. * will still get by, but that's legitimate, and if it ends up causing
  482. * a NUL then someone is unescaping too many times.
  483. */
  484. if(strstr(url, "%00")){
  485. werrstr("escaped NUL in URI");
  486. return -1;
  487. }
  488. m[0].sp = m[0].ep = nil;
  489. t = &retab[REsplit];
  490. if(!regx(t->prog, url, m, t->size)){
  491. werrstr("malformed URI: %q", url);
  492. return -1;
  493. }
  494. su->url.s = m[0].sp;
  495. su->url.e = m[0].ep;
  496. su->scheme.s = m[t->ind[0]].sp;
  497. su->scheme.e = m[t->ind[0]].ep;
  498. su->authority.s = m[t->ind[1]].sp;
  499. su->authority.e = m[t->ind[1]].ep;
  500. su->path.s = m[t->ind[2]].sp;
  501. su->path.e = m[t->ind[2]].ep;
  502. su->query.s = m[t->ind[3]].sp;
  503. su->query.e = m[t->ind[3]].ep;
  504. su->fragment.s = m[t->ind[4]].sp;
  505. su->fragment.e = m[t->ind[4]].ep;
  506. if(urldebug)
  507. fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n",
  508. url,
  509. su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "",
  510. su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "",
  511. su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "",
  512. su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "",
  513. su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "",
  514. su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : "");
  515. return 0;
  516. }
  517. static int
  518. parse_scheme(SplitUrl *su, Url *u)
  519. {
  520. if(su->scheme.s == nil){
  521. werrstr("missing scheme");
  522. return -1;
  523. }
  524. u->scheme = estredup(su->scheme.s, su->scheme.e);
  525. strlower(u->scheme);
  526. if(!ismatch(REscheme, u->scheme, "scheme"))
  527. return -1;
  528. u->ischeme = ischeme(u->scheme);
  529. if(urldebug)
  530. fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme);
  531. return 0;
  532. }
  533. static int
  534. parse_unknown_part(SplitUrl *su, Url *u)
  535. {
  536. char *s, *e;
  537. assert(u->ischeme == USunknown);
  538. assert(su->scheme.e[0] == ':');
  539. s = su->scheme.e+1;
  540. if(su->fragment.s){
  541. e = su->fragment.s-1;
  542. assert(*e == '#');
  543. }else
  544. e = s+strlen(s);
  545. u->schemedata = estredup(s, e);
  546. if(!ismatch(REunknowndata, u->schemedata, "unknown scheme data"))
  547. return -1;
  548. return 0;
  549. }
  550. static int
  551. parse_userinfo(char *s, char *e, Url *u)
  552. {
  553. Resub m[MaxResub];
  554. Retab *t;
  555. m[0].sp = s;
  556. m[0].ep = e;
  557. t = &retab[REuserinfo];
  558. if(!regx(t->prog, nil, m, t->size)){
  559. werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s);
  560. return -1;
  561. }
  562. if(m[t->ind[0]].sp)
  563. u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
  564. if(m[t->ind[1]].sp)
  565. u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
  566. return 0;
  567. }
  568. static int
  569. parse_host(char *s, char *e, Url *u)
  570. {
  571. Resub m[MaxResub];
  572. Retab *t;
  573. m[0].sp = s;
  574. m[0].ep = e;
  575. t = &retab[REhost];
  576. if(!regx(t->prog, nil, m, t->size)){
  577. werrstr("malformed host: %.*q", utfnlen(s, e-s), s);
  578. return -1;
  579. }
  580. assert(m[t->ind[0]].sp || m[t->ind[1]].sp);
  581. if(m[t->ind[0]].sp) /* regular */
  582. u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
  583. else
  584. u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
  585. return 0;
  586. }
  587. static int
  588. parse_authority(SplitUrl *su, Url *u)
  589. {
  590. Resub m[MaxResub];
  591. Retab *t;
  592. if(su->authority.s == nil)
  593. return 0;
  594. u->authority = estredup(su->authority.s, su->authority.e);
  595. m[0].sp = m[0].ep = nil;
  596. t = &retab[REauthority];
  597. if(!regx(t->prog, u->authority, m, t->size)){
  598. werrstr("malformed authority: %q", u->authority);
  599. return -1;
  600. }
  601. if(m[t->ind[0]].sp)
  602. if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0)
  603. return -1;
  604. if(m[t->ind[1]].sp)
  605. if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0)
  606. return -1;
  607. if(m[t->ind[2]].sp)
  608. u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep);
  609. return 0;
  610. }
  611. static int
  612. parse_abspath(SplitUrl *su, Url *u)
  613. {
  614. if(su->path.s == nil)
  615. return 0;
  616. u->path = estredup(su->path.s, su->path.e);
  617. if(!ismatch(REabspath, u->path, "absolute path"))
  618. return -1;
  619. return 0;
  620. }
  621. static int
  622. parse_query(SplitUrl *su, Url *u)
  623. {
  624. if(su->query.s == nil)
  625. return 0;
  626. u->query = estredup(su->query.s, su->query.e);
  627. if(!ismatch(REquery, u->query, "query"))
  628. return -1;
  629. return 0;
  630. }
  631. static int
  632. parse_fragment(SplitUrl *su, Url *u)
  633. {
  634. if(su->fragment.s == nil)
  635. return 0;
  636. u->fragment = estredup(su->fragment.s, su->fragment.e);
  637. if(!ismatch(REfragment, u->fragment, "fragment"))
  638. return -1;
  639. return 0;
  640. }
  641. static int
  642. postparse_http(Url *u)
  643. {
  644. u->open = httpopen;
  645. u->read = httpread;
  646. u->close = httpclose;
  647. if(u->authority==nil){
  648. werrstr("missing authority (hostname, port, etc.)");
  649. return -1;
  650. }
  651. if(u->host == nil){
  652. werrstr("missing host specification");
  653. return -1;
  654. }
  655. if(u->path == nil){
  656. u->http.page_spec = estrdup("/");
  657. return 0;
  658. }
  659. if(!ismatch(REhttppath, u->path, "http path"))
  660. return -1;
  661. if(u->query){
  662. u->http.page_spec = emalloc(strlen(u->path)+1+strlen(u->query)+1);
  663. strcpy(u->http.page_spec, u->path);
  664. strcat(u->http.page_spec, "?");
  665. strcat(u->http.page_spec, u->query);
  666. }else
  667. u->http.page_spec = estrdup(u->path);
  668. return 0;
  669. }
  670. static int
  671. postparse_ftp(Url *u)
  672. {
  673. Resub m[MaxResub];
  674. Retab *t;
  675. if(u->authority==nil){
  676. werrstr("missing authority (hostname, port, etc.)");
  677. return -1;
  678. }
  679. if(u->query){
  680. werrstr("unexpected \"?query\" in ftp path");
  681. return -1;
  682. }
  683. if(u->host == nil){
  684. werrstr("missing host specification");
  685. return -1;
  686. }
  687. if(u->path == nil){
  688. u->ftp.path_spec = estrdup("/");
  689. return 0;
  690. }
  691. m[0].sp = m[0].ep = nil;
  692. t = &retab[REftppath];
  693. if(!regx(t->prog, u->path, m, t->size)){
  694. werrstr("malformed ftp path: %q", u->path);
  695. return -1;
  696. }
  697. if(m[t->ind[0]].sp){
  698. u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
  699. if(strchr(u->ftp.path_spec, ';')){
  700. werrstr("unexpected \";param\" in ftp path");
  701. return -1;
  702. }
  703. }else
  704. u->ftp.path_spec = estrdup("/");
  705. if(m[t->ind[1]].sp){
  706. u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
  707. strlower(u->ftp.type);
  708. }
  709. return 0;
  710. }
  711. static int
  712. postparse_file(Url *u)
  713. {
  714. if(u->user || u->passwd){
  715. werrstr("user information not valid with file scheme");
  716. return -1;
  717. }
  718. if(u->query){
  719. werrstr("unexpected \"?query\" in file path");
  720. return -1;
  721. }
  722. if(u->port){
  723. werrstr("port not valid with file scheme");
  724. return -1;
  725. }
  726. if(u->path == nil){
  727. werrstr("missing path in file scheme");
  728. return -1;
  729. }
  730. if(strchr(u->path, ';')){
  731. werrstr("unexpected \";param\" in file path");
  732. return -1;
  733. }
  734. if(!ismatch(REfilepath, u->path, "file path"))
  735. return -1;
  736. /* "localhost" is equivalent to no host spec, we'll chose the latter */
  737. if(u->host && cistrcmp(u->host, "localhost") == 0){
  738. free(u->host);
  739. u->host = nil;
  740. }
  741. return 0;
  742. }
  /*
   * Scheme-specific finishing hooks, indexed by u->ischeme in the same
   * order as schemestrtab; a nil slot means there is no hook.  parseurl
   * only consults the table for indices below nelem(postparse).
   */
  743. static int (*postparse[])(Url*) = {
  744. nil,
  745. postparse_http,
  746. postparse_http,
  747. postparse_ftp,
  748. postparse_file,
  749. };
  750. Url*
  751. parseurl(char *url, Url *base)
  752. {
  753. Url *u;
  754. SplitUrl su;
  755. if(urldebug)
  756. fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>");
  757. u = emalloc(sizeof(Url));
  758. u->url = estrdup(url);
  759. if(spliturl(u->url, &su) < 0){
  760. Fail:
  761. freeurl(u);
  762. return nil;
  763. }
  764. /* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */
  765. if(su.scheme.s==nil){
  766. if(urldebug)
  767. fprint(2, "parseurl has nil scheme\n");
  768. if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0)
  769. goto Fail;
  770. if(u->ischeme == UScurrent){
  771. /* 'u.url' refers to current document; set fragment and return */
  772. if(parse_fragment(&su, u) < 0)
  773. goto Fail;
  774. return u;
  775. }
  776. }
  777. if(parse_scheme(&su, u) < 0
  778. || parse_fragment(&su, u) < 0)
  779. goto Fail;
  780. if(u->ischeme == USunknown){
  781. if(parse_unknown_part(&su, u) < 0)
  782. goto Fail;
  783. return u;
  784. }
  785. if(parse_query(&su, u) < 0
  786. || parse_authority(&su, u) < 0
  787. || parse_abspath(&su, u) < 0)
  788. goto Fail;
  789. if(u->ischeme < nelem(postparse) && postparse[u->ischeme])
  790. if((*postparse[u->ischeme])(u) < 0)
  791. goto Fail;
  792. setmalloctag(u, getcallerpc(&url));
  793. return u;
  794. }
  795. void
  796. freeurl(Url *u)
  797. {
  798. if(u == nil)
  799. return;
  800. free(u->url);
  801. free(u->scheme);
  802. free(u->schemedata);
  803. free(u->authority);
  804. free(u->user);
  805. free(u->passwd);
  806. free(u->host);
  807. free(u->port);
  808. free(u->path);
  809. free(u->query);
  810. free(u->fragment);
  811. switch(u->ischeme){
  812. case UShttp:
  813. free(u->http.page_spec);
  814. break;
  815. case USftp:
  816. free(u->ftp.path_spec);
  817. free(u->ftp.type);
  818. break;
  819. }
  820. free(u);
  821. }
  822. void
  823. rewriteurl(Url *u)
  824. {
  825. char *s;
  826. if(u->schemedata)
  827. s = estrmanydup(u->scheme, ":", u->schemedata, nil);
  828. else
  829. s = estrmanydup(u->scheme, "://",
  830. u->user ? u->user : "",
  831. u->passwd ? ":" : "", u->passwd ? u->passwd : "",
  832. u->user ? "@" : "", u->host ? u->host : "",
  833. u->port ? ":" : "", u->port ? u->port : "",
  834. u->path,
  835. u->query ? "?" : "", u->query ? u->query : "",
  836. u->fragment ? "#" : "", u->fragment ? u->fragment : "",
  837. nil);
  838. free(u->url);
  839. u->url = s;
  840. }
  841. int
  842. seturlquery(Url *u, char *query)
  843. {
  844. if(query == nil){
  845. free(u->query);
  846. u->query = nil;
  847. return 0;
  848. }
  849. if(!ismatch(REquery, query, "query"))
  850. return -1;
  851. free(u->query);
  852. u->query = estrdup(query);
  853. return 0;
  854. }
  855. static void
  856. dupp(char **p)
  857. {
  858. if(*p)
  859. *p = estrdup(*p);
  860. }
  861. Url*
  862. copyurl(Url *u)
  863. {
  864. Url *v;
  865. v = emalloc(sizeof(Url));
  866. *v = *u;
  867. dupp(&v->url);
  868. dupp(&v->scheme);
  869. dupp(&v->schemedata);
  870. dupp(&v->authority);
  871. dupp(&v->user);
  872. dupp(&v->passwd);
  873. dupp(&v->host);
  874. dupp(&v->port);
  875. dupp(&v->path);
  876. dupp(&v->query);
  877. dupp(&v->fragment);
  878. switch(v->ischeme){
  879. case UShttp:
  880. dupp(&v->http.page_spec);
  881. break;
  882. case USftp:
  883. dupp(&v->ftp.path_spec);
  884. dupp(&v->ftp.type);
  885. break;
  886. }
  887. return v;
  888. }
  /*
   * Return the value (0-15) of the hexadecimal digit c, accepting both
   * letter cases.  Any other character yields 0.
   */
  static int
  dhex(char c)
  {
  	int v;

  	v = c;
  	/* fold A-F onto a-f so that one range test covers both cases */
  	if(v >= 'A' && v <= 'F')
  		v += 'a' - 'A';
  	if(v >= 'a' && v <= 'f')
  		return v - 'a' + 10;
  	if(v >= '0' && v <= '9')
  		return v - '0';
  	return 0;
  }
  900. char*
  901. escapeurl(char *s, int (*needesc)(int))
  902. {
  903. int n;
  904. char *t, *u;
  905. Rune r;
  906. static char *hex = "0123456789abcdef";
  907. n = 0;
  908. for(t=s; *t; t++)
  909. if((*needesc)(*t))
  910. n++;
  911. u = emalloc(strlen(s)+2*n+1);
  912. t = u;
  913. for(; *s; s++){
  914. s += chartorune(&r, s);
  915. if(r >= 0xFF){
  916. werrstr("URLs cannot contain Runes > 0xFF");
  917. free(t);
  918. return nil;
  919. }
  920. if((*needesc)(r)){
  921. *u++ = '%';
  922. *u++ = hex[(r>>4)&0xF];
  923. *u++ = hex[r&0xF];
  924. }else
  925. *u++ = r;
  926. }
  927. *u = '\0';
  928. return t;
  929. }
  930. char*
  931. unescapeurl(char *s)
  932. {
  933. char *r, *w;
  934. Rune rune;
  935. s = estrdup(s);
  936. for(r=w=s; *r; r++){
  937. if(*r=='%'){
  938. r++;
  939. if(!isxdigit(r[0]) || !isxdigit(r[1])){
  940. werrstr("bad escape sequence '%.3s' in URL", r);
  941. return nil;
  942. }
  943. if(r[0]=='0' && r[2]=='0'){
  944. werrstr("escaped NUL in URL");
  945. return nil;
  946. }
  947. rune = (dhex(r[0])<<4)|dhex(r[1]); /* latin1 */
  948. w += runetochar(w, &rune);
  949. r += 2;
  950. }else
  951. *w++ = *r;
  952. }
  953. *w = '\0';
  954. return s;
  955. }