12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081 |
- /*
- * This is a URL parser, written to parse "Common Internet Scheme" URL
- * syntax as described in RFC1738 and updated by RFC2396. Only absolute URLs
- * are supported, using "server-based" naming authorities in the schemes.
- * Support for literal IPv6 addresses is included, per RFC2732.
- *
- * Current "known" schemes: http, ftp, file.
- *
- * We can do all the parsing operations without Runes since URLs are
- * defined to be composed of US-ASCII printable characters.
- * See RFC1738, RFC2396.
- */
- #include <u.h>
- #include <libc.h>
- #include <ctype.h>
- #include <regexp.h>
- #include <plumb.h>
- #include <thread.h>
- #include <fcall.h>
- #include <9p.h>
- #include "dat.h"
- #include "fns.h"
/* Nonzero turns on diagnostic tracing (written to fd 2) in the URL code below. */
int urldebug;

/* If set, relative paths with leading ".." segments will have them trimmed. */
#define RemoveExtraRelDotDots	0

/*
 * If set, a reference to the current document is rebuilt against the
 * base URL instead of being returned as soon as it is recognised.
 */
#define ExpandCurrentDocUrls	1
- static char*
- schemestrtab[] =
- {
- nil,
- "http",
- "https",
- "ftp",
- "file",
- };
- static int
- ischeme(char *s)
- {
- int i;
- for(i=0; i<nelem(schemestrtab); i++)
- if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0)
- return i;
- return USunknown;
- }
- /*
- * URI splitting regexp is from RFC2396, Appendix B:
- * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
- * 12 3 4 5 6 7 8 9
- *
- * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
- * $2 = scheme "http"
- * $4 = authority "www.ics.uci.edu"
- * $5 = path "/pub/ietf/uri/"
- * $7 = query <undefined>
- * $9 = fragment "Related"
- */
- /*
- * RFC2396, Sec 3.1, contains:
- *
- * Scheme names consist of a sequence of characters beginning with a
- * lower case letter and followed by any combination of lower case
- * letters, digits, plus ("+"), period ("."), or hyphen ("-"). For
- * resiliency, programs interpreting URI should treat upper case letters
- * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
- * well as "http").
- */
- /*
- * For server-based naming authorities (RFC2396 Sec 3.2.2):
- * server = [ [ userinfo "@" ] hostport ]
- * userinfo = *( unreserved | escaped |
- * ";" | ":" | "&" | "=" | "+" | "$" | "," )
- * hostport = host [ ":" port ]
- * host = hostname | IPv4address
- * hostname = *( domainlabel "." ) toplabel [ "." ]
- * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
- * toplabel = alpha | alpha *( alphanum | "-" ) alphanum
- * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit
- * port = *digit
- *
- * The host is a domain name of a network host, or its IPv4 address as a
- * set of four decimal digit groups separated by ".". Literal IPv6
- * addresses are not supported.
- *
- * Note that literal IPv6 address support is outlined in RFC2732:
- * host = hostname | IPv4address | IPv6reference
- * ipv6reference = "[" IPv6address "]" (RFC2373)
- *
- * Since hostnames and numbers will have to be resolved by the OS anyway,
- * we don't have to parse them too pedantically (counting '.'s, checking
- * for well-formed literal IP addresses, etc.).
- *
- * In FTP/file paths, we reject most ";param"s and querys. In HTTP paths,
- * we just pass them through.
- *
- * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests,
- * we'll say it's 1-or-more characters, 0-or-1 times. This way, an absent
- * path yields a nil substring match, instead of an empty one.
- *
- * We're more restrictive than RFC2396 indicates with "userinfo" strings,
- * insisting they have the form "[user[:password]]". This may need to
- * change at some point, however.
- */
/*
 * Character-class fragments for the regular expressions below.
 * These are meant to be pasted between '[' and ']'.
 */
#define PUNCT	"\\-_.!~*'()"
#define RES	";/?:@&=+$,"
#define ALNUM	"a-zA-Z0-9"
#define HEX	"0-9a-fA-F"
#define UNRES	ALNUM PUNCT

/*
 * Complete regexp fragments.  The _N suffix records how many
 * parenthesized subexpressions the fragment contributes once expanded.
 */
#define ESCAPED_1	"(%[" HEX "][" HEX "])"
#define URIC_2		"([" RES UNRES "]|" ESCAPED_1 ")"
#define URICNOSLASH_2	"([" UNRES ";?:@&=+$,]|" ESCAPED_1 ")"
#define USERINFO_2	"([" UNRES ";:&=+$,]|" ESCAPED_1 ")"
#define PCHAR_2		"([" UNRES ":@&=+$,]|" ESCAPED_1 ")"
#define PSEGCHAR_3	"([/;]|" PCHAR_2 ")"
- typedef struct Retab Retab;
- struct Retab
- {
- char *str;
- Reprog *prog;
- int size;
- int ind[5];
- };
/* Indices into retab[], plus the size of the Resub arrays used with it. */
enum
{
	REsplit		= 0,
	REscheme,
	REunknowndata,
	REauthority,
	REhost,
	REuserinfo,
	REabspath,
	REquery,
	REfragment,
	REhttppath,
	REftppath,
	REfilepath,

	MaxResub	= 20,	/* initurl aborts if any pattern needs more than this */
};
- Retab retab[] = /* view in constant width Font */
- {
- [REsplit]
- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,
- /* |-scheme-| |-auth.-| |path--| |query| |--|frag */
- { 2, 4, 5, 7, 9},
- [REscheme]
- "^[a-z][a-z0-9+-.]*$", nil, 0,
- { 0, },
- [REunknowndata]
- "^" URICNOSLASH_2 URIC_2 "*$", nil, 0,
- { 0, },
- [REauthority]
- "^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
- /* |----user info-----| |--------host----------------| |-port-| */
- { 2, 7, 12, },
- [REhost]
- "^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,
- /* |--regular host--| |-IPv6 literal-| */
- { 2, 4, },
- [REuserinfo]
- "^(([^:]*)(:([^:]*))?)$", nil, 0,
- /* |user-| |pass-| */
- { 2, 4, },
- [REabspath]
- "^/" PSEGCHAR_3 "*$", nil, 0,
- { 0, },
- [REquery]
- "^" URIC_2 "*$", nil, 0,
- { 0, },
- [REfragment]
- "^" URIC_2 "*$", nil, 0,
- { 0, },
- [REhttppath]
- "^.*$", nil, 0,
- { 0, },
- [REftppath]
- "^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
- /*|--|-path |ftptype-| */
- { 1, 3, },
- [REfilepath]
- "^.*$", nil, 0,
- { 0, },
- };
/*
 * Count every '(' byte in s.
 * Escaped parentheses and those inside character classes are counted
 * too, so for a regexp this is an upper bound on its subexpressions.
 */
static int
countleftparen(char *s)
{
	int count;

	count = 0;
	while(*s != '\0'){
		if(*s == '(')
			count++;
		s++;
	}
	return count;
}
- void
- initurl(void)
- {
- int i, j;
- for(i=0; i<nelem(retab); i++){
- retab[i].prog = regcomp(retab[i].str);
- if(retab[i].prog == nil)
- sysfatal("recomp(%s): %r", retab[i].str);
- retab[i].size = countleftparen(retab[i].str)+1;
- for(j=0; j<nelem(retab[i].ind); j++)
- if(retab[i].ind[j] >= retab[i].size)
- sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",
- i, j, retab[i].ind[j], retab[i].size);
- if(MaxResub < retab[i].size)
- sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);
- }
- }
/*
 * The result of splitting a URL string into its top-level parts.
 * Each part is a [s, e) pair of pointers into the string that was
 * split; s is nil for a part that is absent.
 */
typedef struct SplitUrl SplitUrl;
struct SplitUrl
{
	struct {
		char	*s;	/* first byte of the part */
		char	*e;	/* one past the last byte */
	} url, scheme, authority, path, query, fragment;
};
- /*
- * Implements the algorithm in RFC2396 sec 5.2 step 6.
- * Returns number of chars written, excluding NUL terminator.
- * dest is known to be >= strlen(base)+rel_len.
- */
- static void
- merge_relative_path(char *base, char *rel_st, int rel_len, char *dest)
- {
- char *s, *p, *e, *pdest;
- pdest = dest;
- /* 6a: start with base, discard last segment */
- if(base){
- /* Empty paths don't match in our scheme; 'base' should be nil */
- assert(base[0] == '/');
- e = strrchr(base, '/');
- e++;
- memmove(pdest, base, e-base);
- pdest += e-base;
- }else{
- /* Artistic license on my part */
- *pdest++ = '/';
- }
- /* 6b: append relative component */
- if(rel_st){
- memmove(pdest, rel_st, rel_len);
- pdest += rel_len;
- }
- /* 6c: remove any occurrences of "./" as a complete segment */
- s = dest;
- *pdest = '\0';
- while(e = strstr(s, "./")){
- if((e == dest) || (*(e-1) == '/')){
- memmove(e, e+2, pdest+1-(e+2)); /* +1 for NUL */
- pdest -= 2;
- }else
- s = e+1;
- }
- /* 6d: remove a trailing "." as a complete segment */
- if(pdest>dest && *(pdest-1)=='.' &&
- (pdest==dest+1 || *(pdest-2)=='/'))
- *--pdest = '\0';
- /* 6e: remove occurences of "seg/../", where seg != "..", left->right */
- s = dest+1;
- while(e = strstr(s, "/../")){
- p = e - 1;
- while(p >= dest && *p != '/')
- p--;
- if(memcmp(p, "/../", 4) != 0){
- memmove(p+1, e+4, pdest+1-(e+4));
- pdest -= (e+4) - (p+1);
- }else
- s = e+1;
- }
- /* 6f: remove a trailing "seg/..", where seg isn't ".." */
- if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){
- p = pdest-3 - 1;
- while(p >= dest && *p != '/')
- p--;
- if(memcmp(p, "/../", 4) != 0){
- pdest = p+1;
- *pdest = '\0';
- }
- }
- /* 6g: leading ".." segments are errors -- we'll just blat them out. */
- if(RemoveExtraRelDotDots){
- p = dest;
- if (p[0] == '/')
- p++;
- s = p;
- while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))
- s += 3;
- if(s > p){
- memmove(p, s, pdest+1-s);
- pdest -= s-p;
- }
- }
- USED(pdest);
- if(urldebug)
- fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len,
- rel_st, dest);
- }
- /*
- * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form.
- *
- * If successful, this just ends up freeing and replacing "u->url".
- */
- static int
- resolve_relative(SplitUrl *su, Url *base, Url *u)
- {
- char *url, *path;
- char *purl, *ppath;
- int currentdoc, ulen, plen;
- if(base == nil){
- werrstr("relative URI given without base");
- return -1;
- }
- if(base->scheme == nil){
- werrstr("relative URI given with no scheme");
- return -1;
- }
- if(base->ischeme == USunknown){
- werrstr("relative URI given with unknown scheme");
- return -1;
- }
- if(base->ischeme == UScurrent){
- werrstr("relative URI given with incomplete base");
- return -1;
- }
- assert(su->scheme.s == nil);
- /* Sec 5.2 step 2 */
- currentdoc = 0;
- if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){
- /* Reference is to current document */
- if(urldebug)
- fprint(2, "url %s is relative to current document\n", u->url);
- u->ischeme = UScurrent;
- if(!ExpandCurrentDocUrls)
- return 0;
- currentdoc = 1;
- }
-
- /* Over-estimate the maximum lengths, for allocation purposes */
- /* (constants are for separators) */
- plen = 1;
- if(base->path)
- plen += strlen(base->path);
- if(su->path.s)
- plen += 1 + (su->path.e - su->path.s);
- ulen = 0;
- ulen += strlen(base->scheme) + 1;
- if(su->authority.s)
- ulen += 2 + (su->authority.e - su->authority.s);
- else
- ulen += 2 + ((base->authority) ? strlen(base->authority) : 0);
- ulen += plen;
- if(su->query.s)
- ulen += 1 + (su->query.e - su->query.s);
- else if(currentdoc && base->query)
- ulen += 1 + strlen(base->query);
- if(su->fragment.s)
- ulen += 1 + (su->fragment.e - su->fragment.s);
- else if(currentdoc && base->fragment)
- ulen += 1 + strlen(base->fragment);
- url = emalloc(ulen+1);
- path = emalloc(plen+1);
- url[0] = '\0';
- purl = url;
- path[0] = '\0';
- ppath = path;
- if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){
- /* Is a "network-path" or "absolute-path"; don't merge with base path */
- /* Sec 5.2 steps 4,5 */
- if(su->path.s){
- memmove(ppath, su->path.s, su->path.e - su->path.s);
- ppath += su->path.e - su->path.s;
- *ppath = '\0';
- }
- }else if(currentdoc){
- /* Is a current-doc reference; just copy the path from the base URL */
- if(base->path){
- strcpy(ppath, base->path);
- ppath += strlen(ppath);
- }
- USED(ppath);
- }else{
- /* Is a relative-path reference; we have to merge it */
- /* Sec 5.2 step 6 */
- merge_relative_path(base->path,
- su->path.s, su->path.e - su->path.s, ppath);
- }
- /* Build new URL from pieces, inheriting from base where needed */
- strcpy(purl, base->scheme);
- purl += strlen(purl);
- *purl++ = ':';
- if(su->authority.s){
- strcpy(purl, "//");
- purl += strlen(purl);
- memmove(purl, su->authority.s, su->authority.e - su->authority.s);
- purl += su->authority.e - su->authority.s;
- }else if(base->authority){
- strcpy(purl, "//");
- purl += strlen(purl);
- strcpy(purl, base->authority);
- purl += strlen(purl);
- }
- assert((path[0] == '\0') || (path[0] == '/'));
- strcpy(purl, path);
- purl += strlen(purl);
- /*
- * The query and fragment are not inherited from the base,
- * except in case of "current document" URLs, which inherit any query
- * and may inherit the fragment.
- */
- if(su->query.s){
- *purl++ = '?';
- memmove(purl, su->query.s, su->query.e - su->query.s);
- purl += su->query.e - su->query.s;
- }else if(currentdoc && base->query){
- *purl++ = '?';
- strcpy(purl, base->query);
- purl += strlen(purl);
- }
- if(su->fragment.s){
- *purl++ = '#';
- memmove(purl, su->query.s, su->query.e - su->query.s);
- purl += su->fragment.e - su->fragment.s;
- }else if(currentdoc && base->fragment){
- *purl++ = '#';
- strcpy(purl, base->fragment);
- purl += strlen(purl);
- }
- USED(purl);
- if(urldebug)
- fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);
- free(u->url);
- u->url = url;
- free(path);
- return 0;
- }
- int
- regx(Reprog *prog, char *s, Resub *m, int nm)
- {
- int i;
- if(s == nil)
- s = m[0].sp; /* why is this necessary? */
- i = regexec(prog, s, m, nm);
- /*
- if(i >= 0)
- for(j=0; j<nm; j++)
- fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);
- */
- return i;
- }
- static int
- ismatch(int i, char *s, char *desc)
- {
- Resub m[1];
- m[0].sp = m[0].ep = nil;
- if(!regx(retab[i].prog, s, m, 1)){
- werrstr("malformed %s: %q", desc, s);
- return 0;
- }
- return 1;
- }
- static int
- spliturl(char *url, SplitUrl *su)
- {
- Resub m[MaxResub];
- Retab *t;
- /*
- * Newlines are not valid in a URI, but regexp(2) treats them specially
- * so it's best to make sure there are none before proceeding.
- */
- if(strchr(url, '\n')){
- werrstr("newline in URI");
- return -1;
- }
- /*
- * Because we use NUL-terminated strings, as do many client and server
- * implementations, an escaped NUL ("%00") will quite likely cause problems
- * when unescaped. We can check for such a sequence once before examining
- * the components because, per RFC2396 sec. 2.4.1 - 2.4.2, '%' is reserved
- * in URIs to _always_ indicate escape sequences. Something like "%2500"
- * will still get by, but that's legitimate, and if it ends up causing
- * a NUL then someone is unescaping too many times.
- */
- if(strstr(url, "%00")){
- werrstr("escaped NUL in URI");
- return -1;
- }
- m[0].sp = m[0].ep = nil;
- t = &retab[REsplit];
- if(!regx(t->prog, url, m, t->size)){
- werrstr("malformed URI: %q", url);
- return -1;
- }
- su->url.s = m[0].sp;
- su->url.e = m[0].ep;
- su->scheme.s = m[t->ind[0]].sp;
- su->scheme.e = m[t->ind[0]].ep;
- su->authority.s = m[t->ind[1]].sp;
- su->authority.e = m[t->ind[1]].ep;
- su->path.s = m[t->ind[2]].sp;
- su->path.e = m[t->ind[2]].ep;
- su->query.s = m[t->ind[3]].sp;
- su->query.e = m[t->ind[3]].ep;
- su->fragment.s = m[t->ind[4]].sp;
- su->fragment.e = m[t->ind[4]].ep;
- if(urldebug)
- fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n",
- url,
- su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "",
- su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "",
- su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "",
- su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "",
- su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "",
- su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : "");
- return 0;
- }
- static int
- parse_scheme(SplitUrl *su, Url *u)
- {
- if(su->scheme.s == nil){
- werrstr("missing scheme");
- return -1;
- }
- u->scheme = estredup(su->scheme.s, su->scheme.e);
- strlower(u->scheme);
- if(!ismatch(REscheme, u->scheme, "scheme"))
- return -1;
- u->ischeme = ischeme(u->scheme);
- if(urldebug)
- fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme);
- return 0;
- }
- static int
- parse_unknown_part(SplitUrl *su, Url *u)
- {
- char *s, *e;
- assert(u->ischeme == USunknown);
- assert(su->scheme.e[0] == ':');
- s = su->scheme.e+1;
- if(su->fragment.s){
- e = su->fragment.s-1;
- assert(*e == '#');
- }else
- e = s+strlen(s);
- u->schemedata = estredup(s, e);
- if(!ismatch(REunknowndata, u->schemedata, "unknown scheme data"))
- return -1;
- return 0;
- }
- static int
- parse_userinfo(char *s, char *e, Url *u)
- {
- Resub m[MaxResub];
- Retab *t;
- m[0].sp = s;
- m[0].ep = e;
- t = &retab[REuserinfo];
- if(!regx(t->prog, nil, m, t->size)){
- werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s);
- return -1;
- }
- if(m[t->ind[0]].sp)
- u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
- if(m[t->ind[1]].sp)
- u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
- return 0;
- }
- static int
- parse_host(char *s, char *e, Url *u)
- {
- Resub m[MaxResub];
- Retab *t;
- m[0].sp = s;
- m[0].ep = e;
- t = &retab[REhost];
- if(!regx(t->prog, nil, m, t->size)){
- werrstr("malformed host: %.*q", utfnlen(s, e-s), s);
- return -1;
- }
- assert(m[t->ind[0]].sp || m[t->ind[1]].sp);
- if(m[t->ind[0]].sp) /* regular */
- u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
- else
- u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
- return 0;
- }
- static int
- parse_authority(SplitUrl *su, Url *u)
- {
- Resub m[MaxResub];
- Retab *t;
- if(su->authority.s == nil)
- return 0;
- u->authority = estredup(su->authority.s, su->authority.e);
- m[0].sp = m[0].ep = nil;
- t = &retab[REauthority];
- if(!regx(t->prog, u->authority, m, t->size)){
- werrstr("malformed authority: %q", u->authority);
- return -1;
- }
- if(m[t->ind[0]].sp)
- if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0)
- return -1;
- if(m[t->ind[1]].sp)
- if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0)
- return -1;
- if(m[t->ind[2]].sp)
- u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep);
- return 0;
- }
- static int
- parse_abspath(SplitUrl *su, Url *u)
- {
- if(su->path.s == nil)
- return 0;
- u->path = estredup(su->path.s, su->path.e);
- if(!ismatch(REabspath, u->path, "absolute path"))
- return -1;
- return 0;
- }
- static int
- parse_query(SplitUrl *su, Url *u)
- {
- if(su->query.s == nil)
- return 0;
- u->query = estredup(su->query.s, su->query.e);
- if(!ismatch(REquery, u->query, "query"))
- return -1;
- return 0;
- }
- static int
- parse_fragment(SplitUrl *su, Url *u)
- {
- if(su->fragment.s == nil)
- return 0;
- u->fragment = estredup(su->fragment.s, su->fragment.e);
- if(!ismatch(REfragment, u->fragment, "fragment"))
- return -1;
- return 0;
- }
- static int
- postparse_http(Url *u)
- {
- u->open = httpopen;
- u->read = httpread;
- u->close = httpclose;
- if(u->authority==nil){
- werrstr("missing authority (hostname, port, etc.)");
- return -1;
- }
- if(u->host == nil){
- werrstr("missing host specification");
- return -1;
- }
- if(u->path == nil){
- u->http.page_spec = estrdup("/");
- return 0;
- }
- if(!ismatch(REhttppath, u->path, "http path"))
- return -1;
- if(u->query){
- u->http.page_spec = emalloc(strlen(u->path)+1+strlen(u->query)+1);
- strcpy(u->http.page_spec, u->path);
- strcat(u->http.page_spec, "?");
- strcat(u->http.page_spec, u->query);
- }else
- u->http.page_spec = estrdup(u->path);
- return 0;
- }
- static int
- postparse_ftp(Url *u)
- {
- Resub m[MaxResub];
- Retab *t;
- if(u->authority==nil){
- werrstr("missing authority (hostname, port, etc.)");
- return -1;
- }
- if(u->query){
- werrstr("unexpected \"?query\" in ftp path");
- return -1;
- }
- if(u->host == nil){
- werrstr("missing host specification");
- return -1;
- }
- if(u->path == nil){
- u->ftp.path_spec = estrdup("/");
- return 0;
- }
- m[0].sp = m[0].ep = nil;
- t = &retab[REftppath];
- if(!regx(t->prog, u->path, m, t->size)){
- werrstr("malformed ftp path: %q", u->path);
- return -1;
- }
- if(m[t->ind[0]].sp){
- u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
- if(strchr(u->ftp.path_spec, ';')){
- werrstr("unexpected \";param\" in ftp path");
- return -1;
- }
- }else
- u->ftp.path_spec = estrdup("/");
- if(m[t->ind[1]].sp){
- u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
- strlower(u->ftp.type);
- }
- return 0;
- }
- static int
- postparse_file(Url *u)
- {
- if(u->user || u->passwd){
- werrstr("user information not valid with file scheme");
- return -1;
- }
- if(u->query){
- werrstr("unexpected \"?query\" in file path");
- return -1;
- }
- if(u->port){
- werrstr("port not valid with file scheme");
- return -1;
- }
- if(u->path == nil){
- werrstr("missing path in file scheme");
- return -1;
- }
- if(strchr(u->path, ';')){
- werrstr("unexpected \";param\" in file path");
- return -1;
- }
- if(!ismatch(REfilepath, u->path, "file path"))
- return -1;
- /* "localhost" is equivalent to no host spec, we'll chose the latter */
- if(u->host && cistrcmp(u->host, "localhost") == 0){
- free(u->host);
- u->host = nil;
- }
- return 0;
- }
- static int (*postparse[])(Url*) = {
- nil,
- postparse_http,
- postparse_http,
- postparse_ftp,
- postparse_file,
- };
- Url*
- parseurl(char *url, Url *base)
- {
- Url *u;
- SplitUrl su;
- if(urldebug)
- fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>");
- u = emalloc(sizeof(Url));
- u->url = estrdup(url);
- if(spliturl(u->url, &su) < 0){
- Fail:
- freeurl(u);
- return nil;
- }
- /* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */
- if(su.scheme.s==nil){
- if(urldebug)
- fprint(2, "parseurl has nil scheme\n");
- if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0)
- goto Fail;
- if(u->ischeme == UScurrent){
- /* 'u.url' refers to current document; set fragment and return */
- if(parse_fragment(&su, u) < 0)
- goto Fail;
- return u;
- }
- }
- if(parse_scheme(&su, u) < 0
- || parse_fragment(&su, u) < 0)
- goto Fail;
- if(u->ischeme == USunknown){
- if(parse_unknown_part(&su, u) < 0)
- goto Fail;
- return u;
- }
- if(parse_query(&su, u) < 0
- || parse_authority(&su, u) < 0
- || parse_abspath(&su, u) < 0)
- goto Fail;
- if(u->ischeme < nelem(postparse) && postparse[u->ischeme])
- if((*postparse[u->ischeme])(u) < 0)
- goto Fail;
- setmalloctag(u, getcallerpc(&url));
- return u;
- }
- void
- freeurl(Url *u)
- {
- if(u == nil)
- return;
- free(u->url);
- free(u->scheme);
- free(u->schemedata);
- free(u->authority);
- free(u->user);
- free(u->passwd);
- free(u->host);
- free(u->port);
- free(u->path);
- free(u->query);
- free(u->fragment);
- switch(u->ischeme){
- case UShttp:
- free(u->http.page_spec);
- break;
- case USftp:
- free(u->ftp.path_spec);
- free(u->ftp.type);
- break;
- }
- free(u);
- }
- void
- rewriteurl(Url *u)
- {
- char *s;
- if(u->schemedata)
- s = estrmanydup(u->scheme, ":", u->schemedata, nil);
- else
- s = estrmanydup(u->scheme, "://",
- u->user ? u->user : "",
- u->passwd ? ":" : "", u->passwd ? u->passwd : "",
- u->user ? "@" : "", u->host ? u->host : "",
- u->port ? ":" : "", u->port ? u->port : "",
- u->path,
- u->query ? "?" : "", u->query ? u->query : "",
- u->fragment ? "#" : "", u->fragment ? u->fragment : "",
- nil);
- free(u->url);
- u->url = s;
- }
- int
- seturlquery(Url *u, char *query)
- {
- if(query == nil){
- free(u->query);
- u->query = nil;
- return 0;
- }
- if(!ismatch(REquery, query, "query"))
- return -1;
- free(u->query);
- u->query = estrdup(query);
- return 0;
- }
- static void
- dupp(char **p)
- {
- if(*p)
- *p = estrdup(*p);
- }
- Url*
- copyurl(Url *u)
- {
- Url *v;
- v = emalloc(sizeof(Url));
- *v = *u;
- dupp(&v->url);
- dupp(&v->scheme);
- dupp(&v->schemedata);
- dupp(&v->authority);
- dupp(&v->user);
- dupp(&v->passwd);
- dupp(&v->host);
- dupp(&v->port);
- dupp(&v->path);
- dupp(&v->query);
- dupp(&v->fragment);
- switch(v->ischeme){
- case UShttp:
- dupp(&v->http.page_spec);
- break;
- case USftp:
- dupp(&v->ftp.path_spec);
- dupp(&v->ftp.type);
- break;
- }
- return v;
- }
/*
 * Value of the hexadecimal digit c (either case).
 * Anything that is not a hex digit yields 0.
 */
static int
dhex(char c)
{
	int v;

	v = c;
	if(v >= '0' && v <= '9')
		return v - '0';

	v |= 0x20;	/* fold ASCII upper case onto lower case */
	if(v >= 'a' && v <= 'f')
		return v - 'a' + 10;
	return 0;
}
- char*
- escapeurl(char *s, int (*needesc)(int))
- {
- int n;
- char *t, *u;
- Rune r;
- static char *hex = "0123456789abcdef";
- n = 0;
- for(t=s; *t; t++)
- if((*needesc)(*t))
- n++;
- u = emalloc(strlen(s)+2*n+1);
- t = u;
- for(; *s; s++){
- s += chartorune(&r, s);
- if(r >= 0xFF){
- werrstr("URLs cannot contain Runes > 0xFF");
- free(t);
- return nil;
- }
- if((*needesc)(r)){
- *u++ = '%';
- *u++ = hex[(r>>4)&0xF];
- *u++ = hex[r&0xF];
- }else
- *u++ = r;
- }
- *u = '\0';
- return t;
- }
- char*
- unescapeurl(char *s)
- {
- char *r, *w;
- Rune rune;
- s = estrdup(s);
- for(r=w=s; *r; r++){
- if(*r=='%'){
- r++;
- if(!isxdigit(r[0]) || !isxdigit(r[1])){
- werrstr("bad escape sequence '%.3s' in URL", r);
- return nil;
- }
- if(r[0]=='0' && r[2]=='0'){
- werrstr("escaped NUL in URL");
- return nil;
- }
- rune = (dhex(r[0])<<4)|dhex(r[1]); /* latin1 */
- w += runetochar(w, &rune);
- r += 2;
- }else
- *w++ = *r;
- }
- *w = '\0';
- return s;
- }
|