# %PDF- %PDF-
# Direktori : /proc/985914/root/lib/python3/dist-packages/mechanize/ |
# Current File : //proc/985914/root/lib/python3/dist-packages/mechanize/_rfc3986.py |
"""RFC 3986 URI parsing and relative reference resolution / absolutization. (aka splitting and joining) Copyright 2006 John J. Lee <jjl@pobox.com> This code is free software; you can redistribute it and/or modify it under the terms of the BSD or ZPL 2.1 licenses (see the file LICENSE included with the distribution). """ # XXX Wow, this is ugly. Overly-direct translation of the RFC ATM. from __future__ import absolute_import import re from .polyglot import quote # def chr_range(a, b): # return "".join(map(chr, range(ord(a), ord(b)+1))) # UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" # "abcdefghijklmnopqrstuvwxyz" # "0123456789" # "-_.~") # RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]" # URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%' # this re matches any character that's not in URI_CHARS BAD_URI_CHARS_RE = re.compile(r"[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]") def clean_url(url, encoding='utf-8'): # percent-encode illegal URI characters # Trying to come up with test cases for this gave me a headache, revisit # when do switch to unicode. # Somebody else's comments (lost the attribution): # - IE will return you the url in the encoding you send it # - Mozilla/Firefox will send you latin-1 if there's no non latin-1 # characters in your link. It will send you utf-8 however if there are... 
is_unicode = not isinstance(url, bytes) if not is_unicode: url = url.decode(encoding, "replace") url = url.strip() # for second param to urllib.quote(), we want URI_CHARS, minus the # 'always_safe' characters that urllib.quote() never percent-encodes ans = quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~") if is_unicode and isinstance(ans, bytes): ans = ans.decode(encoding) return ans def is_clean_uri(uri): """ >>> is_clean_uri("ABC!") True >>> is_clean_uri(u"ABC!") True >>> is_clean_uri("ABC|") False >>> is_clean_uri(u"ABC|") False >>> is_clean_uri("http://example.com/0") True >>> is_clean_uri(u"http://example.com/0") True """ # note module re treats bytestrings as through they were decoded as latin-1 # so this function accepts both unicode and bytestrings return not bool(BAD_URI_CHARS_RE.search(uri)) SPLIT_MATCH = re.compile( r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match def urlsplit(absolute_uri): """Return scheme, authority, path, query, fragment.""" match = SPLIT_MATCH(absolute_uri) if match: g = match.groups() return g[1], g[3], g[4], g[6], g[8] def urlunsplit(parts): scheme, authority, path, query, fragment = parts r = [] append = r.append if scheme is not None: append(scheme) append(":") if authority is not None: append("//") append(authority) append(path) if query is not None: append("?") append(query) if fragment is not None: append("#") append(fragment) return "".join(r) def urljoin(base_uri, uri_reference): """Join a base URI with a URI reference and return the resulting URI. See RFC 3986. 
""" return urlunsplit(urljoin_parts(urlsplit(base_uri), urlsplit(uri_reference))) # oops, this doesn't do the same thing as the literal translation # from the RFC below # import posixpath # def urljoin_parts(base_parts, reference_parts): # scheme, authority, path, query, fragment = base_parts # rscheme, rauthority, rpath, rquery, rfragment = reference_parts # compute target URI path # if rpath == "": # tpath = path # else: # tpath = rpath # if not tpath.startswith("/"): # tpath = merge(authority, path, tpath) # tpath = posixpath.normpath(tpath) # if rscheme is not None: # return (rscheme, rauthority, tpath, rquery, rfragment) # elif rauthority is not None: # return (scheme, rauthority, tpath, rquery, rfragment) # elif rpath == "": # if rquery is not None: # tquery = rquery # else: # tquery = query # return (scheme, authority, tpath, tquery, rfragment) # else: # return (scheme, authority, tpath, rquery, rfragment) def urljoin_parts(base_parts, reference_parts): scheme, authority, path, query, fragment = base_parts rscheme, rauthority, rpath, rquery, rfragment = reference_parts if rscheme == scheme: rscheme = None if rscheme is not None: tscheme, tauthority, tpath, tquery = ( rscheme, rauthority, remove_dot_segments(rpath), rquery) else: if rauthority is not None: tauthority, tpath, tquery = ( rauthority, remove_dot_segments(rpath), rquery) else: if rpath == "": tpath = path if rquery is not None: tquery = rquery else: tquery = query else: if rpath.startswith("/"): tpath = remove_dot_segments(rpath) else: tpath = merge(authority, path, rpath) tpath = remove_dot_segments(tpath) tquery = rquery tauthority = authority tscheme = scheme tfragment = rfragment return (tscheme, tauthority, tpath, tquery, tfragment) # um, something *vaguely* like this is what I want, but I have to generate # lots of test cases first, if only to understand what it is that # remove_dot_segments really does... 
# def remove_dot_segments(path):
#     if path == '':
#         return ''
#     comps = path.split('/')
#     new_comps = []
#     for comp in comps:
#         if comp in ['.', '']:
#             if not new_comps or new_comps[-1]:
#                 new_comps.append('')
#             continue
#         if comp != '..':
#             new_comps.append(comp)
#         elif new_comps:
#             new_comps.pop()
#     return '/'.join(new_comps)


def remove_dot_segments(path):
    """Return *path* with its "." and ".." segments interpreted and removed.

    The input is consumed from the left, one rule per loop iteration, while
    finished segments (each including its leading "/", if it has one)
    accumulate in an output list.  The letters in the comments label the
    rules that are tried, in order, on each iteration.

    >>> remove_dot_segments("/a/b/c/./../../g")
    '/a/g'
    >>> remove_dot_segments("mid/content=5/../6")
    'mid/6'
    """
    r = []
    while path:
        # A: drop a leading "../" or "./"
        if path.startswith("../"):
            path = path[3:]
            continue
        if path.startswith("./"):
            path = path[2:]
            continue
        # B: a leading "/./" (or a bare "/.") collapses to "/"
        if path.startswith("/./"):
            path = path[2:]
            continue
        if path == "/.":
            path = "/"
            continue
        # C: a leading "/../" (or a bare "/..") collapses to "/" and also
        # discards the segment most recently moved to the output, if any
        if path.startswith("/../"):
            path = path[3:]
            if r:
                r.pop()
            continue
        if path == "/..":
            path = "/"
            if r:
                r.pop()
            continue
        # D: a lone "." or ".." is simply discarded
        if path == ".":
            path = path[1:]
            continue
        if path == "..":
            path = path[2:]
            continue
        # E: move the first segment (with its leading "/", if present) to
        # the output; it ends just before the next "/" or at end of input
        start = 0
        if path.startswith("/"):
            start = 1
        ii = path.find("/", start)
        if ii < 0:
            ii = None
        r.append(path[:ii])
        if ii is None:
            break
        path = path[ii:]
    return "".join(r)


def merge(base_authority, base_path, ref_path):
    """Merge a relative reference path with the path of the base URI.

    An empty base path yields "/" + ref_path.  Otherwise ref_path replaces
    everything after the last "/" of base_path, or the whole of base_path
    if it contains no "/".

    *base_authority* is accepted but deliberately not consulted (see the
    comment below).
    """
    # XXXX Oddly, the sample Perl implementation of this by Roy Fielding
    # doesn't even take base_authority as a parameter, despite the wording in
    # the RFC suggesting otherwise.  Perhaps I'm missing some obvious identity.
    # if base_authority is not None and base_path == "":
    if base_path == "":
        return "/" + ref_path
    ii = base_path.rfind("/")
    if ii >= 0:
        return base_path[:ii + 1] + ref_path
    return ref_path


if __name__ == "__main__":
    import doctest
    doctest.testmod()