"""HTTP related handlers.
Note that some other HTTP handlers live in more specific modules: _auth.py,
_gzip.py, etc.
Copyright 2002-2006 John J Lee <jjl@pobox.com>
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
LICENSE included with the distribution).
"""
from __future__ import absolute_import
import logging
import socket
import time
from io import BytesIO
from . import _rfc3986, _sockettimeout
from ._headersutil import is_html
from ._request import Request
from ._response import response_seek_wrapper
from ._urllib2_fork import BaseHandler, HTTPError
from ._equiv import HTTPEquivParser
from .polyglot import create_response_info, RobotFileParser, is_py2, as_unicode
debug = logging.getLogger("mechanize").debug
debug_robots = logging.getLogger("mechanize.robots").debug


def parse_head(fileobj):
    """Return a list of key, value pairs."""
    p = HTTPEquivParser(fileobj.read(4096))
    return p()
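
# A minimal sketch of what parse_head() consumes, assuming the HTML is
# wrapped in a BytesIO (the markup and the expected pairs below are
# illustrative, not taken from the mechanize test suite):
#
#   from io import BytesIO
#   html = BytesIO(b'<html><head><meta http-equiv="Refresh"'
#                  b' content="5; url=/next"></head><body></body></html>')
#   for name, value in parse_head(html):
#       print(name, value)   # roughly: b'refresh' b'5; url=/next'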


class HTTPEquivProcessor(BaseHandler):
    """Append META HTTP-EQUIV headers to regular HTTP headers."""

    handler_order = 300  # before handlers that look at HTTP headers

    def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, True):
            try:
                try:
                    html_headers = parse_head(response)
                finally:
                    response.seek(0)
            except Exception:
                pass
            else:
                for hdr, val in html_headers:
                    if is_py2:
                        # add a header
                        http_message.dict[hdr.lower()] = val
                        text = hdr + b": " + val
                        for line in text.split(b"\n"):
                            http_message.headers.append(line + b"\n")
                    else:
                        hdr = hdr.decode('iso-8859-1')
                        http_message[hdr] = val.decode('iso-8859-1')
        return response

    https_response = http_response
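
# Usage sketch (hedged): wire the processor into an opener so that META
# HTTP-EQUIV headers appear in response.info() alongside the real HTTP
# headers.  The URL is a placeholder; mechanize.build_opener() accepts
# handler instances much like urllib2's build_opener():
#
#   import mechanize
#   opener = mechanize.build_opener(mechanize.HTTPEquivProcessor())
#   response = opener.open("http://example.com/")
#   response.info().get("refresh")   # may now come from a META tag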


class MechanizeRobotFileParser(RobotFileParser):

    def __init__(self, url='', opener=None):
        RobotFileParser.__init__(self, url)
        self._opener = opener
        self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT

    def set_opener(self, opener=None):
        from . import _opener
        if opener is None:
            opener = _opener.OpenerDirector()
        self._opener = opener

    def set_timeout(self, timeout):
        self._timeout = timeout

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        if self._opener is None:
            self.set_opener()
        req = Request(self.url, unverifiable=True, visit=False,
                      timeout=self._timeout)
        try:
            f = self._opener.open(req)
        except HTTPError as err:
            f = err
        except (IOError, socket.error, OSError) as exc:
            debug_robots("ignoring error opening %r: %s" %
                         (self.url, exc))
            return
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        status = f.code
        if status == 401 or status == 403:
            self.disallow_all = True
            debug_robots("disallow all")
        elif status >= 400:
            self.allow_all = True
            debug_robots("allow all")
        elif status == 200 and lines:
            debug_robots("parse lines")
            if is_py2:
                self.parse(lines)
            else:
                self.parse(map(as_unicode, lines))
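
# Standalone usage sketch, following the stdlib RobotFileParser API
# (set_url()/read()/can_fetch()); the host is a placeholder:
#
#   rfp = MechanizeRobotFileParser()
#   rfp.set_url("http://example.com/robots.txt")
#   rfp.read()       # 401/403 -> disallow all; other >= 400 -> allow all
#   rfp.can_fetch("my-agent", "http://example.com/private/page.html")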


class RobotExclusionError(HTTPError):

    def __init__(self, request, *args):
        HTTPError.__init__(self, *args)
        self.request = request


class HTTPRobotRulesProcessor(BaseHandler):
    # before redirections, after everything else
    handler_order = 800

    http_response_class = None

    def __init__(self, rfp_class=MechanizeRobotFileParser):
        self.rfp_class = rfp_class
        self.rfp = None
        self._host = None

    def __copy__(self):
        return self.__class__(self.rfp_class)

    def http_request(self, request):
        scheme = request.get_type()
        if scheme not in ["http", "https"]:
            # robots exclusion only applies to HTTP
            return request

        if request.get_selector() == "/robots.txt":
            # /robots.txt is always OK to fetch
            return request

        host = request.get_host()

        # robots.txt requests don't need to be allowed by robots.txt :-)
        origin_req = getattr(request, "_origin_req", None)
        if (origin_req is not None and
                origin_req.get_selector() == "/robots.txt" and
                origin_req.get_host() == host):
            return request

        if host != self._host:
            self.rfp = self.rfp_class()
            try:
                self.rfp.set_opener(self.parent)
            except AttributeError:
                debug("%r instance does not support set_opener" %
                      self.rfp.__class__)
            self.rfp.set_url(scheme + "://" + host + "/robots.txt")
            self.rfp.set_timeout(request.timeout)
            self.rfp.read()
            self._host = host

        ua = request.get_header("User-agent", "")
        if self.rfp.can_fetch(ua, request.get_full_url()):
            return request
        else:
            # XXX This should really have raised URLError.  Too late now...
            factory = self.http_response_class or create_response_info
            msg = b"request disallowed by robots.txt"
            raise RobotExclusionError(
                request,
                request.get_full_url(),
                403, msg,
                factory(BytesIO()), BytesIO(msg))

    https_request = http_request
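
# Callers usually meet this handler through the exception it raises.  A
# hedged sketch of handling a robots.txt denial (the URL is a placeholder;
# mechanize.Browser installs this handler by default):
#
#   import mechanize
#   br = mechanize.Browser()
#   try:
#       br.open("http://example.com/disallowed-by-robots")
#   except mechanize.RobotExclusionError as err:
#       print(err.code, err.request.get_full_url())   # 403 and the URL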


class HTTPRefererProcessor(BaseHandler):
    """Add Referer header to requests.

    This only makes sense if you use each RefererProcessor for a single
    chain of requests (so, for example, if you use a single
    HTTPRefererProcessor to fetch a series of URLs extracted from a single
    page, this will break).

    There's a proper implementation of this in mechanize.Browser.

    """

    def __init__(self):
        self.referer = None

    def http_request(self, request):
        if ((self.referer is not None) and
                not request.has_header("Referer")):
            request.add_unredirected_header("Referer", self.referer)
        return request

    def http_response(self, request, response):
        self.referer = response.geturl()
        return response

    https_request = http_request
    https_response = http_response
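
# Hedged sketch of the single-chain usage the docstring describes: each
# response's URL becomes the Referer of the next request, so requests must
# be made strictly one after another (URLs are placeholders):
#
#   import mechanize
#   opener = mechanize.build_opener(HTTPRefererProcessor())
#   opener.open("http://example.com/page1")   # no Referer sent
#   opener.open("http://example.com/page2")   # Referer: .../page1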


def clean_refresh_url(url):
    # e.g. Firefox 1.5 does (something like) this
    if ((url.startswith('"') and url.endswith('"')) or
            (url.startswith("'") and url.endswith("'"))):
        url = url[1:-1]
    return _rfc3986.clean_url(url, 'utf-8')  # XXX encoding
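
# For example (hedged, output approximate): quotes that some browsers
# tolerate around Refresh URLs are stripped before RFC 3986 cleanup:
#
#   clean_refresh_url("'http://example.com/'")   # -> 'http://example.com/'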


def parse_refresh_header(refresh):
    """
    >>> parse_refresh_header("1; url=http://example.com/")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1; url='http://example.com/'")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1")
    (1.0, None)
    >>> parse_refresh_header("blah")  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    ValueError: invalid literal for float(): blah
    """
    ii = refresh.find(";")
    if ii != -1:
        pause, newurl_spec = float(refresh[:ii]), refresh[ii + 1:]
        jj = newurl_spec.find("=")
        key = None
        if jj != -1:
            key, newurl = newurl_spec[:jj], newurl_spec[jj + 1:]
            newurl = clean_refresh_url(newurl)
        if key is None or key.strip().lower() != "url":
            raise ValueError()
    else:
        pause, newurl = float(refresh), None
    return pause, newurl


class HTTPRefreshProcessor(BaseHandler):
    """Perform HTTP Refresh redirections.

    Note that if a non-200 HTTP code has occurred (for example, a 30x
    redirect), this processor will do nothing.

    By default, only zero-time Refresh headers are redirected.  Use the
    max_time attribute / constructor argument to allow Refresh with longer
    pauses.  Use the honor_time attribute / constructor argument to control
    whether the requested pause is honoured (with a time.sleep()) or
    skipped in favour of immediate redirection.

    Public attributes:

    max_time: see above
    honor_time: see above

    """
    handler_order = 1000

    def __init__(self, max_time=0, honor_time=True):
        self.max_time = max_time
        self.honor_time = honor_time
        self._sleep = time.sleep

    def __copy__(self):
        return self.__class__(self.max_time, self.honor_time)

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code == 200 and 'refresh' in hdrs:
            refresh = hdrs.getheaders("refresh")[0]
            try:
                pause, newurl = parse_refresh_header(refresh)
            except ValueError:
                debug("bad Refresh header: %r" % refresh)
                return response

            if newurl is None:
                newurl = response.geturl()
            if (self.max_time is None) or (pause <= self.max_time):
                if pause > 1E-3 and self.honor_time:
                    self._sleep(pause)
                hdrs["location"] = newurl
                # hardcoded http is NOT a bug
                response = self.parent.error(
                    "http", request, response,
                    "refresh", msg, hdrs)
            else:
                debug("Refresh header ignored: %r" % refresh)

        return response

    https_response = http_response
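
# Hedged usage sketch: follow Refresh headers with pauses of up to ten
# seconds, but skip the actual sleep.  mechanize.Browser exposes the same
# knobs through set_handle_refresh(); the URL is a placeholder:
#
#   import mechanize
#   handler = HTTPRefreshProcessor(max_time=10, honor_time=False)
#   opener = mechanize.build_opener(handler)
#   opener.open("http://example.com/page-with-refresh")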