%PDF- %PDF-
| Direktori : /lib/python3/dist-packages/mechanize/ |
| Current File : //lib/python3/dist-packages/mechanize/_equiv.py |
#!/usr/bin/env python
# vim:fileencoding=utf-8
# Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import re
import string
from ._entities import html5_entities
from .polyglot import codepoint_to_chr
# The five characters treated as whitespace by this module: tab, line feed,
# form feed, space and carriage return.
space_chars = frozenset("\t\n\u000C \r")
# The same whitespace characters as single-byte strings.
space_chars_bytes = frozenset(ch.encode("ascii") for ch in space_chars)
# Every ASCII letter (both cases) as a single-byte string.
ascii_letters_bytes = frozenset(
    letter.encode("ascii") for letter in string.ascii_letters)
# Bytes that terminate a tag name or an unquoted attribute value.
spaces_angle_brackets = space_chars_bytes.union([b">", b"<"])
# Bytes skipped before the start of an attribute name.
skip1 = space_chars_bytes.union([b"/"])
# Tag names that do not stop the scan when met as a start tag.
head_elems = frozenset([
    b"html", b"head", b"title", b"base", b"script",
    b"style", b"meta", b"link", b"object",
])
def my_unichr(num):
    """Return the character for code point *num*, or '?' if it is invalid.

    A ValueError or OverflowError raised by codepoint_to_chr() is swallowed
    and replaced by the '?' placeholder.
    """
    try:
        char = codepoint_to_chr(num)
    except (ValueError, OverflowError):
        char = '?'
    return char
def replace_entity(match):
    """Return the replacement text for one matched character reference.

    *match* is a regex match object whose group 1 holds the text between
    ``&`` and ``;``. The name is lower-cased before it is looked at.

    - ``apos`` and ``squot`` become a single quote.
    - ``hellips`` is treated as ``hellip``.
    - Numeric references (``#NNN`` or ``#xHH``) in the range 0..255 are
      decoded as a cp1252 byte; values outside that range, or bytes cp1252
      does not define, are passed to my_unichr().
    - Any other name is looked up in html5_entities.

    A reference that cannot be resolved is returned as ``&name;`` using the
    lower-cased name.
    """
    ent = match.group(1).lower()
    if ent in {'apos', 'squot'}:
        # squot is generated by some broken CMS software
        return "'"
    if ent == 'hellips':
        ent = 'hellip'
    if ent.startswith('#'):
        try:
            # ent is already lower-cased, 'X' is kept for readability only
            if ent[1] in ('x', 'X'):
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except (IndexError, ValueError):
            # a bare '#' (IndexError) or non-numeric digits (ValueError)
            return '&' + ent + ';'
        if not 0 <= num <= 255:
            # my_unichr() maps invalid code points (including negative
            # numbers) to '?' instead of raising
            return my_unichr(num)
        try:
            # Build the single byte in a way that works on both Python 2 and
            # Python 3; chr(num).decode(...) only exists on Python 2.
            return bytes(bytearray((num,))).decode('cp1252')
        except UnicodeDecodeError:
            return my_unichr(num)
    try:
        return html5_entities[ent]
    except KeyError:
        return '&' + ent + ';'
class Bytes(bytes):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raised"""

    def __init__(self, value):
        # The byte content is handled by bytes itself; only the cursor is
        # initialised here. -1 means iteration has not started yet.
        self._position = -1

    def __iter__(self):
        # The object is its own iterator: iterating moves the shared cursor.
        return self

    def __next__(self):
        """Advance the cursor by one and return the byte now under it.

        Raises StopIteration once the cursor reaches the end of the data and
        TypeError if the cursor is negative after the increment.
        """
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        # slice rather than index so the result is a length-1 byte string
        return self[p:p + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        """Move the cursor back by one and return the byte now under it.

        Raises StopIteration if the cursor is already at or past the end and
        TypeError if it is negative before the move.
        """
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p:p + 1]

    @property
    def position(self):
        """Current cursor index, or None while the cursor is negative.

        Raises StopIteration if the cursor is at or past the end of the data.
        """
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        return None

    @position.setter
    def position(self, position):
        # The check is made against the *current* cursor, not the new value.
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    @property
    def current_byte(self):
        """The length-1 byte string under the cursor."""
        return self[self.position:self.position + 1]

    def skip(self, chars=space_chars_bytes):
        """Skip past a list of characters

        Returns the first byte not in *chars* and leaves the cursor on it, or
        returns None with the cursor at the end of the data if every
        remaining byte is in *chars*."""
        p = self.position  # use property for the error-checking
        while p < len(self):
            c = self[p:p + 1]
            if c not in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def skip_until(self, chars):
        """Scan forward to the first byte that is in *chars*.

        Returns ``(skipped, stop_byte)`` where *skipped* is the data between
        the starting cursor and the stop byte; the cursor is left on the stop
        byte. If no byte in *chars* is found the cursor is moved to the end
        of the data and ``(b'', b'')`` is returned.
        """
        p = pos = self.position
        while p < len(self):
            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return self[pos:p], c
            p += 1
        self._position = p
        return b'', b''

    def match_bytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        p = self.position
        data = self[p:p + len(bytes)]
        rv = data.startswith(bytes)
        if rv:
            self.position += len(bytes)
        return rv

    def match_bytes_pat(self, pat):
        """Like match_bytes() but *pat* is a compiled regular expression.

        The pattern is matched at the current position. On success the
        position is advanced past the matched text and True is returned;
        otherwise False is returned and the position is left alone.
        """
        m = pat.match(self, self.position)
        if m is None:
            return False
        self.position += len(m.group())
        return True

    def jump_to(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match

        Returns True on success and raises StopIteration if the sequence does
        not occur at or after the current position."""
        new_pos = self.find(bytes, max(0, self.position))
        if new_pos > -1:
            # convert the absolute index returned by find() into an offset
            # from the current position
            new_pos -= self.position
            if self._position == -1:
                self._position = 0
            self._position += (new_pos + len(bytes) - 1)
            return True
        else:
            raise StopIteration
class HTTPEquivParser(object):
    """Mini parser for detecting http-equiv headers from meta tags

    Create it with the raw bytes of a document and call the instance to get
    a list of ``(name, value)`` pairs, both byte strings."""

    # Compiled once when the class is created rather than on every call.
    _META_PAT = re.compile(b"<meta", flags=re.IGNORECASE)
    _HEAD_END_PAT = re.compile(b"</head", flags=re.IGNORECASE)
    _ENTITY_PAT = re.compile(r'&(\S+?);')

    def __init__(self, data):
        """data - the bytes to work on """
        self.data = Bytes(data)
        self.headers = []

    def __call__(self):
        """Scan the data and return the http-equiv headers found.

        Scanning stops at ``</head``, when a handler returns a false value
        (for example on a start tag that is not in head_elems), or when the
        end of the data is reached. Name/value pairs that are not ASCII,
        before or after character references are replaced, are dropped.
        """
        mb, mbp = self.data.match_bytes, self.data.match_bytes_pat
        # Order matters: the first matcher that succeeds wins, so the more
        # specific prefixes come before the generic "<".
        dispatch = (
            (mb, b"<!--", self.handle_comment),
            (mbp, self._META_PAT, self.handle_meta),
            (mbp, self._HEAD_END_PAT, lambda: False),
            (mb, b"</", self.handle_possible_end_tag),
            (mb, b"<!", self.handle_other),
            (mb, b"<?", self.handle_other),
            (mb, b"<", self.handle_possible_start_tag)
        )
        # Iterating advances the cursor of self.data; the handlers move it
        # further, so the yielded byte itself is not needed.
        for _ in self.data:
            keep_parsing = True
            for matcher, key, method in dispatch:
                if matcher(key):
                    try:
                        keep_parsing = method()
                        break
                    except StopIteration:
                        # the handler ran off the end of the data
                        keep_parsing = False
                        break
            if not keep_parsing:
                break

        ans = []
        for name, val in self.headers:
            try:
                name, val = name.decode('ascii'), val.decode('ascii')
            except ValueError:
                continue
            name = self._ENTITY_PAT.sub(replace_entity, name)
            val = self._ENTITY_PAT.sub(replace_entity, val)
            try:
                name, val = name.encode('ascii'), val.encode('ascii')
            except ValueError:
                continue
            ans.append((name, val))
        return ans

    def handle_comment(self):
        """Skip over comments"""
        return self.data.jump_to(b"-->")

    def handle_meta(self):
        """Read the attributes of a meta tag and record an http-equiv header.

        A ``(http-equiv, content)`` pair is appended to self.headers as soon
        as non-empty values for both attributes have been seen, in either
        order. The http-equiv value is lower-cased. Always returns True.
        """
        if self.data.current_byte not in space_chars_bytes:
            # if we have <meta not followed by a space so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        pending_header = pending_content = None

        while True:
            # Try to find the next attribute after the current position
            attr = self.get_attribute()
            if attr is None:
                return True
            name, val = attr
            name = name.lower()
            if name == b"http-equiv":
                if val:
                    val = val.lower()
                    if pending_content:
                        self.headers.append((val, pending_content))
                        return True
                    pending_header = val
            elif name == b'content':
                if val:
                    if pending_header:
                        self.headers.append((pending_header, val))
                        return True
                    pending_content = val

    def handle_possible_start_tag(self):
        """Handle the text after ``<`` as a possible start tag."""
        return self.handle_possible_tag(False)

    def handle_possible_end_tag(self):
        """Handle the text after ``</`` as a possible end tag."""
        next(self.data)
        return self.handle_possible_tag(True)

    def handle_possible_tag(self, end_tag):
        """Consume a start or end tag and its attributes.

        Returns False, which stops the scan, for a start tag whose name is
        not in head_elems; returns True otherwise.
        """
        data = self.data
        if data.current_byte not in ascii_letters_bytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handle_other
            if end_tag:
                data.previous()
                self.handle_other()
            return True

        tag_name, c = data.skip_until(spaces_angle_brackets)
        tag_name = tag_name.lower()
        if not end_tag and tag_name not in head_elems:
            return False
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.get_attribute()
            while attr is not None:
                attr = self.get_attribute()
        return True

    def handle_other(self):
        """Skip to the next ``>``."""
        return self.data.jump_to(b">")

    def get_attribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(skip1)
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attr_name = []
        attr_value = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attr_name:
                break
            elif c in space_chars_bytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attr_name), b""
            elif c is None:
                return None
            else:
                attr_name.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            data.previous()
            return b"".join(attr_name), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quote_char = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quote_char:
                    next(data)
                    return b"".join(attr_name), b"".join(attr_value)
                # 10.4
                else:
                    attr_value.append(c)
        elif c == b">":
            return b"".join(attr_name), b""
        elif c is None:
            return None
        else:
            attr_value.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spaces_angle_brackets:
                return b"".join(attr_name), b"".join(attr_value)
            elif c is None:
                return None
            else:
                attr_value.append(c)