%PDF- %PDF-
Direktori : /data/old/usr/lib/python3.4/site-packages/urlwatch/ |
Current File : //data/old/usr/lib/python3.4/site-packages/urlwatch/html2txt.py |
# -*- coding: utf-8 -*-
#
# This file is part of urlwatch (https://thp.io/2008/urlwatch/).
# Copyright (c) 2008-2016 Thomas Perl <thp.io/about>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import re
import os
import subprocess
import logging

logger = logging.getLogger(__name__)


def html2text(data, method='lynx'):
    """Convert a string of HTML to plain text for easy difference checking.

    Args:
        data: The HTML document as a ``str``. For the external converters it
            is encoded as UTF-8 and fed to the program on stdin.
        method: Conversion backend, one of:
            'lynx' (default) - Use "lynx -dump" for conversion
            'html2text' - Use "html2text -nobs" for conversion
            're' - A simple regex-based HTML tag stripper

    Returns:
        The plain-text rendering. For 're' this is the input with tags
        removed, blank lines dropped and trailing whitespace stripped from
        each line. For the external converters it is the program's decoded
        standard output with surrounding whitespace stripped.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
        OSError: Propagated from ``subprocess.Popen`` when the external
            program cannot be started.
    """
    if method == 're':
        stripped_tags = re.sub(r'<[^>]*>', '', data)
        return '\n'.join(line.rstrip()
                         for line in stripped_tags.splitlines()
                         if line.strip())

    if method == 'lynx':
        cmd = ['lynx', '-dump', '-stdin', '-assume_charset=UTF-8']
        # For some reason it looks like lynx always(?) outputs Latin-1
        stdout_encoding = 'latin-1'
    elif method == 'html2text':
        cmd = ['html2text', '-nobs', '-utf8']
        stdout_encoding = 'utf-8'
    else:
        raise ValueError('Unknown html2text method: %r' % (method,))

    logger.debug('Command: %r, stdout encoding: %s', cmd, stdout_encoding)

    # Run the converter in a fixed UTF-8 locale, inheriting everything else
    # from the current environment.
    env = dict(os.environ)
    env['LANG'] = 'en_US.utf-8'
    env['LC_ALL'] = 'en_US.utf-8'

    # stderr is not piped, so communicate() returns None for it.
    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE, env=env)
    stdout, _ = proc.communicate(data.encode('utf-8'))
    if proc.returncode != 0:
        # Best-effort: keep whatever output was produced, but do not hide
        # the failure from the log.
        logger.warning('Command %r exited with status %d',
                       cmd, proc.returncode)
    stdout = stdout.decode(stdout_encoding)

    if method == 'lynx':
        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]
        # Use the following regular expression to remove the unnecessary
        # parts, so that [RANDOM STRING] (changing on each call) does not
        # expose itself as change on the website (it's a Lynx-related thing).
        # Thanks to Evert Meulie for pointing that out.
        #
        # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
        # "localhost" in the file:// URLs; see Debian bug 732112 -- hence
        # the optional group.
        #
        # TMPDIR is escaped so that regex metacharacters in the path are
        # matched literally.
        tmpdir = re.escape(os.environ.get('TMPDIR', '/tmp'))
        stdout = re.sub(r'file://(?:localhost)?%s/[^/]*/' % (tmpdir,),
                        '', stdout)
        # Also remove file names like L9816-5928TMP.html
        stdout = re.sub(r'L\d+-\d+TMP\.html', '', stdout)

    return stdout.strip()