%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /home/waritko/yacy/source/net/yacy/cora/util/
Upload File :
Create Path :
Current File : //home/waritko/yacy/source/net/yacy/cora/util/Html2Image.java

/**
 *  Html2Image
 *  Copyright 2014 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 *  First published 26.11.2014 on http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.cora.util;

import java.awt.Container;
import java.awt.Dimension;
import java.awt.Graphics;
import java.awt.Image;
import java.awt.MediaTracker;
import java.awt.image.BufferedImage;
import java.beans.PropertyChangeEvent;
import java.beans.PropertyChangeListener;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.TimeUnit;

import javax.imageio.ImageIO;
import javax.swing.JEditorPane;
import javax.swing.text.Document;
import javax.swing.text.Element;
import javax.swing.text.View;
import javax.swing.text.ViewFactory;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.ImageView;

import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.document.ImageParser;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.OS;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

/**
 * Convert an HTML page to a copy on disk in another file format
 * (currently PDF and/or a JPG/PNG image).
 */
public class Html2Image {
    
    // Known installation locations of the external tools, probed with File.exists() before falling back to the system Path.
	/**
	 * Path to wkhtmltopdf executable on Mac OS when installed using
	 * wkhtmltox-n.n.n.macos-cocoa.pkg from https://wkhtmltopdf.org/downloads.html.
	 * This can also be a path on Debian or another Gnu/Linux distribution.
	 */
	private final static File wkhtmltopdfMac = new File("/usr/local/bin/wkhtmltopdf");

    // To install imagemagick on Mac, download from http://cactuslab.com/imagemagick/assets/ImageMagick-6.8.9-9.pkg.zip
    // The convert command from imagemagick needs ghostscript; if not present on older Macs, download a version of gs from http://pages.uoregon.edu/koch/

    /** First location checked for the imagemagick convert executable on non-Windows systems. */
    private final static File convertMac1 = new File("/opt/local/bin/convert");
    /** Second location checked for the imagemagick convert executable on non-Windows systems. */
    private final static File convertMac2 = new File("/opt/ImageMagick/bin/convert");

    /* Debian packages to install: apt-get install wkhtmltopdf imagemagick xvfb ghostscript
     The imagemagick policy at /etc should also be checked:
     if it contains a line such as <policy domain="coder" rights="none" pattern="PDF" /> it must be edited with rights="read" at minimum
    */
    /** Path to the wkhtmltopdf executable when installed from the Debian package. */
    private final static File wkhtmltopdfDebian = new File("/usr/bin/wkhtmltopdf"); // there is no wkhtmltoimage, use convert to create images
    /** Path to the imagemagick convert executable when installed from the Debian package; checked after the two paths above. */
    private final static File convertDebian = new File("/usr/bin/convert");

	/**
	 * Path to wkhtmltopdf executable on Windows, when installed with default
	 * settings using wkhtmltox-n.n.n.msvc2015-win64.exe from
	 * https://wkhtmltopdf.org/downloads.html
	 */
	private static final File WKHTMLTOPDF_WINDOWS = new File("C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe");

	/**
	 * Path to wkhtmltopdf executable on Windows, when installed with default
	 * settings using wkhtmltox-n.n.n.msvc2015-win32.exe from
	 * https://wkhtmltopdf.org/downloads.html
	 */
	private static final File WKHTMLTOPDF_WINDOWS_X86 = new File(
			"C:\\Program Files (x86)\\wkhtmltopdf\\bin\\wkhtmltopdf.exe");

	/** Command to use when wkhtmltopdf is included in the system Path */
	private static final String WKHTMLTOPDF_COMMAND = "wkhtmltopdf";

	/** Command to use when imagemagick convert is included in the system Path */
	private static final String CONVERT_COMMAND = "convert";

    /**
     * When true, the direct wkhtmltopdf call is skipped and the command is
     * immediately wrapped with "xvfb-run -a". When false (the initial value) the
     * xvfb-run wrapped command is only used after a direct call failed.
     * NOTE(review): no assignment other than this initializer is visible in this class.
     */
    private static boolean usexvfb = false;

    /**
     * Tell whether the wkhtmltopdf tool can be used on this system.
     * 
     * @return true when a wkhtmltopdf executable exists at one of the known
     *         installation paths, or else when the command can be run from the
     *         system Path
     */
    public static boolean wkhtmltopdfAvailable() {
        if (wkhtmltopdfExecutable() != null) {
            /* Found at a common installation path : no need to probe the system Path */
            return true;
        }
        return wkhtmltopdfAvailableInPath();
    }
    
    /**
     * Look for wkhtmltopdf at its usual installation paths: the two Windows
     * locations when running on Windows, otherwise the Mac location then the
     * Debian one.
     * 
     * @return the first candidate file that exists, or null when none does
     */
    private static File wkhtmltopdfExecutable() {
        final File[] candidates = OS.isWindows
                ? new File[] { WKHTMLTOPDF_WINDOWS, WKHTMLTOPDF_WINDOWS_X86 }
                : new File[] { wkhtmltopdfMac, wkhtmltopdfDebian };
        for (final File candidate : candidates) {
            if (candidate.exists()) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Probe the system Path by running "wkhtmltopdf -V" and waiting at most 2
     * seconds for it to terminate.
     * <p>
     * The probe process is always cleaned up: when it did not terminate within
     * the allowed time, or when the waiting thread was interrupted, it is
     * forcibly destroyed instead of being left running in the background.
     * 
     * @return true when wkhtmltopdf is available in system path, that is when
     *         the probe terminated in time with exit value 0
     */
    private static boolean wkhtmltopdfAvailableInPath() {
        boolean available = false;
        Process p = null;
        try {
            /* Explicit argument array : same tokens as the former "wkhtmltopdf -V" command string */
            p = Runtime.getRuntime().exec(new String[] { WKHTMLTOPDF_COMMAND, "-V" });
            available = p.waitFor(2, TimeUnit.SECONDS) && p.exitValue() == 0;
        } catch (final IOException e) {
            ConcurrentLog.fine("Html2Image", "wkhtmltopdf is not included in system path.");
        } catch (final InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve thread interrupted state
        } finally {
            if (p != null && p.isAlive()) {
                /* Timeout or interruption : do not leak the probe process */
                p.destroyForcibly();
            }
        }
        return available;
    }
	
    /**
     * Look for the imagemagick convert tool at its usual installation paths.
     * Nothing is searched on Windows.
     * 
     * @return the first existing file among the two Mac locations and the Debian
     *         one (checked in that order), or null when none exists or when
     *         running on Windows
     */
    private static File convertExecutable() {
        if (OS.isWindows) {
            return null;
        }
        for (final File candidate : new File[] { convertMac1, convertMac2, convertDebian }) {
            if (candidate.exists()) {
                return candidate;
            }
        }
        return null;
    }
    
    /**
     * Tell whether the imagemagick convert tool can be used on this system.
     * 
     * @return true when a convert executable exists at one of the known
     *         installation paths, or else when the command can be run from the
     *         system Path
     */
    public static boolean convertAvailable() {
        if (convertExecutable() != null) {
            /* Found at a common installation path : no need to probe the system Path */
            return true;
        }
        return convertAvailableInPath();
    }

    /**
     * Probe the system Path by running "convert -version" and waiting at most 2
     * seconds for it to terminate. Never probes on MS Windows.
     * <p>
     * The probe process is always cleaned up: when it did not terminate within
     * the allowed time, or when the waiting thread was interrupted, it is
     * forcibly destroyed instead of being left running in the background.
     * 
     * @return true when imagemagick convert is available in system path, that
     *         is when the probe terminated in time with exit value 0
     */
    private static boolean convertAvailableInPath() {
        if (OS.isWindows) {
            // on MS Windows convert is a system tool to convert volumes from FAT to NTFS
            return false;
        }
        boolean available = false;
        Process p = null;
        try {
            /* Explicit argument array : same tokens as the former "convert -version" command string */
            p = Runtime.getRuntime().exec(new String[] { CONVERT_COMMAND, "-version" });
            available = p.waitFor(2, TimeUnit.SECONDS) && p.exitValue() == 0;
        } catch (final IOException e) {
            ConcurrentLog.fine("Html2Image", "convert is not included in system path.");
        } catch (final InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve thread interrupted state
        } finally {
            if (p != null && p.isAlive()) {
                /* Timeout or interruption : do not leak the probe process */
                p.destroyForcibly();
            }
        }
        return available;
    }
    
	/**
	 * Fetch a web resource and render it to a PDF file with the external
	 * wkhtmltopdf tool. Up to four attempts are made, stopping at the first
	 * success: first without ignoring load errors, then ignoring them; in each
	 * mode the given proxy is tried first and, when a proxy was given and that
	 * attempt failed, a second attempt is made without proxy.
	 * 
	 * @param url            the URL of a web resource to fetch, render and convert
	 *                       to a pdf file. Must not be null.
	 * @param proxy          the eventual proxy address to use. Can be null. Must
	 *                       be of the form http://host:port; use YaCy here as
	 *                       proxy which is mostly http://localhost:8090
	 * @param userAgent      passed through to each rendering attempt
	 * @param acceptLanguage passed through to each rendering attempt
	 * @param destination    the destination PDF file that should be written. Must
	 *                       not be null.
	 * @param maxSeconds     the maximum time in seconds to wait for each
	 *                       wkhtmltopdf call termination. Beyond this limit the
	 *                       process is killed.
	 * @return true when the destination file was successfully written
	 */
	public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, final File destination, final long maxSeconds) {
		final boolean[] errorModes = {false, true};
		boolean written = false;
		for (int i = 0; i < errorModes.length && !written; i++) {
			final boolean ignoreErrors = errorModes[i];
			written = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds);
			if (!written && proxy != null) {
				/* The proxy may be the cause of the failure : retry with a direct connection */
				ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url);
				written = writeWkhtmltopdfInternal(url, null, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds);
			}
		}
		if (!written) {
			ConcurrentLog.warn("Html2Image", "could not generate snapshot for " + url);
			return false;
		}
		ConcurrentLog.info("Html2Image", "wrote " + destination.toString() + " for " + url);
		return true;
	}
    
	/**
	 * Run wkhtmltopdf in a separate process to fetch and render to PDF a web
	 * resource.
	 * <p>
	 * The command is assembled as a list of arguments and not as a single
	 * string: a command string given to Runtime.exec(String) is only split on
	 * whitespace, without any quote handling, so quoted values would reach
	 * wkhtmltopdf with their quote characters, and any executable path,
	 * destination path or URL containing a space would be split into several
	 * arguments.
	 * <p>
	 * Unless xvfb usage is forced, wkhtmltopdf is first run directly; when that
	 * does not produce the file, the same command is run again wrapped with
	 * "xvfb-run -a" to provide a virtual screen on headless servers.
	 * 
	 * @param url            the URL of a web resource to fetch, render and convert
	 *                       to a pdf file. Must not be null.
	 * @param proxy          the eventual proxy address to use. Can be null.
	 * @param destination    the destination PDF file that should be written. Must
	 *                       not be null.
	 * @param userAgent      currently unused. TODO: implement
	 * @param acceptLanguage currently unused. TODO: implement
	 * @param ignoreErrors   when true wkhtmltopdf is instructed to ignore load
	 *                       errors
	 * @param maxSeconds     the maximum time in seconds to wait for the
	 *                       wkhtmltopdf dedicated process termination. Beyond this
	 *                       limit the process is killed.
	 * @return true when the destination file was successfully written
	 */
	private static boolean writeWkhtmltopdfInternal(final String url, final String proxy, final File destination,
			final String userAgent, final String acceptLanguage, final boolean ignoreErrors, final long maxSeconds) {
		final String wkhtmltopdfCmd;
		final File wkhtmltopdf = wkhtmltopdfExecutable();
		if (wkhtmltopdf != null) {
			wkhtmltopdfCmd = wkhtmltopdf.getAbsolutePath();
		} else if (wkhtmltopdfAvailableInPath()) {
			wkhtmltopdfCmd = WKHTMLTOPDF_COMMAND;
		} else {
			ConcurrentLog.warn("Html2Pdf", "Unable to locate wkhtmltopdf executable on this system!");
			return false;
		}

		/* One list element per process argument : no quoting is needed, spaces are preserved */
		final List<String> command = new java.util.ArrayList<>();
		command.add(wkhtmltopdfCmd);
		command.add("-q");
		command.add("--title");
		command.add(url);
		/* TODO : forward acceptLanguage and userAgent with --custom-header (and --custom-header-propagation) */
		if (proxy != null) {
			command.add("--proxy");
			command.add(proxy);
		}
		if (ignoreErrors) {
			// some versions do not have that flag and fail if attempting to use it...
			if (OS.isMacArchitecture) {
				command.add("--load-error-handling");
				command.add("ignore");
			} else {
				command.add("--ignore-load-errors");
			}
		}
		command.add("--footer-left");
		command.add("[webpage]");
		command.add("--footer-right");
		command.add("[date]/[time]([page]/[topage])");
		command.add("--footer-font-size");
		command.add("7");
		command.add(url);
		command.add(destination.getAbsolutePath());

		try {
			ConcurrentLog.info("Html2Pdf", "creating pdf from url " + url + " with command: " + String.join(" ", command));
			if (!usexvfb && execWkhtmlToPdf(proxy, destination, command, maxSeconds)) {
				return true;
			}
			// if this fails, we should try to wrap the X server with a virtual screen using xvfb, this works on headless servers
			command.add(0, "xvfb-run");
			command.add(1, "-a");
			return execWkhtmlToPdf(proxy, destination, command, maxSeconds);
		} catch (final IOException e) {
			ConcurrentLog.warn("Html2Pdf", "exception while creation of pdf with command: " + String.join(" ", command), e);
			return false;
		}
	}

	/**
	 * Run a wkhtmltopdf command in a separate process and wait for its
	 * termination.
	 * 
	 * @param proxy       the eventual proxy address to use. Can be null. Only used
	 *                    for logging here.
	 * @param destination the destination PDF file that should be written. Must not
	 *                    be null.
	 * @param command     the wkhtmltopdf command to execute, one element per
	 *                    argument, starting with the program. Must not be null.
	 * @param maxSeconds  the maximum time in seconds to wait for the process
	 *                    termination. Beyond this limit the process is killed.
	 * @return true when the process exited with value 0 and the destination file
	 *         exists
	 * @throws IOException when the process could not be started or its output
	 *                     could not be read
	 */
	private static boolean execWkhtmlToPdf(final String proxy, final File destination, final List<String> command, final long maxSeconds)
			throws IOException {
		final String commandline = String.join(" ", command); // only used in log messages
		final Process p = new ProcessBuilder(command).start();

		try {
			p.waitFor(maxSeconds, TimeUnit.SECONDS);
		} catch (final InterruptedException e) {
			p.destroyForcibly();
			ConcurrentLog.warn("Html2Pdf", "Interrupted creation of pdf. Killing the process started with command : " + commandline);
			Thread.currentThread().interrupt(); // Keep the thread interrupted state
			return false;
		}
		if (p.isAlive()) {
			ConcurrentLog.warn("Html2Pdf", "Creation of pdf did not terminate within " + maxSeconds + " seconds. Killing the process started with command : " + commandline);
			p.destroyForcibly();
			return false;
		}
		if (p.exitValue() == 0 && destination.exists()) {
			return true;
		}
		/* Failure : report what the process wrote on its output streams */
		final List<String> messages = OS.readStreams(p);
		ConcurrentLog.warn("Html2Image", "failed to create pdf " + (proxy == null ? "" : "using proxy " + proxy) + " with command : " + commandline);
		for (final String message : messages) {
			ConcurrentLog.warn("Html2Image", ">> " + message);
		}
		return false;
	}
    
    /**
     * Convert a pdf (first page) to an image. Proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75.
     * <p>
     * When the imagemagick convert tool can be located it is used; otherwise the
     * page is rendered with the internal pdf library (in that case only the
     * density is applied: width, height and quality are not used). On Mac, when
     * the convert tool did not produce the image, a last attempt is made with
     * the "Image Events" application driven through osascript.
     * 
     * @param pdf input pdf file. Must not be null.
     * @param image output image file. Must not be null, and should end with ".jpg" or ".png".
     *        When the name has no extension, JPEG is used.
     * @param width output width in pixels
     * @param height output height in pixels
     * @param density (dpi)
     * @param quality JPEG/PNG compression level
     * @return true when the output image file was successfully written.
     */
    public static boolean pdf2image(final File pdf, final File image, final int width, final int height, final int density, final int quality) {
        /* Deduce the output image format from the file extension */
        String imageFormat = MultiProtocolURL.getFileExtension(image.getName());
        if (imageFormat.isEmpty()) {
            /* Use JPEG as a default fallback */
            imageFormat = "jpg";
        }
        String convertCmd = null;
        final File convert = convertExecutable();
        if (convert != null) {
            convertCmd = convert.getAbsolutePath();
        } else if (convertAvailableInPath()) {
            convertCmd = CONVERT_COMMAND;
        } else {
            ConcurrentLog.info("Html2Image", "Unable to locate convert executable on this system!");
        }

        // convert pdf to jpg using internal pdfbox capability
        if (convertCmd == null) {
            try (final PDDocument pdoc = PDDocument.load(pdf)) {
                final BufferedImage bi = new PDFRenderer(pdoc).renderImageWithDPI(0, density, ImageType.RGB);
                return ImageIO.write(bi, imageFormat, image);
            } catch (final IOException ex) {
                ConcurrentLog.warn("Html2Image", "Failed to create image with pdfbox"
                        + (ex.getMessage() != null ? " : " + ex.getMessage() : ""));
                return false;
            }
        }

        // convert using external command line utility
        try {
            // i.e. convert -density 300 -trim yacy.pdf[0] -trim -resize 1024x -crop x1024+0+0 -quality 75% yacy-convert-300.jpg
            // note: both -trim are necessary, otherwise it is trimmed only on one side. The [0] selects the first page of the pdf
            // The command is given as an argument array so that file paths containing spaces are not split
            final String[] command = {
                    convertCmd, "-alpha", "remove",
                    "-density", String.valueOf(density),
                    "-trim", pdf.getAbsolutePath() + "[0]",
                    "-trim", "-resize", width + "x",
                    "-crop", "x" + height + "+0+0",
                    "-quality", quality + "%",
                    image.getAbsolutePath()};
            List<String> message = OS.execSynchronous(command);
            if (image.exists()) return true;
            ConcurrentLog.warn("Html2Image", "failed to create image with command: " + String.join(" ", command));
            for (final String m : message) ConcurrentLog.warn("Html2Image", ">> " + m);

            // another try for mac: use Image Events using AppleScript in osacript commands...
            if (!OS.isMacArchitecture) return false;

            // the osascript command overwrites a pdf with a png, so we must work on a copy
            final File pngFile = new File(pdf.getAbsolutePath() + ".tmp.pdf");
            final File newPngFile = new File(pngFile.getAbsolutePath() + ".png");
            try {
                org.apache.commons.io.FileUtils.copyFile(pdf, pngFile);
                final String[] commandx = {"osascript",
                        "-e", "set ImgFile to \"" + pngFile.getAbsolutePath() + "\"",
                        "-e", "tell application \"Image Events\"",
                        "-e", "set Img to open file ImgFile",
                        "-e", "save Img as PNG",
                        "-e", "end tell"};
                message = OS.execSynchronous(commandx);
                for (final String m : message) ConcurrentLog.warn("Html2Image", ">> " + m);

                // now we must read and convert this file to the target format with the target size
                if (!pngFile.renameTo(newPngFile)) {
                    ConcurrentLog.warn("Html2Image", "failed to rename " + pngFile + " to " + newPngFile);
                    return false;
                }
                // NOTE(review): the pre-rename path is given as the image name, as before; confirm whether ImageParser relies on it
                final Image img = ImageParser.parse(pngFile.getAbsolutePath(), FileUtils.read(newPngFile));
                if (img == null) {
                    /* Should not happen. If so, ImageParser.parse() should already have logged about the error */
                    return false;
                }
                final Image scaled = img.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
                final MediaTracker mediaTracker = new MediaTracker(new Container());
                mediaTracker.addImage(scaled, 0);
                try {
                    mediaTracker.waitForID(0);
                } catch (final InterruptedException e) {
                    /* The scaled image may be incomplete : give up and keep the thread interrupted state */
                    Thread.currentThread().interrupt();
                    return false;
                }
                // finally write the image
                final BufferedImage bi = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
                final Graphics graphics = bi.createGraphics();
                try {
                    graphics.drawImage(scaled, 0, 0, width, height, null);
                } finally {
                    graphics.dispose();
                }
                return ImageIO.write(bi, imageFormat, image) && image.exists();
            } finally {
                /* Never leave the temporary copies behind, whatever the outcome */
                pngFile.delete();
                newPngFile.delete();
            }
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
            return false;
        }
    }
    
    /**
     * Render a html page with a JEditorPane, which can do html up to html v 3.2. No CSS supported!
     * The page and its images are loaded synchronously, then the pane is painted
     * at its preferred size into an image written to the destination file.
     * <p>
     * The output format is "jpg" when the destination name ends with "jpg", and
     * "png" otherwise. A JPEG destination is painted on an image without alpha
     * channel, as the JPEG writer of current JDKs does not encode images having
     * one.
     * 
     * @param url the URL of the page to render
     * @param size the initial size given to the rendering pane; the written image
     *        uses the preferred size of the pane once the page is loaded
     * @param destination the image file to write
     * @throws IOException when the page could not be loaded, or when the image
     *         could not be written
     */
    public static void writeSwingImage(String url, Dimension size, File destination) throws IOException {

        // set up a pane for rendering
        final JEditorPane htmlPane = new JEditorPane();
        htmlPane.setSize(size);
        htmlPane.setEditable(false);
        final HTMLEditorKit kit = new HTMLEditorKit() {

            private static final long serialVersionUID = 1L;

            @Override
            public Document createDefaultDocument() {
                // a negative priority makes the document load synchronously
                HTMLDocument doc = (HTMLDocument) super.createDefaultDocument();
                doc.setAsynchronousLoadPriority(-1);
                return doc;
            }

            @Override
            public ViewFactory getViewFactory() {
                return new HTMLFactory() {
                    @Override
                    public View create(Element elem) {
                        // images must be fully loaded before the pane is painted
                        View view = super.create(elem);
                        if (view instanceof ImageView) {
                            ((ImageView) view).setLoadsSynchronously(true);
                        }
                        return view;
                    }
                };
            }
        };
        htmlPane.setEditorKitForContentType("text/html", kit);
        htmlPane.setContentType("text/html");

        // load the page : a failure is reported to the caller instead of writing a blank image
        htmlPane.setPage(url);

        // render the page
        final String format = destination.getName().endsWith("jpg") ? "jpg" : "png";
        final int imageType = "jpg".equals(format) ? BufferedImage.TYPE_INT_RGB : BufferedImage.TYPE_INT_ARGB;
        final Dimension prefSize = htmlPane.getPreferredSize();
        final BufferedImage img = new BufferedImage(prefSize.width, prefSize.height, imageType);
        htmlPane.setSize(prefSize);
        final Graphics graphics = img.getGraphics();
        try {
            htmlPane.paint(graphics);
        } finally {
            graphics.dispose();
        }
        if (!ImageIO.write(img, format, destination)) {
            throw new IOException("No appropriate image writer found to write " + destination + " as " + format);
        }
    }
    
    /**
     * Test PDF or image snapshot generation for a given URL. Running threads are
     * shut down before returning, and the JVM exits with status 1 when the
     * snapshot could not be generated.
     * @param args main arguments list:
     * <ol>
     * 	<li>Source remote URL (required)</li>
     * 	<li>Target local file path (required)</li>
     * 	<li>Snapshot generation method identifier (optional) :
     * 		<ul>
     * 			<li>"wkhtmltopdf" (default): generate a PDF snapshot using external wkhtmltopdf tool.</li>
     * 			<li>"swing" : use JRE provided Swing to generate a jpg or png image snapshot.</li>
     * 		</ul>
     * 	</li>
     * </ol>
     */
    public static void main(String[] args) {
        int exitStatus = 0;
        try {
            exitStatus = generateSnapshot(args);
        } finally {
            /* Shutdown running threads */
            Domains.close();
            try {
                HTTPClient.closeConnectionManager();
            } catch (final InterruptedException e) {
                Thread.currentThread().interrupt(); // restore interrupted state
            }
            ConcurrentLog.shutdown();
            if (exitStatus != 0) {
                System.exit(exitStatus);
            }
        }
    }

    /**
     * Check the command line arguments and generate the requested snapshot.
     * 
     * @param args the main arguments list: source URL, target file path and
     *             optional generation method
     * @return the process exit status: 0 on success, 1 on any failure
     */
    private static int generateSnapshot(final String[] args) {
        final String usageMessage = "Usage : java " + Html2Image.class.getName()
                + " <url> <target-file[.pdf|.jpg|.png]> [wkhtmltopdf|swing]";
        if (args.length < 2) {
            System.out.println("Missing required parameter(s).");
            System.out.println(usageMessage);
            return 1;
        }
        final String targetPath = args[1];
        final boolean imageTarget = targetPath.endsWith(".jpg") || targetPath.endsWith(".png");
        final String method = args.length < 3 ? "wkhtmltopdf" : args[2];

        if ("wkhtmltopdf".equals(method)) {
            if (!Html2Image.wkhtmltopdfAvailable()) {
                System.out.println("Unable to locate wkhtmltopdf executable on this system!");
                return 1;
            }
            final File targetPdfFile;
            if (imageTarget) {
                /* An image is requested : the intermediate pdf is written next to it */
                targetPdfFile = new File(targetPath.substring(0, targetPath.length() - 4) + ".pdf");
            } else if (targetPath.endsWith(".pdf")) {
                targetPdfFile = new File(targetPath);
            } else {
                System.out.println("Unsupported output format");
                System.out.println(usageMessage);
                return 1;
            }
            if (!Html2Image.writeWkhtmltopdf(args[0], null, ClientIdentification.yacyInternetCrawlerAgent.userAgent,
                    "en-us,en;q=0.5", targetPdfFile, 30)) {
                return 1;
            }
            if (imageTarget) {
                if (!Html2Image.pdf2image(targetPdfFile, new File(targetPath), 1024, 1024, 300, 75)) {
                    return 1;
                }
                ConcurrentLog.info("Html2Image", "wrote " + targetPath + " converted from " + targetPdfFile);
            }
            return 0;
        }

        if ("swing".equals(method)) {
            if (targetPath.endsWith(".pdf")) {
                System.out.println("Pdf output format is not supported with swing method.");
                return 1;
            }
            if (!imageTarget) {
                System.out.println("Unsupported output format");
                System.out.println(usageMessage);
                return 1;
            }
            try {
                Html2Image.writeSwingImage(args[0], new Dimension(1200, 2000), new File(targetPath));
            } catch (final IOException e) {
                e.printStackTrace();
                return 1;
            }
            return 0;
        }

        System.out.println("Unknown method : please specify either wkhtmltopdf or swing.");
        return 1;
    }
    
}

Zerion Mini Shell 1.0