首页

关于使用pdfbox的TextToPDF文本转换pdf、pdf转换图片PDFToImage、pdf转换html的PDFText2HTML代码示例

标签:pdfbox,Text文件转PDF,PDF转Image图片、pdf转html,代码示例     发布时间:2018-04-14   

一、前言

下面基于pdfbox的PDF文档类org.apache.pdfbox.pdmodel.PDDocument,分别给出org.apache.pdfbox.tools.TextToPDF、org.apache.pdfbox.tools.PDFToImage、org.apache.pdfbox.tools.PDFText2HTML三个工具类的代码示例,依次实现文本文件转PDF文件、PDF转图像、PDF文本转HTML文件。

二、代码示例

1.TextToPDF文本转pdf文件示例

package org.apache.pdfbox.tools;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.font.PDType1Font;

/**
 * This will take a text file and output a pdf with that text.
 *
 * Input lines are split on single spaces and re-wrapped so that each drawn line fits between
 * the left and right margins; a form feed character in the input starts a new page.
 *
 * @author Ben Litchfield
 */
public class TextToPDF
{
    /**
     * The scaling factor for font units to PDF units
     */
    private static final int FONTSCALE = 1000;

    /**
     * The default font
     */
    private static final PDType1Font DEFAULT_FONT = PDType1Font.HELVETICA;

    /**
     * The default font size
     */
    private static final int DEFAULT_FONT_SIZE = 10;

    /**
     * The line height as a factor of the font size
     */
    private static final float LINE_HEIGHT_FACTOR = 1.05f;

    private int fontSize = DEFAULT_FONT_SIZE;
    private PDRectangle mediaBox = PDRectangle.LETTER;
    private boolean landscape = false;
    private PDFont font = DEFAULT_FONT;

    /**
     * The standard 14 fonts, keyed by their base font name (as accepted by -standardFont).
     */
    private static final Map<String, PDType1Font> STANDARD_14 = new HashMap<String, PDType1Font>();
    static
    {
        STANDARD_14.put(PDType1Font.TIMES_ROMAN.getBaseFont(), PDType1Font.TIMES_ROMAN);
        STANDARD_14.put(PDType1Font.TIMES_BOLD.getBaseFont(), PDType1Font.TIMES_BOLD);
        STANDARD_14.put(PDType1Font.TIMES_ITALIC.getBaseFont(), PDType1Font.TIMES_ITALIC);
        STANDARD_14.put(PDType1Font.TIMES_BOLD_ITALIC.getBaseFont(), PDType1Font.TIMES_BOLD_ITALIC);
        STANDARD_14.put(PDType1Font.HELVETICA.getBaseFont(), PDType1Font.HELVETICA);
        STANDARD_14.put(PDType1Font.HELVETICA_BOLD.getBaseFont(), PDType1Font.HELVETICA_BOLD);
        STANDARD_14.put(PDType1Font.HELVETICA_OBLIQUE.getBaseFont(), PDType1Font.HELVETICA_OBLIQUE);
        STANDARD_14.put(PDType1Font.HELVETICA_BOLD_OBLIQUE.getBaseFont(), PDType1Font.HELVETICA_BOLD_OBLIQUE);
        STANDARD_14.put(PDType1Font.COURIER.getBaseFont(), PDType1Font.COURIER);
        STANDARD_14.put(PDType1Font.COURIER_BOLD.getBaseFont(), PDType1Font.COURIER_BOLD);
        STANDARD_14.put(PDType1Font.COURIER_OBLIQUE.getBaseFont(), PDType1Font.COURIER_OBLIQUE);
        STANDARD_14.put(PDType1Font.COURIER_BOLD_OBLIQUE.getBaseFont(), PDType1Font.COURIER_BOLD_OBLIQUE);
        STANDARD_14.put(PDType1Font.SYMBOL.getBaseFont(), PDType1Font.SYMBOL);
        STANDARD_14.put(PDType1Font.ZAPF_DINGBATS.getBaseFont(), PDType1Font.ZAPF_DINGBATS);
    }

    /**
     * Create a PDF document with some text.
     *
     * @param text The stream of text data.
     *
     * @return The document with the text in it.
     *
     * @throws IOException If there is an error writing the data.
     */
    public PDDocument createPDFFromText( Reader text ) throws IOException
    {
        PDDocument doc = new PDDocument();
        createPDFFromText(doc, text);
        return doc;
    }

    /**
     * Create a PDF document with some text.
     *
     * If an IOException occurs the document is closed before the exception is rethrown.
     * The reader is not closed by this method.
     *
     * @param doc The document.
     * @param text The stream of text data.
     *
     * @throws IOException If there is an error writing the data.
     */
    public void createPDFFromText( PDDocument doc, Reader text ) throws IOException
    {
        try
        {

            final int margin = 40;
            float height = font.getBoundingBox().getHeight() / FONTSCALE;
            PDRectangle actualMediaBox = mediaBox;
            if (landscape)
            {
                // landscape simply swaps width and height of the configured page size
                actualMediaBox = new PDRectangle(mediaBox.getHeight(), mediaBox.getWidth());
            }

            //calculate font height and increase by a factor.
            height = height*fontSize*LINE_HEIGHT_FACTOR;
            BufferedReader data = new BufferedReader( text );
            String nextLine;
            PDPage page = new PDPage(actualMediaBox);
            PDPageContentStream contentStream = null;
            // y starts below the margin so that the first drawn line opens the first page
            float y = -1;
            float maxStringLength = page.getMediaBox().getWidth() - 2*margin;

            // There is a special case of creating a PDF document from an empty string.
            boolean textIsEmpty = true;

            while( (nextLine = data.readLine()) != null )
            {

                // The input text is nonEmpty. New pages will be created and added
                // to the PDF document as they are needed, depending on the length of
                // the text.
                textIsEmpty = false;

                String[] lineWords = nextLine.replaceAll("[\\n\\r]+$", "").split(" ");
                int lineIndex = 0;
                while( lineIndex < lineWords.length )
                {
                    StringBuilder nextLineToDraw = new StringBuilder();
                    float lengthIfUsingNextWord = 0;
                    boolean ff = false;
                    do
                    {
                        String word1, word2 = "";
                        int indexFF = lineWords[lineIndex].indexOf('\f');
                        if (indexFF == -1)
                        {
                            word1 = lineWords[lineIndex];
                        }
                        else
                        {
                            ff = true;
                            word1 = lineWords[lineIndex].substring(0, indexFF);
                            // indexOf always returns an index inside the word, so the
                            // remainder after the form feed can be taken unconditionally
                            word2 = lineWords[lineIndex].substring(indexFF + 1);
                        }
                        // word1 is the part before ff, word2 after
                        // both can be empty
                        // word1 can also be empty without ff, if a line has many spaces
                        if (word1.length() > 0 || !ff)
                        {
                            nextLineToDraw.append(word1);
                            nextLineToDraw.append(" ");
                        }
                        if (!ff || word2.length() == 0)
                        {
                            lineIndex++;
                        }
                        else
                        {
                            // keep the text after the form feed for the next page
                            lineWords[lineIndex] = word2;
                        }
                        if (ff)
                        {
                            break;
                        }
                        if( lineIndex < lineWords.length )
                        {
                            // need cut off at \f in next word to avoid IllegalArgumentException
                            String nextWord = lineWords[lineIndex];
                            indexFF = nextWord.indexOf('\f');
                            if (indexFF != -1)
                            {
                                nextWord = nextWord.substring(0, indexFF);
                            }

                            String lineWithNextWord = nextLineToDraw.toString() + " " + nextWord;
                            lengthIfUsingNextWord =
                                (font.getStringWidth( lineWithNextWord )/FONTSCALE) * fontSize;
                        }
                    }
                    while (lineIndex < lineWords.length && lengthIfUsingNextWord < maxStringLength);

                    if( y < margin )
                    {
                        // We have crossed the end-of-page boundary and need to extend the
                        // document by another page.
                        page = new PDPage(actualMediaBox);
                        doc.addPage( page );
                        if( contentStream != null )
                        {
                            contentStream.endText();
                            contentStream.close();
                        }
                        contentStream = new PDPageContentStream(doc, page);
                        contentStream.setFont( font, fontSize );
                        contentStream.beginText();
                        y = page.getMediaBox().getHeight() - margin + height;
                        contentStream.newLineAtOffset(margin, y);
                    }

                    if( contentStream == null )
                    {
                        throw new IOException( "Error:Expected non-null content stream." );
                    }
                    contentStream.newLineAtOffset(0, -height);
                    y -= height;
                    contentStream.showText(nextLineToDraw.toString());
                    if (ff)
                    {
                        // a form feed forces a page break after the line just drawn
                        page = new PDPage(actualMediaBox);
                        doc.addPage(page);
                        contentStream.endText();
                        contentStream.close();
                        contentStream = new PDPageContentStream(doc, page);
                        contentStream.setFont(font, fontSize);
                        contentStream.beginText();
                        y = page.getMediaBox().getHeight() - margin + height;
                        contentStream.newLineAtOffset(margin, y);
                    }
                }
            }

            // If the input text was the empty string, then the above while loop will have short-circuited
            // and we will not have added any PDPages to the document.
            // So in order to make the resultant PDF document readable by Adobe Reader etc, we'll add an empty page.
            if (textIsEmpty)
            {
                doc.addPage(page);
            }

            if( contentStream != null )
            {
                contentStream.endText();
                contentStream.close();
            }
        }
        catch( IOException io )
        {
            if( doc != null )
            {
                doc.close();
            }
            throw io;
        }
    }

    /**
     * This will create a PDF document with some text in it.
     * <br>
     * see usage() for commandline
     *
     * @param args Command line arguments.
     *
     * @throws IOException If there is an error with the PDF.
     */
    public static void main(String[] args) throws IOException
    {
        // suppress the Dock icon on OS X
        System.setProperty("apple.awt.UIElement", "true");

        TextToPDF app = new TextToPDF();

        PDDocument doc = new PDDocument();
        try
        {
            if( args.length < 2 )
            {
                app.usage();
            }
            else
            {
                // the last two arguments are <outputfile> <textfile>; everything before is options
                for( int i=0; i<args.length-2; i++ )
                {
                    if( args[i].equals( "-standardFont" ))
                    {
                        i++;
                        PDType1Font standardFont = getStandardFont( args[i] );
                        if (standardFont == null)
                        {
                            // fail here with a clear message instead of a later NullPointerException
                            throw new IOException( "Unknown standard font: " + args[i] );
                        }
                        app.setFont( standardFont );
                    }
                    else if( args[i].equals( "-ttf" ))
                    {
                        i++;
                        PDFont font = PDType0Font.load( doc, new File( args[i]) );
                        app.setFont( font );
                    }
                    else if( args[i].equals( "-fontSize" ))
                    {
                        i++;
                        app.setFontSize( Integer.parseInt( args[i] ) );
                    }
                    else if( args[i].equals( "-pageSize" ))
                    {
                        i++;
                        PDRectangle rectangle = createRectangle(args[i]);
                        if (rectangle == null)
                        {
                            throw new IOException("Unknown argument: " + args[i]);
                        }
                        app.setMediaBox(rectangle);
                    }
                    else if( args[i].equals( "-landscape" ))
                    {
                        app.setLandscape(true);
                    }
                    else
                    {
                        throw new IOException( "Unknown argument: " + args[i] );
                    }
                }

                // close the input file even if the conversion fails
                Reader input = new FileReader( args[args.length-1] );
                try
                {
                    app.createPDFFromText( doc, input );
                }
                finally
                {
                    input.close();
                }
                doc.save( args[args.length-2] );
            }
        }
        finally
        {
            doc.close();
        }
    }

    /**
     * Maps a paper size name (case-insensitive) to the matching page rectangle.
     *
     * @param paperSize One of letter, legal, A0 ... A6.
     *
     * @return The page rectangle, or null if the name is not recognized.
     */
    private static PDRectangle createRectangle( String paperSize )
    {
        if ("letter".equalsIgnoreCase(paperSize))
        {
            return PDRectangle.LETTER;
        }
        else if ("legal".equalsIgnoreCase(paperSize))
        {
            return PDRectangle.LEGAL;
        }
        else if ("A0".equalsIgnoreCase(paperSize))
        {
            return PDRectangle.A0;
        }
        else if ("A1".equalsIgnoreCase(paperSize))
        {
            return PDRectangle.A1;
        }
        else if ("A2".equalsIgnoreCase(paperSize))
        {
            return PDRectangle.A2;
        }
        else if ("A3".equalsIgnoreCase(paperSize))
        {
            return PDRectangle.A3;
        }
        else if ("A4".equalsIgnoreCase(paperSize))
        {
            return PDRectangle.A4;
        }
        else if ("A5".equalsIgnoreCase(paperSize))
        {
            return PDRectangle.A5;
        }
        else if ("A6".equalsIgnoreCase(paperSize))
        {
            return PDRectangle.A6;
        }
        else
        {
            return null;
        }
    }

    /**
     * This will print out a message telling how to use this example, then exit with status 1.
     */
    private void usage()
    {
        String[] std14 = getStandard14Names();

        StringBuilder message = new StringBuilder();
        message.append("Usage: java -jar pdfbox-app-x.y.z.jar TextToPDF [options] <outputfile> <textfile>\n");
        message.append("\nOptions:\n");
        message.append("  -standardFont <name> : ").append(DEFAULT_FONT.getBaseFont()).append(" (default)\n");

        for (String std14String : std14)
        {
            message.append("                         ").append(std14String).append("\n");
        }
        message.append("  -ttf <ttf file>      : The TTF font to use.\n");
        message.append("  -fontSize <fontSize> : default: ").append(DEFAULT_FONT_SIZE).append("\n");
        message.append("  -pageSize <pageSize> : Letter (default)\n");
        message.append("                         Legal\n");
        message.append("                         A0\n");
        message.append("                         A1\n");
        message.append("                         A2\n");
        message.append("                         A3\n");
        message.append("                         A4\n");
        message.append("                         A5\n");
        message.append("                         A6\n");
        message.append("  -landscape           : sets orientation to landscape" );

        System.err.println(message.toString());
        System.exit(1);
    }


    /**
     * A convenience method to get one of the standard 14 font from name.
     *
     * @param name The name of the font to get.
     *
     * @return The font that matches the name or null if it does not exist.
     */
    private static PDType1Font getStandardFont(String name)
    {
        return STANDARD_14.get(name);
    }

    /**
     * This will get the names of the standard 14 fonts.
     *
     * @return An array of the names of the standard 14 fonts.
     */
    private static String[] getStandard14Names()
    {
        return STANDARD_14.keySet().toArray(new String[STANDARD_14.size()]);
    }


    /**
     * @return Returns the font.
     */
    public PDFont getFont()
    {
        return font;
    }
    /**
     * @param aFont The font to set.
     */
    public void setFont(PDFont aFont)
    {
        this.font = aFont;
    }
    /**
     * @return Returns the fontSize.
     */
    public int getFontSize()
    {
        return fontSize;
    }
    /**
     * @param aFontSize The fontSize to set.
     */
    public void setFontSize(int aFontSize)
    {
        this.fontSize = aFontSize;
    }

    /**
     * Gets page size of produced PDF.
     *
     * @return returns the page size (media box)
     */
    public PDRectangle getMediaBox()
    {
        return mediaBox;
    }

    /**
     * Sets page size of produced PDF.
     *
     * @param mediaBox The page size (media box) to use.
     */
    public void setMediaBox(PDRectangle mediaBox)
    {
        this.mediaBox = mediaBox;
    }

    /**
     * Tells the paper orientation.
     *
     * @return true for landscape orientation
     */
    public boolean isLandscape()
    {
        return landscape;
    }

    /**
     * Sets paper orientation.
     *
     * @param landscape true for landscape orientation
     */
    public void setLandscape(boolean landscape)
    {
        this.landscape = landscape;
    }
}

2.PDFToImage:pdf文件转图像(img)代码示例

package org.apache.pdfbox.tools;

import java.awt.HeadlessException;
import java.awt.Toolkit;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;

import javax.imageio.ImageIO;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;

/**
 * Convert a PDF document to an image.
 *
 * Each page in the selected range is rendered to its own file named
 * <code>&lt;prefix&gt;&lt;pageNumber&gt;.&lt;format&gt;</code>.
 *
 * @author Ben Litchfield
 */
public final class PDFToImage
{
    private static final String PASSWORD = "-password";
    private static final String START_PAGE = "-startPage";
    private static final String END_PAGE = "-endPage";
    private static final String PAGE = "-page";
    private static final String IMAGE_TYPE = "-imageType";
    private static final String FORMAT = "-format";
    private static final String OUTPUT_PREFIX = "-outputPrefix";
    private static final String PREFIX = "-prefix";
    private static final String COLOR = "-color";
    private static final String RESOLUTION = "-resolution";
    private static final String DPI = "-dpi";
    private static final String CROPBOX = "-cropbox";
    private static final String TIME = "-time";

    /**
     * private constructor.
    */
    private PDFToImage()
    {
        //static class
    }

    /**
     * Infamous main method.
     *
     * @param args Command line arguments, should be one and a reference to a file.
     *
     * @throws IOException If there is an error parsing the document.
     */
    public static void main( String[] args ) throws IOException
    {
        try
        {
            // force KCMS (faster than LCMS) if available
            Class.forName("sun.java2d.cmm.kcms.KcmsServiceProvider");
            System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
        }
        catch (ClassNotFoundException e)
        {
            // do nothing: KCMS is optional, the default colour management is used instead
        }

        // suppress the Dock icon on OS X
        System.setProperty("apple.awt.UIElement", "true");

        String password = "";
        String pdfFile = null;
        String outputPrefix = null;
        String imageFormat = "jpg";
        int startPage = 1;
        int endPage = Integer.MAX_VALUE;
        String color = "rgb";
        int dpi;
        float cropBoxLowerLeftX = 0;
        float cropBoxLowerLeftY = 0;
        float cropBoxUpperRightX = 0;
        float cropBoxUpperRightY = 0;
        boolean showTime = false;
        try
        {
            dpi = Toolkit.getDefaultToolkit().getScreenResolution();
        }
        catch( HeadlessException e )
        {
            // no display available: fall back to a fixed resolution
            dpi = 96;
        }

        // Every option that takes a value checks that the value is present; usage() exits.
        for( int i = 0; i < args.length; i++ )
        {
            if( args[i].equals( PASSWORD ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                password = args[i];
            }
            else if( args[i].equals( START_PAGE ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                startPage = Integer.parseInt( args[i] );
            }
            else if( args[i].equals( END_PAGE ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                endPage = Integer.parseInt( args[i] );
            }
            else if( args[i].equals( PAGE ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                startPage = Integer.parseInt( args[i] );
                endPage = startPage;
            }
            else if( args[i].equals(IMAGE_TYPE) || args[i].equals(FORMAT) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                imageFormat = args[i];
            }
            else if( args[i].equals( OUTPUT_PREFIX ) || args[i].equals( PREFIX ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                outputPrefix = args[i];
            }
            else if( args[i].equals( COLOR ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                color = args[i];
            }
            else if( args[i].equals( RESOLUTION ) || args[i].equals( DPI ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                dpi = Integer.parseInt(args[i]);
            }
            else if( args[i].equals( CROPBOX ) )
            {
                // the option is followed by exactly four coordinates
                if( i + 4 >= args.length )
                {
                    usage();
                }
                i++;
                cropBoxLowerLeftX = Float.parseFloat(args[i]);
                i++;
                cropBoxLowerLeftY = Float.parseFloat(args[i]);
                i++;
                cropBoxUpperRightX = Float.parseFloat(args[i]);
                i++;
                cropBoxUpperRightY = Float.parseFloat(args[i]);
            }
            else if( args[i].equals( TIME ) )
            {
                showTime = true;
            }
            else
            {
                // the first non-option argument is the input file; later ones are ignored
                if( pdfFile == null )
                {
                    pdfFile = args[i];
                }
            }
        }
        if( pdfFile == null )
        {
            usage();
        }
        else
        {
            if(outputPrefix == null)
            {
                outputPrefix = stripExtension(pdfFile);
            }

            PDDocument document = null;
            try
            {
                document = PDDocument.load(new File(pdfFile), password);

                ImageType imageType = null;
                if ("bilevel".equalsIgnoreCase(color))
                {
                    imageType = ImageType.BINARY;
                }
                else if ("gray".equalsIgnoreCase(color))
                {
                    imageType = ImageType.GRAY;
                }
                else if ("rgb".equalsIgnoreCase(color))
                {
                    imageType = ImageType.RGB;
                }
                else if ("rgba".equalsIgnoreCase(color))
                {
                    imageType = ImageType.ARGB;
                }

                if (imageType == null)
                {
                    System.err.println( "Error: Invalid color." );
                    System.exit( 2 );
                }

                //if a CropBox has been specified, update the CropBox:
                //changeCropBoxes(PDDocument document,float a, float b, float c,float d)
                if ( cropBoxLowerLeftX!=0 || cropBoxLowerLeftY!=0
                        || cropBoxUpperRightX!=0 || cropBoxUpperRightY!=0 )
                {
                    changeCropBox(document,
                            cropBoxLowerLeftX, cropBoxLowerLeftY,
                            cropBoxUpperRightX, cropBoxUpperRightY);
                }

                long startTime = System.nanoTime();

                // render the pages
                boolean success = true;
                // clamp the 1-based page range to the pages that actually exist
                int firstPage = Math.max(startPage, 1);
                endPage = Math.min(endPage, document.getNumberOfPages());
                PDFRenderer renderer = new PDFRenderer(document);
                for (int i = firstPage - 1; i < endPage; i++)
                {
                    BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);
                    String fileName = outputPrefix + (i + 1) + "." + imageFormat;
                    success &= ImageIOUtil.writeImage(image, fileName, dpi);
                }

                // performance stats
                long endTime = System.nanoTime();
                long duration = endTime - startTime;
                // an empty range (start beyond end) renders nothing, never a negative count
                int count = Math.max(0, 1 + endPage - firstPage);
                if (showTime)
                {
                    System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
                                      duration / 1000000);
                }

                if (!success)
                {
                    System.err.println( "Error: no writer found for image format '"
                            + imageFormat + "'" );
                    System.exit(1);
                }
            }
            finally
            {
                if( document != null )
                {
                    document.close();
                }
            }
        }
    }

    /**
     * Removes the file extension from a path, if it has one.
     *
     * Only a dot inside the last path segment counts as an extension separator, so a dot in
     * a directory name is left alone and a name without any dot is returned unchanged.
     *
     * @param path The path of the input file.
     *
     * @return The path without its extension.
     */
    private static String stripExtension(String path)
    {
        int lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        int lastDot = path.lastIndexOf('.');
        if (lastDot > lastSeparator)
        {
            return path.substring(0, lastDot);
        }
        return path;
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage()
    {
        String message = "Usage: java -jar pdfbox-app-x.y.z.jar PDFToImage [options] <inputfile>\n"
            + "\nOptions:\n"
            + "  -password  <password>            : Password to decrypt document\n"
            + "  -format <string>                 : Image format: " + getImageFormats() + "\n"
            + "  -prefix <string>                 : Filename prefix for image files\n"
            + "  -page <number>                   : The only page to extract (1-based)\n"
            + "  -startPage <int>                 : The first page to start extraction (1-based)\n"
            + "  -endPage <int>                   : The last page to extract(inclusive)\n"
            + "  -color <string>                  : The color depth (valid: bilevel, gray, rgb, rgba)\n"
            + "  -dpi <int>                       : The DPI of the output image\n"
            + "  -cropbox <int> <int> <int> <int> : The page area to export\n"
            + "  -time                            : Prints timing information to stderr\n"
            + "  <inputfile>                      : The PDF document to use\n";

        System.err.println(message);
        System.exit( 1 );
    }

    /**
     * Lists the image formats that can be written, for the usage message.
     *
     * ImageIO reports the same format under several spellings (e.g. "jpg" and "JPG"), so the
     * names are lower-cased and de-duplicated; the result is sorted alphabetically.
     *
     * @return A comma separated list of format names.
     */
    private static String getImageFormats()
    {
        Set<String> uniqueNames = new TreeSet<String>();
        for (String formatName : ImageIO.getWriterFormatNames())
        {
            uniqueNames.add(formatName.toLowerCase(Locale.ROOT));
        }

        StringBuilder retval = new StringBuilder();
        for (String formatName : uniqueNames)
        {
            if (retval.length() > 0)
            {
                retval.append( ", " );
            }
            retval.append( formatName );
        }
        return retval.toString();
    }

    /**
     * Sets the same crop box on every page of the document.
     *
     * @param document The document whose pages are changed.
     * @param a The lower left x coordinate of the crop box.
     * @param b The lower left y coordinate of the crop box.
     * @param c The upper right x coordinate of the crop box.
     * @param d The upper right y coordinate of the crop box.
     */
    private static void changeCropBox(PDDocument document, float a, float b, float c, float d)
    {
        for (PDPage page : document.getPages())
        {
            System.out.println("resizing page");
            PDRectangle rectangle = new PDRectangle();
            rectangle.setLowerLeftX(a);
            rectangle.setLowerLeftY(b);
            rectangle.setUpperRightX(c);
            rectangle.setUpperRightY(d);
            page.setCropBox(rectangle);

        }
    }
}

3.PDFText2HTML:pdf文本转HTML文件代码示例

package org.apache.pdfbox.tools;@b@@b@import java.io.IOException;@b@import java.util.ArrayList;@b@import java.util.HashSet;@b@import java.util.Iterator;@b@import java.util.List;@b@import java.util.Set;@b@@b@import org.apache.pdfbox.pdmodel.PDDocument;@b@import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;@b@import org.apache.pdfbox.text.PDFTextStripper;@b@import org.apache.pdfbox.text.TextPosition;@b@@b@/**@b@ * Wrap stripped text in simple HTML, trying to form HTML paragraphs. Paragraphs@b@ * broken by pages, columns, or figures are not mended.@b@ *@b@ * @author John J Barton@b@ * @b@ */@b@public class PDFText2HTML extends PDFTextStripper@b@{@b@    private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;@b@@b@    private final FontState fontState = new FontState();@b@@b@    /**@b@     * Constructor.@b@     * @throws IOException If there is an error during initialization.@b@     */@b@    public PDFText2HTML() throws IOException@b@    {@b@        super();@b@        setLineSeparator(LINE_SEPARATOR);@b@        setParagraphStart("<p>");@b@        setParagraphEnd("</p>"+ LINE_SEPARATOR);@b@        setPageStart("<div style=\"page-break-before:always; page-break-after:always\">");@b@        setPageEnd("</div>"+ LINE_SEPARATOR);@b@        setArticleStart(LINE_SEPARATOR);@b@        setArticleEnd(LINE_SEPARATOR);@b@    }@b@@b@    /**@b@     * Write the header to the output document. 
Now also writes the tag defining@b@     * the character encoding.@b@     *@b@     * @throws IOException@b@     *             If there is a problem writing out the header to the document.@b@     * @deprecated deprecated, use {@link #startDocument(PDDocument)}@b@     */@b@    protected void writeHeader() throws IOException@b@    {@b@    }@b@@b@    @Override@b@    protected void startDocument(PDDocument document) throws IOException@b@    {@b@        StringBuilder buf = new StringBuilder(INITIAL_PDF_TO_HTML_BYTES);@b@        buf.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"" + "\n"@b@                + "\"http://www.w3.org/TR/html4/loose.dtd\">\n");@b@        buf.append("<html><head>");@b@        buf.append("<title>").append(escape(getTitle())).append("</title>\n");@b@        buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=\"UTF-8\">\n");@b@        buf.append("</head>\n");@b@        buf.append("<body>\n");@b@        super.writeString(buf.toString());@b@    }@b@    @b@    /**@b@     * {@inheritDoc}@b@     */@b@    @Override@b@    public void endDocument(PDDocument document) throws IOException@b@    {@b@        super.writeString("</body></html>");@b@    }@b@@b@    /**@b@     * This method will attempt to guess the title of the document using@b@     * either the document properties or the first lines of text.@b@     *@b@     * @return returns the title.@b@     */@b@    protected String getTitle()@b@    {@b@        String titleGuess = document.getDocumentInformation().getTitle();@b@        if(titleGuess != null && titleGuess.length() > 0)@b@        {@b@            return titleGuess;@b@        }@b@        else@b@        {@b@            Iterator<List<TextPosition>> textIter = getCharactersByArticle().iterator();@b@            float lastFontSize = -1.0f;@b@@b@            StringBuilder titleText = new StringBuilder();@b@            while (textIter.hasNext())@b@            {@b@                for (TextPosition position : 
textIter.next())@b@                {@b@                    float currentFontSize = position.getFontSize();@b@                    //If we're past 64 chars we will assume that we're past the title@b@                    //64 is arbitrary@b@                    if (currentFontSize != lastFontSize || titleText.length() > 64)@b@                    {@b@                        if (titleText.length() > 0)@b@                        {@b@                            return titleText.toString();@b@                        }@b@                        lastFontSize = currentFontSize;@b@                    }@b@                    if (currentFontSize > 13.0f)@b@                    { // most body text is 12pt@b@                        titleText.append(position.getUnicode());@b@                    }@b@                }@b@            }@b@        }@b@        return "";@b@    }@b@@b@@b@    /**@b@     * Write out the article separator (div tag) with proper text direction@b@     * information.@b@     *@b@     * @param isLTR true if direction of text is left to right@b@     * @throws IOException@b@     *             If there is an error writing to the stream.@b@     */@b@    @Override@b@    protected void startArticle(boolean isLTR) throws IOException@b@    {@b@        if (isLTR)@b@        {@b@            super.writeString("<div>");@b@        }@b@        else@b@        {@b@            super.writeString("<div dir=\"RTL\">");@b@        }@b@    }@b@@b@    /**@b@     * Write out the article separator.@b@     *@b@     * @throws IOException@b@     *             If there is an error writing to the stream.@b@     */@b@    @Override@b@    protected void endArticle() throws IOException@b@    {@b@        super.endArticle();@b@        super.writeString("</div>");@b@    }@b@@b@    /**@b@     * Write a string to the output stream, maintain font state, and escape some HTML characters.@b@     * The font state is only preserved per word.@b@     *@b@     * @param text The text to write to the stream.@b@     * 
@param textPositions the corresponding text positions@b@     * @throws IOException If there is an error writing to the stream.@b@     */@b@    @Override@b@    protected void writeString(String text, List<TextPosition> textPositions) throws IOException@b@    {@b@        super.writeString(fontState.push(text, textPositions));@b@    }@b@@b@    /**@b@     * Write a string to the output stream and escape some HTML characters.@b@     *@b@     * @param chars String to be written to the stream@b@     * @throws IOException@b@     *             If there is an error writing to the stream.@b@     */@b@    @Override@b@    protected void writeString(String chars) throws IOException@b@    {@b@        super.writeString(escape(chars));@b@    }@b@@b@    /**@b@     * Writes the paragraph end "&lt;/p&gt;" to the output. Furthermore, it will also clear the font state.@b@     * @b@     * {@inheritDoc}@b@     */@b@    @Override@b@    protected void writeParagraphEnd() throws IOException@b@    {@b@        // do not escape HTML@b@        super.writeString(fontState.clear());@b@        @b@        super.writeParagraphEnd();@b@    }@b@@b@    /**@b@     * Escape some HTML characters.@b@     *@b@     * @param chars String to be escaped@b@     * @return returns escaped String.@b@     */@b@    private static String escape(String chars)@b@    {@b@        StringBuilder builder = new StringBuilder(chars.length());@b@        for (int i = 0; i < chars.length(); i++)@b@        {@b@            appendEscaped(builder, chars.charAt(i));@b@        }@b@        return builder.toString();@b@    }@b@@b@    private static void appendEscaped(StringBuilder builder, char character)@b@    {@b@        // write non-ASCII as named entities@b@        if ((character < 32) || (character > 126))@b@        {@b@            int charAsInt = character;@b@            builder.append("&#").append(charAsInt).append(";");@b@        }@b@        else@b@        {@b@            switch (character)@b@            {@b@            case 
34:@b@                builder.append("&quot;");@b@                break;@b@            case 38:@b@                builder.append("&amp;");@b@                break;@b@            case 60:@b@                builder.append("&lt;");@b@                break;@b@            case 62:@b@                builder.append("&gt;");@b@                break;@b@            default:@b@                builder.append(String.valueOf(character));@b@            }@b@        }@b@    }@b@@b@    /**@b@     * A helper class to maintain the current font state. It's public methods will emit opening and@b@     * closing tags as needed, and in the correct order.@b@     *@b@     * @author Axel Dörfler@b@     */@b@    private static class FontState@b@    {@b@        private final List<String> stateList = new ArrayList<String>();@b@        private final Set<String> stateSet = new HashSet<String>();@b@@b@        /**@b@         * Pushes new {@link TextPosition TextPositions} into the font state. The state is only@b@         * preserved correctly for each letter if the number of letters in <code>text</code> matches@b@         * the number of {@link TextPosition} objects. 
Otherwise, it's done once for the complete@b@         * array (just by looking at its first entry).@b@         *@b@         * @return A string that contains the text including tag changes caused by its font state.@b@         */@b@        public String push(String text, List<TextPosition> textPositions)@b@        {@b@            StringBuilder buffer = new StringBuilder();@b@@b@            if (text.length() == textPositions.size())@b@            {@b@                // There is a 1:1 mapping, and we can use the TextPositions directly@b@                for (int i = 0; i < text.length(); i++)@b@                {@b@                    push(buffer, text.charAt(i), textPositions.get(i));@b@                }@b@            }@b@            else if (!text.isEmpty())@b@            {@b@                // The normalized text does not match the number of TextPositions, so we'll just@b@                // have a look at its first entry.@b@                // TODO change PDFTextStripper.normalize() such that it maintains the 1:1 relation@b@                if (textPositions.isEmpty())@b@                {@b@                    return text;@b@                }@b@                push(buffer, text.charAt(0), textPositions.get(0));@b@                buffer.append(escape(text.substring(1)));@b@            }@b@            return buffer.toString();@b@        }@b@@b@        /**@b@         * Closes all open states.@b@         * @return A string that contains the closing tags of all currently open states.@b@         */@b@        public String clear()@b@        {@b@            StringBuilder buffer = new StringBuilder();@b@            closeUntil(buffer, null);@b@            stateList.clear();@b@            stateSet.clear();@b@            return buffer.toString();@b@        }@b@@b@        protected String push(StringBuilder buffer, char character, TextPosition textPosition)@b@        {@b@            boolean bold = false;@b@            boolean italics = false;@b@@b@            PDFontDescriptor 
descriptor = textPosition.getFont().getFontDescriptor();@b@            if (descriptor != null)@b@            {@b@                bold = isBold(descriptor);@b@                italics = isItalic(descriptor);@b@            }@b@            @b@            buffer.append(bold ? open("b") : close("b"));@b@            buffer.append(italics ? open("i") : close("i"));@b@            appendEscaped(buffer, character);@b@@b@            return buffer.toString();@b@        }@b@@b@        private String open(String tag)@b@        {@b@            if (stateSet.contains(tag))@b@            {@b@                return "";@b@            }@b@            stateList.add(tag);@b@            stateSet.add(tag);@b@@b@            return openTag(tag);@b@        }@b@@b@        private String close(String tag)@b@        {@b@            if (!stateSet.contains(tag))@b@            {@b@                return "";@b@            }@b@            // Close all tags until (but including) the one we should close@b@            StringBuilder tagsBuilder = new StringBuilder();@b@            int index = closeUntil(tagsBuilder, tag);@b@@b@            // Remove from state@b@            stateList.remove(index);@b@            stateSet.remove(tag);@b@@b@            // Now open the states that were closed but should remain open again@b@            for (; index < stateList.size(); index++)@b@            {@b@                tagsBuilder.append(openTag(stateList.get(index)));@b@            }@b@            return tagsBuilder.toString();@b@        }@b@@b@        private int closeUntil(StringBuilder tagsBuilder, String endTag)@b@        {@b@            for (int i = stateList.size(); i-- > 0;)@b@            {@b@                String tag = stateList.get(i);@b@                tagsBuilder.append(closeTag(tag));@b@                if (endTag != null && tag.equals(endTag))@b@                {@b@                    return i;@b@                }@b@            }@b@            return -1;@b@        }@b@@b@        private String 
openTag(String tag)@b@        {@b@            return "<" + tag + ">";@b@        }@b@@b@        private String closeTag(String tag)@b@        {@b@            return "</" + tag + ">";@b@        }@b@@b@        private boolean isBold(PDFontDescriptor descriptor)@b@        {@b@            if (descriptor.isForceBold())@b@            {@b@                return true;@b@            }@b@            return descriptor.getFontName().contains("Bold");@b@        }@b@@b@        private boolean isItalic(PDFontDescriptor descriptor)@b@        {@b@            if (descriptor.isItalic())@b@            {@b@                return true;@b@            }@b@            return descriptor.getFontName().contains("Italic");@b@        }@b@    }@b@}





<<热门下载>>