一、前言
下面基于pdfbox的PDF文档模型类org.apache.pdfbox.pdmodel.PDDocument,给出org.apache.pdfbox.tools.TextToPDF、org.apache.pdfbox.tools.PDFToImage、org.apache.pdfbox.tools.PDFText2HTML三个工具类的代码示例,分别实现文本转PDF文件、PDF转图像、PDF文本转HTML文件。
二、代码示例
1.TextToPDF文本转pdf文件示例
package org.apache.pdfbox.tools;@b@@b@import java.io.BufferedReader;@b@import java.io.File;@b@import java.io.FileReader;@b@import java.io.IOException;@b@import java.io.Reader;@b@import java.util.HashMap;@b@import java.util.Map;@b@import org.apache.pdfbox.pdmodel.PDDocument;@b@import org.apache.pdfbox.pdmodel.PDPage;@b@import org.apache.pdfbox.pdmodel.PDPageContentStream;@b@import org.apache.pdfbox.pdmodel.common.PDRectangle;@b@import org.apache.pdfbox.pdmodel.font.PDFont;@b@import org.apache.pdfbox.pdmodel.font.PDType0Font;@b@import org.apache.pdfbox.pdmodel.font.PDType1Font;@b@@b@/**@b@ * This will take a text file and ouput a pdf with that text.@b@ *@b@ * @author Ben Litchfield@b@ */@b@public class TextToPDF@b@{@b@ /**@b@ * The scaling factor for font units to PDF units@b@ */@b@ private static final int FONTSCALE = 1000;@b@ @b@ /**@b@ * The default font@b@ */@b@ private static final PDType1Font DEFAULT_FONT = PDType1Font.HELVETICA;@b@@b@ /**@b@ * The default font size@b@ */@b@ private static final int DEFAULT_FONT_SIZE = 10;@b@ @b@ /**@b@ * The line height as a factor of the font size@b@ */@b@ private static final float LINE_HEIGHT_FACTOR = 1.05f;@b@@b@ private int fontSize = DEFAULT_FONT_SIZE;@b@ private PDRectangle mediaBox = PDRectangle.LETTER;@b@ private boolean landscape = false;@b@ private PDFont font = DEFAULT_FONT;@b@@b@ private static final Map<String, PDType1Font> STANDARD_14 = new HashMap<String, PDType1Font>();@b@ static@b@ {@b@ STANDARD_14.put(PDType1Font.TIMES_ROMAN.getBaseFont(), PDType1Font.TIMES_ROMAN);@b@ STANDARD_14.put(PDType1Font.TIMES_BOLD.getBaseFont(), PDType1Font.TIMES_BOLD);@b@ STANDARD_14.put(PDType1Font.TIMES_ITALIC.getBaseFont(), PDType1Font.TIMES_ITALIC);@b@ STANDARD_14.put(PDType1Font.TIMES_BOLD_ITALIC.getBaseFont(), PDType1Font.TIMES_BOLD_ITALIC);@b@ STANDARD_14.put(PDType1Font.HELVETICA.getBaseFont(), PDType1Font.HELVETICA);@b@ STANDARD_14.put(PDType1Font.HELVETICA_BOLD.getBaseFont(), PDType1Font.HELVETICA_BOLD);@b@ 
STANDARD_14.put(PDType1Font.HELVETICA_OBLIQUE.getBaseFont(), PDType1Font.HELVETICA_OBLIQUE);@b@ STANDARD_14.put(PDType1Font.HELVETICA_BOLD_OBLIQUE.getBaseFont(), PDType1Font.HELVETICA_BOLD_OBLIQUE);@b@ STANDARD_14.put(PDType1Font.COURIER.getBaseFont(), PDType1Font.COURIER);@b@ STANDARD_14.put(PDType1Font.COURIER_BOLD.getBaseFont(), PDType1Font.COURIER_BOLD);@b@ STANDARD_14.put(PDType1Font.COURIER_OBLIQUE.getBaseFont(), PDType1Font.COURIER_OBLIQUE);@b@ STANDARD_14.put(PDType1Font.COURIER_BOLD_OBLIQUE.getBaseFont(), PDType1Font.COURIER_BOLD_OBLIQUE);@b@ STANDARD_14.put(PDType1Font.SYMBOL.getBaseFont(), PDType1Font.SYMBOL);@b@ STANDARD_14.put(PDType1Font.ZAPF_DINGBATS.getBaseFont(), PDType1Font.ZAPF_DINGBATS);@b@ }@b@@b@ /**@b@ * Create a PDF document with some text.@b@ *@b@ * @param text The stream of text data.@b@ *@b@ * @return The document with the text in it.@b@ *@b@ * @throws IOException If there is an error writing the data.@b@ */@b@ public PDDocument createPDFFromText( Reader text ) throws IOException@b@ {@b@ PDDocument doc = new PDDocument();@b@ createPDFFromText(doc, text);@b@ return doc;@b@ }@b@@b@ /**@b@ * Create a PDF document with some text.@b@ *@b@ * @param doc The document.@b@ * @param text The stream of text data.@b@ *@b@ * @throws IOException If there is an error writing the data.@b@ */@b@ public void createPDFFromText( PDDocument doc, Reader text ) throws IOException@b@ {@b@ try@b@ {@b@@b@ final int margin = 40;@b@ float height = font.getBoundingBox().getHeight() / FONTSCALE;@b@ PDRectangle actualMediaBox = mediaBox;@b@ if (landscape)@b@ {@b@ actualMediaBox = new PDRectangle(mediaBox.getHeight(), mediaBox.getWidth());@b@ }@b@@b@ //calculate font height and increase by a factor.@b@ height = height*fontSize*LINE_HEIGHT_FACTOR;@b@ BufferedReader data = new BufferedReader( text );@b@ String nextLine;@b@ PDPage page = new PDPage(actualMediaBox);@b@ PDPageContentStream contentStream = null;@b@ float y = -1;@b@ float maxStringLength = 
page.getMediaBox().getWidth() - 2*margin;@b@@b@ // There is a special case of creating a PDF document from an empty string.@b@ boolean textIsEmpty = true;@b@@b@ while( (nextLine = data.readLine()) != null )@b@ {@b@@b@ // The input text is nonEmpty. New pages will be created and added@b@ // to the PDF document as they are needed, depending on the length of@b@ // the text.@b@ textIsEmpty = false;@b@@b@ String[] lineWords = nextLine.replaceAll("[\\n\\r]+$", "").split(" ");@b@ int lineIndex = 0;@b@ while( lineIndex < lineWords.length )@b@ {@b@ StringBuilder nextLineToDraw = new StringBuilder();@b@ float lengthIfUsingNextWord = 0;@b@ boolean ff = false;@b@ do@b@ {@b@ String word1, word2 = "";@b@ int indexFF = lineWords[lineIndex].indexOf('\f');@b@ if (indexFF == -1)@b@ {@b@ word1 = lineWords[lineIndex];@b@ }@b@ else@b@ {@b@ ff = true;@b@ word1 = lineWords[lineIndex].substring(0, indexFF);@b@ if (indexFF < lineWords[lineIndex].length())@b@ {@b@ word2 = lineWords[lineIndex].substring(indexFF + 1);@b@ }@b@ }@b@ // word1 is the part before ff, word2 after@b@ // both can be empty@b@ // word1 can also be empty without ff, if a line has many spaces@b@ if (word1.length() > 0 || !ff)@b@ {@b@ nextLineToDraw.append(word1);@b@ nextLineToDraw.append(" ");@b@ }@b@ if (!ff || word2.length() == 0)@b@ {@b@ lineIndex++;@b@ }@b@ else@b@ {@b@ lineWords[lineIndex] = word2;@b@ }@b@ if (ff)@b@ {@b@ break;@b@ }@b@ if( lineIndex < lineWords.length )@b@ {@b@ // need cut off at \f in next word to avoid IllegalArgumentException@b@ String nextWord = lineWords[lineIndex];@b@ indexFF = nextWord.indexOf('\f');@b@ if (indexFF != -1)@b@ {@b@ nextWord = nextWord.substring(0, indexFF);@b@ }@b@ @b@ String lineWithNextWord = nextLineToDraw.toString() + " " + nextWord;@b@ lengthIfUsingNextWord =@b@ (font.getStringWidth( lineWithNextWord )/FONTSCALE) * fontSize;@b@ }@b@ }@b@ while (lineIndex < lineWords.length && lengthIfUsingNextWord < maxStringLength);@b@@b@ if( y < margin )@b@ {@b@ // We have crossed the 
end-of-page boundary and need to extend the@b@ // document by another page.@b@ page = new PDPage(actualMediaBox);@b@ doc.addPage( page );@b@ if( contentStream != null )@b@ {@b@ contentStream.endText();@b@ contentStream.close();@b@ }@b@ contentStream = new PDPageContentStream(doc, page);@b@ contentStream.setFont( font, fontSize );@b@ contentStream.beginText();@b@ y = page.getMediaBox().getHeight() - margin + height;@b@ contentStream.newLineAtOffset(margin, y);@b@ }@b@@b@ if( contentStream == null )@b@ {@b@ throw new IOException( "Error:Expected non-null content stream." );@b@ }@b@ contentStream.newLineAtOffset(0, -height);@b@ y -= height;@b@ contentStream.showText(nextLineToDraw.toString());@b@ if (ff)@b@ {@b@ page = new PDPage(actualMediaBox);@b@ doc.addPage(page);@b@ contentStream.endText();@b@ contentStream.close();@b@ contentStream = new PDPageContentStream(doc, page);@b@ contentStream.setFont(font, fontSize);@b@ contentStream.beginText();@b@ y = page.getMediaBox().getHeight() - margin + height;@b@ contentStream.newLineAtOffset(margin, y);@b@ }@b@ }@b@ }@b@@b@ // If the input text was the empty string, then the above while loop will have short-circuited@b@ // and we will not have added any PDPages to the document.@b@ // So in order to make the resultant PDF document readable by Adobe Reader etc, we'll add an empty page.@b@ if (textIsEmpty)@b@ {@b@ doc.addPage(page);@b@ }@b@@b@ if( contentStream != null )@b@ {@b@ contentStream.endText();@b@ contentStream.close();@b@ }@b@ }@b@ catch( IOException io )@b@ {@b@ if( doc != null )@b@ {@b@ doc.close();@b@ }@b@ throw io;@b@ }@b@ }@b@@b@ /**@b@ * This will create a PDF document with some text in it.@b@ * <br>@b@ * see usage() for commandline@b@ *@b@ * @param args Command line arguments.@b@ *@b@ * @throws IOException If there is an error with the PDF.@b@ */@b@ public static void main(String[] args) throws IOException@b@ {@b@ // suppress the Dock icon on OS X@b@ System.setProperty("apple.awt.UIElement", "true");@b@@b@ 
TextToPDF app = new TextToPDF();@b@ @b@ PDDocument doc = new PDDocument();@b@ try@b@ {@b@ if( args.length < 2 )@b@ {@b@ app.usage();@b@ }@b@ else@b@ {@b@ for( int i=0; i<args.length-2; i++ )@b@ {@b@ if( args[i].equals( "-standardFont" ))@b@ {@b@ i++;@b@ app.setFont( getStandardFont( args[i] ));@b@ }@b@ else if( args[i].equals( "-ttf" ))@b@ {@b@ i++;@b@ PDFont font = PDType0Font.load( doc, new File( args[i]) );@b@ app.setFont( font );@b@ }@b@ else if( args[i].equals( "-fontSize" ))@b@ {@b@ i++;@b@ app.setFontSize( Integer.parseInt( args[i] ) );@b@ }@b@ else if( args[i].equals( "-pageSize" ))@b@ {@b@ i++;@b@ PDRectangle rectangle = createRectangle(args[i]);@b@ if (rectangle == null)@b@ {@b@ throw new IOException("Unknown argument: " + args[i]);@b@ }@b@ app.setMediaBox(rectangle);@b@ }@b@ else if( args[i].equals( "-landscape" ))@b@ {@b@ app.setLandscape(true);@b@ } @b@ else@b@ {@b@ throw new IOException( "Unknown argument: " + args[i] );@b@ }@b@ }@b@ @b@ app.createPDFFromText( doc, new FileReader( args[args.length-1] ) );@b@ doc.save( args[args.length-2] );@b@ }@b@ }@b@ finally@b@ {@b@ doc.close();@b@ }@b@ }@b@@b@ private static PDRectangle createRectangle( String paperSize )@b@ {@b@ if ("letter".equalsIgnoreCase(paperSize))@b@ {@b@ return PDRectangle.LETTER;@b@ }@b@ else if ("legal".equalsIgnoreCase(paperSize))@b@ {@b@ return PDRectangle.LEGAL;@b@ }@b@ else if ("A0".equalsIgnoreCase(paperSize))@b@ {@b@ return PDRectangle.A0;@b@ }@b@ else if ("A1".equalsIgnoreCase(paperSize))@b@ {@b@ return PDRectangle.A1;@b@ }@b@ else if ("A2".equalsIgnoreCase(paperSize))@b@ {@b@ return PDRectangle.A2;@b@ }@b@ else if ("A3".equalsIgnoreCase(paperSize))@b@ {@b@ return PDRectangle.A3;@b@ }@b@ else if ("A4".equalsIgnoreCase(paperSize))@b@ {@b@ return PDRectangle.A4;@b@ }@b@ else if ("A5".equalsIgnoreCase(paperSize))@b@ {@b@ return PDRectangle.A5;@b@ }@b@ else if ("A6".equalsIgnoreCase(paperSize))@b@ {@b@ return PDRectangle.A6;@b@ }@b@ else@b@ {@b@ return null;@b@ }@b@ }@b@@b@ /**@b@ * 
This will print out a message telling how to use this example.@b@ */@b@ private void usage()@b@ {@b@ String[] std14 = getStandard14Names();@b@ @b@ StringBuilder message = new StringBuilder(); @b@ message.append("Usage: jar -jar pdfbox-app-x.y.z.jar TextToPDF [options] <outputfile> <textfile>\n");@b@ message.append("\nOptions:\n");@b@ message.append(" -standardFont <name> : ").append(DEFAULT_FONT.getBaseFont()).append(" (default)\n");@b@@b@ for (String std14String : std14)@b@ {@b@ message.append(" ").append(std14String).append("\n");@b@ }@b@ message.append(" -ttf <ttf file> : The TTF font to use.\n");@b@ message.append(" -fontSize <fontSize> : default: ").append(DEFAULT_FONT_SIZE).append("\n");@b@ message.append(" -pageSize <pageSize> : Letter (default)\n");@b@ message.append(" Legal\n");@b@ message.append(" A0\n");@b@ message.append(" A1\n");@b@ message.append(" A2\n");@b@ message.append(" A3\n");@b@ message.append(" A4\n");@b@ message.append(" A5\n");@b@ message.append(" A6\n");@b@ message.append(" -landscape : sets orientation to landscape" );@b@@b@ System.err.println(message.toString());@b@ System.exit(1);@b@ }@b@@b@@b@ /**@b@ * A convenience method to get one of the standard 14 font from name.@b@ *@b@ * @param name The name of the font to get.@b@ *@b@ * @return The font that matches the name or null if it does not exist.@b@ */@b@ private static PDType1Font getStandardFont(String name)@b@ {@b@ return STANDARD_14.get(name);@b@ }@b@@b@ /**@b@ * This will get the names of the standard 14 fonts.@b@ *@b@ * @return An array of the names of the standard 14 fonts.@b@ */@b@ private static String[] getStandard14Names()@b@ {@b@ return STANDARD_14.keySet().toArray(new String[14]);@b@ }@b@@b@@b@ /**@b@ * @return Returns the font.@b@ */@b@ public PDFont getFont()@b@ {@b@ return font;@b@ }@b@ /**@b@ * @param aFont The font to set.@b@ */@b@ public void setFont(PDFont aFont)@b@ {@b@ this.font = aFont;@b@ }@b@ /**@b@ * @return Returns the fontSize.@b@ */@b@ public int 
getFontSize()@b@ {@b@ return fontSize;@b@ }@b@ /**@b@ * @param aFontSize The fontSize to set.@b@ */@b@ public void setFontSize(int aFontSize)@b@ {@b@ this.fontSize = aFontSize;@b@ }@b@@b@ /**@b@ * Sets page size of produced PDF.@b@ *@b@ * @return returns the page size (media box)@b@ */@b@ public PDRectangle getMediaBox()@b@ {@b@ return mediaBox;@b@ }@b@@b@ /**@b@ * Sets page size of produced PDF.@b@ *@b@ * @param mediaBox@b@ */@b@ public void setMediaBox(PDRectangle mediaBox)@b@ {@b@ this.mediaBox = mediaBox;@b@ }@b@@b@ /**@b@ * Tells the paper orientation.@b@ *@b@ * @return true for landscape orientation@b@ */@b@ public boolean isLandscape()@b@ {@b@ return landscape;@b@ }@b@@b@ /**@b@ * Sets paper orientation.@b@ *@b@ * @param landscape@b@ */@b@ public void setLandscape(boolean landscape)@b@ {@b@ this.landscape = landscape;@b@ }@b@}
2.PDFToImage关于pdf文件转图像img代码示例
package org.apache.pdfbox.tools;@b@@b@import java.awt.HeadlessException;@b@import java.awt.Toolkit;@b@import java.awt.image.BufferedImage;@b@import java.io.File;@b@import java.io.IOException;@b@@b@import javax.imageio.ImageIO;@b@@b@import org.apache.pdfbox.pdmodel.PDDocument;@b@import org.apache.pdfbox.pdmodel.PDPage;@b@import org.apache.pdfbox.pdmodel.common.PDRectangle;@b@import org.apache.pdfbox.rendering.ImageType;@b@import org.apache.pdfbox.rendering.PDFRenderer;@b@import org.apache.pdfbox.tools.imageio.ImageIOUtil;@b@@b@/**@b@ * Convert a PDF document to an image.@b@ *@b@ * @author Ben Litchfield@b@ */@b@public final class PDFToImage@b@{@b@ private static final String PASSWORD = "-password";@b@ private static final String START_PAGE = "-startPage";@b@ private static final String END_PAGE = "-endPage";@b@ private static final String PAGE = "-page";@b@ private static final String IMAGE_TYPE = "-imageType";@b@ private static final String FORMAT = "-format";@b@ private static final String OUTPUT_PREFIX = "-outputPrefix";@b@ private static final String PREFIX = "-prefix";@b@ private static final String COLOR = "-color";@b@ private static final String RESOLUTION = "-resolution";@b@ private static final String DPI = "-dpi";@b@ private static final String CROPBOX = "-cropbox";@b@ private static final String TIME = "-time";@b@@b@ /**@b@ * private constructor.@b@ */@b@ private PDFToImage()@b@ {@b@ //static class@b@ }@b@@b@ /**@b@ * Infamous main method.@b@ *@b@ * @param args Command line arguments, should be one and a reference to a file.@b@ *@b@ * @throws IOException If there is an error parsing the document.@b@ */@b@ public static void main( String[] args ) throws IOException@b@ {@b@ try@b@ {@b@ // force KCMS (faster than LCMS) if available@b@ Class.forName("sun.java2d.cmm.kcms.KcmsServiceProvider");@b@ System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");@b@ }@b@ catch (ClassNotFoundException e)@b@ {@b@ // do nothing@b@ }@b@@b@ // suppress 
the Dock icon on OS X@b@ System.setProperty("apple.awt.UIElement", "true");@b@@b@ String password = "";@b@ String pdfFile = null;@b@ String outputPrefix = null;@b@ String imageFormat = "jpg";@b@ int startPage = 1;@b@ int endPage = Integer.MAX_VALUE;@b@ String color = "rgb";@b@ int dpi;@b@ float cropBoxLowerLeftX = 0;@b@ float cropBoxLowerLeftY = 0;@b@ float cropBoxUpperRightX = 0;@b@ float cropBoxUpperRightY = 0;@b@ boolean showTime = false;@b@ try@b@ {@b@ dpi = Toolkit.getDefaultToolkit().getScreenResolution();@b@ }@b@ catch( HeadlessException e )@b@ {@b@ dpi = 96;@b@ }@b@ for( int i = 0; i < args.length; i++ )@b@ {@b@ if( args[i].equals( PASSWORD ) )@b@ {@b@ i++;@b@ if( i >= args.length )@b@ {@b@ usage();@b@ }@b@ password = args[i];@b@ }@b@ else if( args[i].equals( START_PAGE ) )@b@ {@b@ i++;@b@ if( i >= args.length )@b@ {@b@ usage();@b@ }@b@ startPage = Integer.parseInt( args[i] );@b@ }@b@ else if( args[i].equals( END_PAGE ) )@b@ {@b@ i++;@b@ if( i >= args.length )@b@ {@b@ usage();@b@ }@b@ endPage = Integer.parseInt( args[i] );@b@ }@b@ else if( args[i].equals( PAGE ) )@b@ {@b@ i++;@b@ if( i >= args.length )@b@ {@b@ usage();@b@ }@b@ startPage = Integer.parseInt( args[i] );@b@ endPage = Integer.parseInt( args[i] );@b@ }@b@ else if( args[i].equals(IMAGE_TYPE) || args[i].equals(FORMAT) )@b@ {@b@ i++;@b@ imageFormat = args[i];@b@ }@b@ else if( args[i].equals( OUTPUT_PREFIX ) || args[i].equals( PREFIX ) )@b@ {@b@ i++;@b@ outputPrefix = args[i];@b@ }@b@ else if( args[i].equals( COLOR ) )@b@ {@b@ i++;@b@ color = args[i];@b@ }@b@ else if( args[i].equals( RESOLUTION ) || args[i].equals( DPI ) )@b@ {@b@ i++;@b@ dpi = Integer.parseInt(args[i]);@b@ }@b@ else if( args[i].equals( CROPBOX ) )@b@ {@b@ i++;@b@ cropBoxLowerLeftX = Float.valueOf(args[i]);@b@ i++;@b@ cropBoxLowerLeftY = Float.valueOf(args[i]);@b@ i++;@b@ cropBoxUpperRightX = Float.valueOf(args[i]);@b@ i++;@b@ cropBoxUpperRightY = Float.valueOf(args[i]);@b@ }@b@ else if( args[i].equals( TIME ) )@b@ {@b@ showTime = 
true;@b@ }@b@ else@b@ {@b@ if( pdfFile == null )@b@ {@b@ pdfFile = args[i];@b@ }@b@ }@b@ }@b@ if( pdfFile == null )@b@ {@b@ usage();@b@ }@b@ else@b@ {@b@ if(outputPrefix == null)@b@ {@b@ outputPrefix = pdfFile.substring( 0, pdfFile.lastIndexOf( '.' ));@b@ }@b@@b@ PDDocument document = null;@b@ try@b@ {@b@ document = PDDocument.load(new File(pdfFile), password);@b@@b@ ImageType imageType = null;@b@ if ("bilevel".equalsIgnoreCase(color))@b@ {@b@ imageType = ImageType.BINARY;@b@ }@b@ else if ("gray".equalsIgnoreCase(color))@b@ {@b@ imageType = ImageType.GRAY;@b@ }@b@ else if ("rgb".equalsIgnoreCase(color))@b@ {@b@ imageType = ImageType.RGB;@b@ }@b@ else if ("rgba".equalsIgnoreCase(color))@b@ {@b@ imageType = ImageType.ARGB;@b@ }@b@ @b@ if (imageType == null)@b@ {@b@ System.err.println( "Error: Invalid color." );@b@ System.exit( 2 );@b@ }@b@@b@ //if a CropBox has been specified, update the CropBox:@b@ //changeCropBoxes(PDDocument document,float a, float b, float c,float d)@b@ if ( cropBoxLowerLeftX!=0 || cropBoxLowerLeftY!=0@b@ || cropBoxUpperRightX!=0 || cropBoxUpperRightY!=0 )@b@ {@b@ changeCropBox(document,@b@ cropBoxLowerLeftX, cropBoxLowerLeftY,@b@ cropBoxUpperRightX, cropBoxUpperRightY);@b@ }@b@@b@ long startTime = System.nanoTime();@b@@b@ // render the pages@b@ boolean success = true;@b@ endPage = Math.min(endPage, document.getNumberOfPages());@b@ PDFRenderer renderer = new PDFRenderer(document);@b@ for (int i = startPage - 1; i < endPage; i++)@b@ {@b@ BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);@b@ String fileName = outputPrefix + (i + 1) + "." + imageFormat;@b@ success &= ImageIOUtil.writeImage(image, fileName, dpi);@b@ }@b@@b@ // performance stats@b@ long endTime = System.nanoTime();@b@ long duration = endTime - startTime;@b@ int count = 1 + endPage - startPage;@b@ if (showTime)@b@ {@b@ System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? 
"" : "s",@b@ duration / 1000000);@b@ }@b@@b@ if (!success)@b@ {@b@ System.err.println( "Error: no writer found for image format '"@b@ + imageFormat + "'" );@b@ System.exit(1);@b@ }@b@ }@b@ finally@b@ {@b@ if( document != null )@b@ {@b@ document.close();@b@ }@b@ }@b@ }@b@ }@b@@b@ /**@b@ * This will print the usage requirements and exit.@b@ */@b@ private static void usage()@b@ {@b@ String message = "Usage: java -jar pdfbox-app-x.y.z.jar PDFToImage [options] <inputfile>\n"@b@ + "\nOptions:\n"@b@ + " -password <password> : Password to decrypt document\n"@b@ + " -format <string> : Image format: " + getImageFormats() + "\n"@b@ + " -prefix <string> : Filename prefix for image files\n"@b@ + " -page <number> : The only page to extract (1-based)\n"@b@ + " -startPage <int> : The first page to start extraction (1-based)\n"@b@ + " -endPage <int> : The last page to extract(inclusive)\n"@b@ + " -color <int> : The color depth (valid: bilevel, gray, rgb, rgba)\n"@b@ + " -dpi <int> : The DPI of the output image\n"@b@ + " -cropbox <int> <int> <int> <int> : The page area to export\n"@b@ + " -time : Prints timing information to stdout\n"@b@ + " <inputfile> : The PDF document to use\n";@b@ @b@ System.err.println(message);@b@ System.exit( 1 );@b@ }@b@@b@ private static String getImageFormats()@b@ {@b@ StringBuilder retval = new StringBuilder();@b@ String[] formats = ImageIO.getReaderFormatNames();@b@ for( int i = 0; i < formats.length; i++ )@b@ {@b@ if (formats[i].equalsIgnoreCase(formats[i]))@b@ {@b@ retval.append( formats[i] );@b@ if( i + 1 < formats.length )@b@ {@b@ retval.append( ", " );@b@ }@b@ }@b@ }@b@ return retval.toString();@b@ }@b@@b@ private static void changeCropBox(PDDocument document, float a, float b, float c, float d)@b@ {@b@ for (PDPage page : document.getPages())@b@ {@b@ System.out.println("resizing page");@b@ PDRectangle rectangle = new PDRectangle();@b@ rectangle.setLowerLeftX(a);@b@ rectangle.setLowerLeftY(b);@b@ rectangle.setUpperRightX(c);@b@ 
rectangle.setUpperRightY(d);@b@ page.setCropBox(rectangle);@b@@b@ }@b@ }@b@}
3.PDFText2HTML关于pdf文本文件转HTML文件代码示例
package org.apache.pdfbox.tools;@b@@b@import java.io.IOException;@b@import java.util.ArrayList;@b@import java.util.HashSet;@b@import java.util.Iterator;@b@import java.util.List;@b@import java.util.Set;@b@@b@import org.apache.pdfbox.pdmodel.PDDocument;@b@import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;@b@import org.apache.pdfbox.text.PDFTextStripper;@b@import org.apache.pdfbox.text.TextPosition;@b@@b@/**@b@ * Wrap stripped text in simple HTML, trying to form HTML paragraphs. Paragraphs@b@ * broken by pages, columns, or figures are not mended.@b@ *@b@ * @author John J Barton@b@ * @b@ */@b@public class PDFText2HTML extends PDFTextStripper@b@{@b@ private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;@b@@b@ private final FontState fontState = new FontState();@b@@b@ /**@b@ * Constructor.@b@ * @throws IOException If there is an error during initialization.@b@ */@b@ public PDFText2HTML() throws IOException@b@ {@b@ super();@b@ setLineSeparator(LINE_SEPARATOR);@b@ setParagraphStart("<p>");@b@ setParagraphEnd("</p>"+ LINE_SEPARATOR);@b@ setPageStart("<div style=\"page-break-before:always; page-break-after:always\">");@b@ setPageEnd("</div>"+ LINE_SEPARATOR);@b@ setArticleStart(LINE_SEPARATOR);@b@ setArticleEnd(LINE_SEPARATOR);@b@ }@b@@b@ /**@b@ * Write the header to the output document. 
Now also writes the tag defining@b@ * the character encoding.@b@ *@b@ * @throws IOException@b@ * If there is a problem writing out the header to the document.@b@ * @deprecated deprecated, use {@link #startDocument(PDDocument)}@b@ */@b@ protected void writeHeader() throws IOException@b@ {@b@ }@b@@b@ @Override@b@ protected void startDocument(PDDocument document) throws IOException@b@ {@b@ StringBuilder buf = new StringBuilder(INITIAL_PDF_TO_HTML_BYTES);@b@ buf.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"" + "\n"@b@ + "\"http://www.w3.org/TR/html4/loose.dtd\">\n");@b@ buf.append("<html><head>");@b@ buf.append("<title>").append(escape(getTitle())).append("</title>\n");@b@ buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=\"UTF-8\">\n");@b@ buf.append("</head>\n");@b@ buf.append("<body>\n");@b@ super.writeString(buf.toString());@b@ }@b@ @b@ /**@b@ * {@inheritDoc}@b@ */@b@ @Override@b@ public void endDocument(PDDocument document) throws IOException@b@ {@b@ super.writeString("</body></html>");@b@ }@b@@b@ /**@b@ * This method will attempt to guess the title of the document using@b@ * either the document properties or the first lines of text.@b@ *@b@ * @return returns the title.@b@ */@b@ protected String getTitle()@b@ {@b@ String titleGuess = document.getDocumentInformation().getTitle();@b@ if(titleGuess != null && titleGuess.length() > 0)@b@ {@b@ return titleGuess;@b@ }@b@ else@b@ {@b@ Iterator<List<TextPosition>> textIter = getCharactersByArticle().iterator();@b@ float lastFontSize = -1.0f;@b@@b@ StringBuilder titleText = new StringBuilder();@b@ while (textIter.hasNext())@b@ {@b@ for (TextPosition position : textIter.next())@b@ {@b@ float currentFontSize = position.getFontSize();@b@ //If we're past 64 chars we will assume that we're past the title@b@ //64 is arbitrary@b@ if (currentFontSize != lastFontSize || titleText.length() > 64)@b@ {@b@ if (titleText.length() > 0)@b@ {@b@ return titleText.toString();@b@ }@b@ 
lastFontSize = currentFontSize;@b@ }@b@ if (currentFontSize > 13.0f)@b@ { // most body text is 12pt@b@ titleText.append(position.getUnicode());@b@ }@b@ }@b@ }@b@ }@b@ return "";@b@ }@b@@b@@b@ /**@b@ * Write out the article separator (div tag) with proper text direction@b@ * information.@b@ *@b@ * @param isLTR true if direction of text is left to right@b@ * @throws IOException@b@ * If there is an error writing to the stream.@b@ */@b@ @Override@b@ protected void startArticle(boolean isLTR) throws IOException@b@ {@b@ if (isLTR)@b@ {@b@ super.writeString("<div>");@b@ }@b@ else@b@ {@b@ super.writeString("<div dir=\"RTL\">");@b@ }@b@ }@b@@b@ /**@b@ * Write out the article separator.@b@ *@b@ * @throws IOException@b@ * If there is an error writing to the stream.@b@ */@b@ @Override@b@ protected void endArticle() throws IOException@b@ {@b@ super.endArticle();@b@ super.writeString("</div>");@b@ }@b@@b@ /**@b@ * Write a string to the output stream, maintain font state, and escape some HTML characters.@b@ * The font state is only preserved per word.@b@ *@b@ * @param text The text to write to the stream.@b@ * @param textPositions the corresponding text positions@b@ * @throws IOException If there is an error writing to the stream.@b@ */@b@ @Override@b@ protected void writeString(String text, List<TextPosition> textPositions) throws IOException@b@ {@b@ super.writeString(fontState.push(text, textPositions));@b@ }@b@@b@ /**@b@ * Write a string to the output stream and escape some HTML characters.@b@ *@b@ * @param chars String to be written to the stream@b@ * @throws IOException@b@ * If there is an error writing to the stream.@b@ */@b@ @Override@b@ protected void writeString(String chars) throws IOException@b@ {@b@ super.writeString(escape(chars));@b@ }@b@@b@ /**@b@ * Writes the paragraph end "</p>" to the output. 
Furthermore, it will also clear the font state.@b@ * @b@ * {@inheritDoc}@b@ */@b@ @Override@b@ protected void writeParagraphEnd() throws IOException@b@ {@b@ // do not escape HTML@b@ super.writeString(fontState.clear());@b@ @b@ super.writeParagraphEnd();@b@ }@b@@b@ /**@b@ * Escape some HTML characters.@b@ *@b@ * @param chars String to be escaped@b@ * @return returns escaped String.@b@ */@b@ private static String escape(String chars)@b@ {@b@ StringBuilder builder = new StringBuilder(chars.length());@b@ for (int i = 0; i < chars.length(); i++)@b@ {@b@ appendEscaped(builder, chars.charAt(i));@b@ }@b@ return builder.toString();@b@ }@b@@b@ private static void appendEscaped(StringBuilder builder, char character)@b@ {@b@ // write non-ASCII as named entities@b@ if ((character < 32) || (character > 126))@b@ {@b@ int charAsInt = character;@b@ builder.append("&#").append(charAsInt).append(";");@b@ }@b@ else@b@ {@b@ switch (character)@b@ {@b@ case 34:@b@ builder.append(""");@b@ break;@b@ case 38:@b@ builder.append("&");@b@ break;@b@ case 60:@b@ builder.append("<");@b@ break;@b@ case 62:@b@ builder.append(">");@b@ break;@b@ default:@b@ builder.append(String.valueOf(character));@b@ }@b@ }@b@ }@b@@b@ /**@b@ * A helper class to maintain the current font state. It's public methods will emit opening and@b@ * closing tags as needed, and in the correct order.@b@ *@b@ * @author Axel Dörfler@b@ */@b@ private static class FontState@b@ {@b@ private final List<String> stateList = new ArrayList<String>();@b@ private final Set<String> stateSet = new HashSet<String>();@b@@b@ /**@b@ * Pushes new {@link TextPosition TextPositions} into the font state. The state is only@b@ * preserved correctly for each letter if the number of letters in <code>text</code> matches@b@ * the number of {@link TextPosition} objects. 
Otherwise, it's done once for the complete@b@ * array (just by looking at its first entry).@b@ *@b@ * @return A string that contains the text including tag changes caused by its font state.@b@ */@b@ public String push(String text, List<TextPosition> textPositions)@b@ {@b@ StringBuilder buffer = new StringBuilder();@b@@b@ if (text.length() == textPositions.size())@b@ {@b@ // There is a 1:1 mapping, and we can use the TextPositions directly@b@ for (int i = 0; i < text.length(); i++)@b@ {@b@ push(buffer, text.charAt(i), textPositions.get(i));@b@ }@b@ }@b@ else if (!text.isEmpty())@b@ {@b@ // The normalized text does not match the number of TextPositions, so we'll just@b@ // have a look at its first entry.@b@ // TODO change PDFTextStripper.normalize() such that it maintains the 1:1 relation@b@ if (textPositions.isEmpty())@b@ {@b@ return text;@b@ }@b@ push(buffer, text.charAt(0), textPositions.get(0));@b@ buffer.append(escape(text.substring(1)));@b@ }@b@ return buffer.toString();@b@ }@b@@b@ /**@b@ * Closes all open states.@b@ * @return A string that contains the closing tags of all currently open states.@b@ */@b@ public String clear()@b@ {@b@ StringBuilder buffer = new StringBuilder();@b@ closeUntil(buffer, null);@b@ stateList.clear();@b@ stateSet.clear();@b@ return buffer.toString();@b@ }@b@@b@ protected String push(StringBuilder buffer, char character, TextPosition textPosition)@b@ {@b@ boolean bold = false;@b@ boolean italics = false;@b@@b@ PDFontDescriptor descriptor = textPosition.getFont().getFontDescriptor();@b@ if (descriptor != null)@b@ {@b@ bold = isBold(descriptor);@b@ italics = isItalic(descriptor);@b@ }@b@ @b@ buffer.append(bold ? open("b") : close("b"));@b@ buffer.append(italics ? 
open("i") : close("i"));@b@ appendEscaped(buffer, character);@b@@b@ return buffer.toString();@b@ }@b@@b@ private String open(String tag)@b@ {@b@ if (stateSet.contains(tag))@b@ {@b@ return "";@b@ }@b@ stateList.add(tag);@b@ stateSet.add(tag);@b@@b@ return openTag(tag);@b@ }@b@@b@ private String close(String tag)@b@ {@b@ if (!stateSet.contains(tag))@b@ {@b@ return "";@b@ }@b@ // Close all tags until (but including) the one we should close@b@ StringBuilder tagsBuilder = new StringBuilder();@b@ int index = closeUntil(tagsBuilder, tag);@b@@b@ // Remove from state@b@ stateList.remove(index);@b@ stateSet.remove(tag);@b@@b@ // Now open the states that were closed but should remain open again@b@ for (; index < stateList.size(); index++)@b@ {@b@ tagsBuilder.append(openTag(stateList.get(index)));@b@ }@b@ return tagsBuilder.toString();@b@ }@b@@b@ private int closeUntil(StringBuilder tagsBuilder, String endTag)@b@ {@b@ for (int i = stateList.size(); i-- > 0;)@b@ {@b@ String tag = stateList.get(i);@b@ tagsBuilder.append(closeTag(tag));@b@ if (endTag != null && tag.equals(endTag))@b@ {@b@ return i;@b@ }@b@ }@b@ return -1;@b@ }@b@@b@ private String openTag(String tag)@b@ {@b@ return "<" + tag + ">";@b@ }@b@@b@ private String closeTag(String tag)@b@ {@b@ return "</" + tag + ">";@b@ }@b@@b@ private boolean isBold(PDFontDescriptor descriptor)@b@ {@b@ if (descriptor.isForceBold())@b@ {@b@ return true;@b@ }@b@ return descriptor.getFontName().contains("Bold");@b@ }@b@@b@ private boolean isItalic(PDFontDescriptor descriptor)@b@ {@b@ if (descriptor.isItalic())@b@ {@b@ return true;@b@ }@b@ return descriptor.getFontName().contains("Italic");@b@ }@b@ }@b@}