一、前言
下面以 pdfbox-2.0.9 源码中的 org.apache.pdfbox.examples.lucene.IndexPDFFiles(PDF 文件索引生成类)为例,演示如何基于 Apache Lucene 的 org.apache.lucene.index.IndexWriter 为指定目录下的所有 PDF 文件生成索引,完整代码示例如下。
二、代码示例
package org.apache.pdfbox.examples.lucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Date;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Index all pdf files under a directory.
 * <p>
 * This is a command-line application demonstrating simple Lucene indexing. Run it with no command-line arguments for
 * usage information.
 * <p>
 * It's based on a demo provided by the lucene project.
 */
public final class IndexPDFFiles
{

    private IndexPDFFiles()
    {
        // utility class, not meant to be instantiated
    }

    /**
     * Index all PDF files under a directory.
     * <p>
     * Recognized arguments: {@code -index INDEX_PATH} (defaults to {@code "index"}), {@code -docs DOCS_PATH}
     * (required) and {@code -update} (append to / update an existing index instead of recreating it). The usage
     * text is printed and the JVM exits with status 1 when {@code -docs} is missing or when {@code -index} /
     * {@code -docs} is given without a value.
     *
     * @param args command line arguments
     */
    public static void main(String[] args)
    {
        String usage = "java org.apache.pdfbox.examples.lucene.IndexPDFFiles"
                + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
                + "This indexes all PDF documents in DOCS_PATH, creating a Lucene index "
                + "in INDEX_PATH that can be searched with SearchFiles";
        String indexPath = "index";
        String docsPath = null;
        boolean create = true;
        for (int i = 0; i < args.length; i++)
        {
            String arg = args[i];
            if ("-index".equals(arg) || "-docs".equals(arg))
            {
                // both options need a value; without this check a trailing option
                // would fail with an ArrayIndexOutOfBoundsException
                if (i + 1 >= args.length)
                {
                    System.err.println("Usage: " + usage);
                    System.exit(1);
                    return;
                }
                String value = args[++i];
                if ("-index".equals(arg))
                {
                    indexPath = value;
                }
                else
                {
                    docsPath = value;
                }
            }
            else if ("-update".equals(arg))
            {
                create = false;
            }
        }

        if (docsPath == null)
        {
            System.err.println("Usage: " + usage);
            System.exit(1);
        }

        final File docDir = new File(docsPath);
        if (!docDir.exists() || !docDir.canRead())
        {
            System.out.println("Document directory '" + docDir.getAbsolutePath()
                    + "' does not exist or is not readable, please check the path");
            System.exit(1);
        }

        Date start = new Date();
        try
        {
            System.out.println("Indexing to directory '" + indexPath + "'...");

            Directory dir = FSDirectory.open(new File(indexPath));
            try
            {
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
                IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);

                if (create)
                {
                    // Create a new index in the directory, removing any
                    // previously indexed documents:
                    iwc.setOpenMode(OpenMode.CREATE);
                }
                else
                {
                    // Add new documents to an existing index:
                    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
                }

                // Optional: for better indexing performance, if you
                // are indexing many documents, increase the RAM
                // buffer. But if you do this, increase the max heap
                // size to the JVM (eg add -Xmx512m or -Xmx1g):
                //
                // iwc.setRAMBufferSizeMB(256.0);

                IndexWriter writer = new IndexWriter(dir, iwc);
                try
                {
                    indexDocs(writer, docDir);

                    // NOTE: if you want to maximize search performance,
                    // you can optionally call forceMerge here. This can be
                    // a terribly costly operation, so generally it's only
                    // worth it when your index is relatively static (ie
                    // you're done adding documents to it):
                    //
                    // writer.forceMerge(1);
                }
                finally
                {
                    // always close the writer, even when indexing fails,
                    // so that it is not leaked
                    writer.close();
                }
            }
            finally
            {
                dir.close();
            }

            Date end = new Date();
            System.out.println(end.getTime() - start.getTime() + " total milliseconds");

        }
        catch (IOException e)
        {
            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
        }
    }

    /**
     * Indexes the given file using the given writer, or if a directory is given, recurses over files and directories
     * found under the given directory.
     * <p>
     * Only files whose name ends with {@code .pdf} (case-insensitive) are indexed; every other file is reported as
     * skipped. Files that cannot be read or opened are silently ignored.
     * <p>
     * NOTE: This method indexes one document per input file. This is slow. For good throughput, put multiple documents
     * into your input file(s). An example of this is in the benchmark module, which can create "line doc" files, one
     * document per line, using the <a
     * href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
     * >WriteLineDocTask</a>.
     *
     * @param writer Writer to the index where the given file/dir info will be stored
     * @param file The file to index, or the directory to recurse into to find files to index
     * @throws IOException If there is a low-level I/O error
     */
    static void indexDocs(IndexWriter writer, File file) throws IOException
    {
        // do not try to index files that cannot be read
        if (!file.canRead())
        {
            return;
        }

        if (file.isDirectory())
        {
            String[] files = file.list();
            // an IO error could occur, in which case list() returns null
            if (files != null)
            {
                for (String fileName : files)
                {
                    indexDocs(writer, new File(file, fileName));
                }
            }
            return;
        }

        // The stream is only used as an accessibility probe; its content is never read here.
        FileInputStream fis;
        try
        {
            fis = new FileInputStream(file);
        }
        catch (FileNotFoundException fnfe)
        {
            // at least on windows, some temporary files raise this exception with an "access denied" message
            // checking if the file can be read doesn't help
            return;
        }

        try
        {
            // Locale.ROOT keeps the extension check independent of the default locale
            String name = file.getName().toLowerCase(Locale.ROOT);
            if (!name.endsWith(".pdf"))
            {
                System.out.println("Skipping " + file);
                return;
            }

            System.out.println("Indexing PDF document: " + file);
            Document doc = LucenePDFDocument.getDocument(file);

            if (writer.getConfig().getOpenMode() == OpenMode.CREATE)
            {
                // New index, so we just add the document (no old document can be there):
                System.out.println("adding " + file);
                writer.addDocument(doc);
            }
            else
            {
                // Existing index (an old copy of this document may have been indexed) so
                // we use updateDocument instead to replace the old one matching the exact
                // path, if present:
                System.out.println("updating " + file);
                writer.updateDocument(new Term("uid", LucenePDFDocument.createUID(file)), doc);
            }
        }
        finally
        {
            fis.close();
        }
    }
}