一、前言
定义HtmlExtractUtil页面内容抽取解析工具类，提供以下功能：按指定XSLT模板抽取页面内容(extractHtml)、获取指定url的页面字符串内容(getContent)、检测指定内容数据的字符集编码(getEncoding)、测试指定url能否正常连接(tryConnect)、解析HTTP头字符串内容(parseHeaders)等。
二、代码示例
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

/**
 * Helpers for fetching a page over a URL, detecting its character encoding,
 * running it through an XSLT template, probing a URL and parsing a raw HTTP
 * header string.
 *
 * <p>All methods are static except {@link #parseHeaders(String)}, which is an
 * instance method; the implicit public constructor is therefore kept.
 */
public class HtmlExtractUtil {

    /** Encoding used when nothing usable is declared (superset of GBK/GB2312). */
    private static final String FALLBACK_ENCODING = "GB18030";

    /** Output encoding used by {@link #extractHtml} when the caller passes {@code null}. */
    private static final String DEFAULT_OUTPUT_ENCODING = "UTF-8";

    /** Connect/read timeout, in milliseconds, used by {@link #getContent(String, Map)}. */
    private static final int FETCH_TIMEOUT_MILLIS = 180000;

    /** Connect/read timeout, in milliseconds, used by {@link #tryConnect(String)}. */
    private static final int PROBE_TIMEOUT_MILLIS = 60000;

    /** Byte-preserving charset used to scan raw bytes for ASCII encoding declarations. */
    private static final Charset SNIFF_CHARSET = Charset.forName("ISO-8859-1");

    /** Matches both {@code <meta charset="x">} and {@code <meta ... content="...; charset=x">}. */
    private static final Pattern metaCharsetPt =
            Pattern.compile("(?i)<meta\\b[^>]*?charset\\s*=\\s*[\"']?([\\w.:+-]+)");

    /** Matches the encoding pseudo-attribute of an XML declaration. */
    private static final Pattern xmlEncodingPt =
            Pattern.compile("(?i)<\\?xml\\b[^>]*?encoding\\s*=\\s*[\"']([\\w.:+-]+)[\"']");

    /** Matches the charset parameter of a Content-Type header value. */
    private static final Pattern contentTypePt =
            Pattern.compile("(?i)charset\\s*=\\s*[\"']?([\\w.:+-]+)");

    /**
     * Line separators accepted by {@link #parseHeaders(String)}. CR+TAB is the
     * separator the previous implementation looked for and is still accepted.
     */
    private static final Pattern lineBreakPt = Pattern.compile("\\r\\n|\\r\\t|\\r|\\n");

    /**
     * Applies an XSLT template to a page and returns the transformation result.
     *
     * <p>Before transforming, comment delimiters are removed (the comment text
     * itself stays), {@code <br>} tags become newlines and the first
     * {@code <html ...>} start tag is replaced by a bare {@code <html>}. The page
     * must be well-formed XML after these replacements.
     *
     * <p>NOTE(review): the XML parser runs with the JAXP implementation's default
     * settings; if pages come from untrusted sources, consider enabling
     * {@code XMLConstants.FEATURE_SECURE_PROCESSING} on the factory.
     *
     * @param pageContent    page markup to transform
     * @param template       XSLT stylesheet text
     * @param outputEncoding encoding the result is serialized and decoded with;
     *                       {@code null} means UTF-8
     * @return the transformation output as a string
     * @throws Exception if the template or page cannot be parsed or transformed,
     *                   or the encoding is not supported
     */
    public static String extractHtml(String pageContent, String template, String outputEncoding)
            throws Exception {
        String encoding = (outputEncoding != null) ? outputEncoding : DEFAULT_OUTPUT_ENCODING;

        String markup = pageContent.replaceAll("\\<\\!\\-\\-", "").replaceAll("\\-\\-\\>", "");
        markup = markup.replaceAll("(?i)<br\\s*/?>", "\n");
        // [^>]* instead of .*? so a start tag spread over several lines is matched too.
        markup = markup.replaceFirst("(?i)<html[^>]*>", "<html>");

        TransformerFactory tFactory = SAXTransformerFactory.newInstance();
        // Feed characters, not bytes: no platform-charset round trip for the strings.
        Transformer transformer = tFactory.newTransformer(new StreamSource(new StringReader(template)));
        // Serialize in the same encoding the bytes are decoded with below.
        transformer.setOutputProperty(OutputKeys.ENCODING, encoding);

        ByteArrayOutputStream outs = new ByteArrayOutputStream();
        transformer.transform(new StreamSource(new StringReader(markup)), new StreamResult(outs));
        return outs.toString(encoding);
    }

    /**
     * Fetches a URL with the default request headers.
     *
     * @param url address to fetch
     * @return the decoded response body
     * @throws IOException if the connection or read fails
     * @see #getContent(String, Map)
     */
    public static String getContent(String url) throws IOException {
        return getContent(url, null);
    }

    /**
     * Fetches a URL and decodes the body with the encoding reported by
     * {@link #getEncoding(String, byte[])}.
     *
     * @param url     address to fetch
     * @param headers request headers to send; any {@code Accept-Encoding} entry is
     *                skipped. When {@code null}, a default {@code User-Agent} and
     *                {@code Accept: *}{@code /*} are sent instead
     * @return the decoded response body
     * @throws IOException if the connection or read fails, or the detected
     *                     encoding is not supported by this JVM
     */
    public static String getContent(String url, Map<String, String> headers) throws IOException {
        URLConnection conn = null;
        InputStream in = null;
        try {
            conn = new URL(url).openConnection();
            conn.setConnectTimeout(FETCH_TIMEOUT_MILLIS);
            conn.setReadTimeout(FETCH_TIMEOUT_MILLIS);

            if (headers != null) {
                for (Map.Entry<String, String> header : headers.entrySet()) {
                    if (!"Accept-Encoding".equalsIgnoreCase(header.getKey())) {
                        conn.setRequestProperty(header.getKey(), header.getValue());
                    }
                }
            } else {
                conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; Windows NT 5.1)");
                conn.setRequestProperty("Accept", "*/*");
            }

            conn.setUseCaches(false);
            conn.connect();
            String contentEncoding = conn.getContentEncoding();
            String contentType = conn.getContentType();
            in = conn.getInputStream();

            // Locale-independent lower-casing: "GZIP" must match under any default locale.
            if (contentEncoding != null
                    && contentEncoding.toLowerCase(Locale.ENGLISH).indexOf("gzip") != -1) {
                in = new GZIPInputStream(in);
            }
            byte[] body = readFully(in);
            return new String(body, getEncoding(contentType, body));
        } finally {
            // Close the stream first, then drop the connection; do both even if close fails.
            try {
                if (in != null) {
                    in.close();
                }
            } finally {
                if (conn instanceof HttpURLConnection) {
                    ((HttpURLConnection) conn).disconnect();
                }
            }
        }
    }

    /**
     * Determines the character encoding of a response.
     *
     * <p>The charset parameter of {@code contentType} is used when present;
     * otherwise the content is scanned for a {@code <meta>} charset and then for
     * an XML declaration. The declared name is normalized: names starting with
     * "GB" map to GB18030, names this JVM supports are returned in canonical
     * form, unsupported names containing "utf" map to UTF-8, and everything
     * else (including no declaration at all) yields GB18030.
     *
     * @param contentType Content-Type header value, may be {@code null}
     * @param content     raw body bytes, may be {@code null}
     * @return a charset name, never {@code null}
     */
    public static String getEncoding(String contentType, byte[] content) {
        String declared = null;
        if (contentType != null) {
            declared = firstGroup(contentTypePt, contentType);
        }
        if (declared == null && content != null) {
            // ISO-8859-1 maps every byte to one char, so ASCII markers survive intact.
            String text = new String(content, SNIFF_CHARSET);
            declared = firstGroup(metaCharsetPt, text);
            if (declared == null) {
                declared = firstGroup(xmlEncodingPt, text);
            }
        }
        return normalizeEncoding(declared);
    }

    /**
     * Reports whether a URL answers with a 2xx status. Redirects are not followed.
     *
     * @param url address to probe; must resolve to an HTTP(S) connection
     * @return {@code true} for a status in [200, 300), {@code false} otherwise
     * @throws Exception if the URL is malformed, is not HTTP(S), or the
     *                   connection fails
     */
    public static boolean tryConnect(String url) throws Exception {
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(url).openConnection();
            conn.setConnectTimeout(PROBE_TIMEOUT_MILLIS);
            conn.setReadTimeout(PROBE_TIMEOUT_MILLIS);
            conn.setUseCaches(false);
            conn.setInstanceFollowRedirects(false);
            conn.connect();
            int status = conn.getResponseCode();
            return status >= 200 && status < 300;
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Parses a raw HTTP message head into a name/value map.
     *
     * <p>The first line (request or status line) is skipped; the following lines
     * are read as {@code Name: value} pairs until the first empty line. Lines
     * without a colon are ignored. Lines may be separated by CRLF, CR+TAB, CR or
     * LF. When a name occurs more than once the last value wins.
     *
     * @param headerStr raw header text, may be {@code null}
     * @return the parsed headers (possibly empty), or {@code null} when the input
     *         is {@code null}, empty, contains no line separator or starts with one
     */
    public Map<String, String> parseHeaders(String headerStr) {
        if (headerStr == null || headerStr.length() == 0) {
            return null;
        }
        Matcher firstBreak = lineBreakPt.matcher(headerStr);
        if (!firstBreak.find() || firstBreak.start() == 0) {
            return null;
        }

        Map<String, String> headers = new HashMap<String, String>();
        // limit -1 keeps empty strings so the blank line ending the head is visible.
        String[] lines = lineBreakPt.split(headerStr.substring(firstBreak.end()), -1);
        for (int i = 0; i < lines.length; i++) {
            String line = lines[i];
            if (line.length() == 0) {
                break;
            }
            int colon = line.indexOf(':');
            if (colon < 0) {
                continue;
            }
            headers.put(line.substring(0, colon).trim(), line.substring(colon + 1).trim());
        }
        return headers;
    }

    /** Returns group 1 of the first match of {@code pattern} in {@code text}, or {@code null}. */
    private static String firstGroup(Pattern pattern, String text) {
        Matcher matcher = pattern.matcher(text);
        return matcher.find() ? matcher.group(1) : null;
    }

    /** Maps a declared charset name to the name used for decoding; never returns {@code null}. */
    private static String normalizeEncoding(String declared) {
        if (declared == null) {
            return FALLBACK_ENCODING;
        }
        String name = declared.trim();
        String upper = name.toUpperCase(Locale.ENGLISH);
        if (upper.startsWith("GB")) {
            return FALLBACK_ENCODING;
        }
        try {
            if (name.length() > 0 && Charset.isSupported(name)) {
                return Charset.forName(name).name();
            }
        } catch (IllegalArgumentException ignored) {
            // Syntactically illegal charset name: fall through to the heuristics below.
        }
        return (upper.indexOf("UTF") != -1) ? "UTF-8" : FALLBACK_ENCODING;
    }

    /** Reads {@code in} to end-of-stream and returns the bytes; does not close it. */
    private static byte[] readFully(InputStream in) throws IOException {
        ByteArrayOutputStream sink = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int count;
        while ((count = in.read(chunk)) != -1) {
            sink.write(chunk, 0, count);
        }
        return sink.toByteArray();
    }
}