POI读取Word2007+

0

很多人使用POI读取Word的时候都会这么写:

// Naive approach: open the .docx file, wrap the stream in an XWPFDocument and
// print whatever text XWPFWordExtractor returns for the document.
InputStream inputStream = new FileInputStream(new File("e://company/test.docx"));
XWPFDocument document = new XWPFDocument(inputStream);
System.out.println(new XWPFWordExtractor(document).getText());

但是这种写法其实有不少问题:文本框里面的内容读取不到,换行也有问题。这里我改进了一下:

/**
 * Reads the text of a Word 2007+ (.docx) file.
 * <p>
 * The output is assembled in this order: text boxes anchored in body
 * paragraphs, text boxes anchored in table cells, body paragraph text, and
 * finally table cell text. Every paragraph is terminated with
 * {@code NEW_LINE}.
 *
 * @param filePath path of the .docx file
 * @return the extracted text, or {@code null} if an {@link IOException}
 *         occurred while reading the file (the error is logged)
 */
private static String read2007(String filePath) {
	InputStream inputStream = null;
	StringBuilder content = new StringBuilder();
	try {
		inputStream = new FileInputStream(new File(filePath));
		XWPFDocument document = new XWPFDocument(inputStream);
		List<XWPFParagraph> bodyParagraphs = document.getParagraphs();
		List<XWPFParagraph> cellParagraphs = collectCellParagraphs(document);
		// Text boxes first (outside tables, then inside tables) ...
		appendTextBoxContent(content, bodyParagraphs);
		appendTextBoxContent(content, cellParagraphs);
		// ... then the plain paragraph text of the body and of the table cells.
		appendParagraphText(content, bodyParagraphs);
		appendParagraphText(content, cellParagraphs);
		return content.toString();
	} catch (IOException e) {
		logger.error("解析word错误,文件地址:" + filePath, e);
	} finally {
		IOUtils.closeQuietly(inputStream);
	}
	return null;
}

/**
 * Collects the paragraphs of every cell of every top-level table, in
 * table / row / cell order.
 */
private static List<XWPFParagraph> collectCellParagraphs(XWPFDocument document) {
	List<XWPFParagraph> paragraphs = new ArrayList<XWPFParagraph>();
	for (XWPFTable table : document.getTables()) {
		for (XWPFTableRow row : table.getRows()) {
			for (XWPFTableCell cell : row.getTableCells()) {
				paragraphs.addAll(cell.getParagraphs());
			}
		}
	}
	return paragraphs;
}

/**
 * Appends the text-box text found in the runs of the given paragraphs.
 * Runs without a text box contribute nothing, so no blank lines are added.
 */
private static void appendTextBoxContent(StringBuilder content, List<XWPFParagraph> paragraphs) {
	for (XWPFParagraph paragraph : paragraphs) {
		for (XWPFRun run : paragraph.getRuns()) {
			// getXMLContent already terminates each text-box paragraph with NEW_LINE.
			String boxText = getXMLContent(run.getCTR().newCursor().xmlText());
			if (StringUtils.isNotEmpty(boxText)) {
				content.append(boxText);
			}
		}
	}
}

/** Appends the text of each paragraph followed by NEW_LINE. */
private static void appendParagraphText(StringBuilder content, List<XWPFParagraph> paragraphs) {
	for (XWPFParagraph paragraph : paragraphs) {
		content.append(paragraph.getText()).append(NEW_LINE);
	}
}

/**
 * Extracts text-box text from the XML of a single run. (The same result could
 * also be obtained by walking cursor.getDomNode() recursively.)
 * <p>
 * Text is only collected when the root element declares the namespace prefix
 * held in NAMESPANCE_OF_TEXTBOX_IN_TABLE; each paragraph found under an
 * mc:Fallback element is followed by NEW_LINE.
 *
 * @param xml XML text of the run
 * @return the collected text; empty when the prefix is not declared or when
 *         the XML cannot be parsed (the parse error is logged)
 */
private static String getXMLContent(String xml) {
	StringBuffer text = new StringBuffer();
	try {
		Document parsed = DocumentHelper.parseText(xml);
		// Scan the root's namespace declarations for the text-box prefix.
		List<?> declared = parsed.getRootElement().declaredNamespaces();
		boolean prefixDeclared = false;
		for (int i = 0; i < declared.size() && !prefixDeclared; i++) {
			Namespace candidate = (Namespace) declared.get(i);
			prefixDeclared = NAMESPANCE_OF_TEXTBOX_IN_TABLE.equals(candidate.getPrefix());
		}
		if (prefixDeclared) {
			for (Object paragraph : parsed.selectNodes("//mc:Fallback//w:p")) {
				for (Object textNode : ((Node) paragraph).selectNodes(".//w:t")) {
					String piece = ((Node) textNode).getText();
					if (StringUtils.isNotEmpty(piece)) {
						text.append(piece);
					}
				}
				text.append(NEW_LINE);
			}
		}
	} catch (DocumentException e) {
		logger.error("XML转化错误,内容:" + xml, e);
	}
	return text.toString();
}

2003版本简单一些:

/**
 * Reads the text of a Word 2003 (.doc) file.
 * <p>
 * Text-box paragraphs are emitted first, followed by the paragraphs of the
 * main range; empty paragraphs are skipped and every emitted paragraph is
 * terminated with {@code NEW_LINE}.
 *
 * @param filePath path of the .doc file
 * @return the extracted text, or {@code null} if an {@link IOException}
 *         (including a missing file) occurred; the error is logged
 */
private static String read2003(String filePath) {
	InputStream inputStream = null;
	StringBuilder content = new StringBuilder();
	try {
		inputStream = new FileInputStream(new File(filePath));
		HWPFDocument document = new HWPFDocument(inputStream);
		// Fetch each range once instead of rebuilding it on every iteration.
		Range textBoxes = document.getMainTextboxRange(); // text boxes
		for (int i = 0, count = textBoxes.numParagraphs(); i < count; i++) {
			String text = textBoxes.getParagraph(i).text();
			if (StringUtils.isNotEmpty(text)) {
				content.append(text).append(NEW_LINE);
			}
		}
		Range body = document.getRange(); // everything that is not a text box
		for (int i = 0, count = body.numParagraphs(); i < count; i++) {
			// Note from the original author: without trim() the output
			// contains garbled characters.
			String text = StringUtils.trimToEmpty(body.getParagraph(i).text());
			if (text.length() > 0) {
				content.append(text).append(NEW_LINE);
			}
		}
		return content.toString();
	} catch (IOException e) {
		// FileNotFoundException is an IOException and was handled identically.
		logger.error("解析word错误,文件地址:" + filePath, e);
	} finally {
		IOUtils.closeQuietly(inputStream);
	}
	return null;
}

注意:以上代码读取出的内容包括表格里面的内容、文本框里面的内容以及直接写在编辑区里面的文本;批注、引用等其他信息可能读取不到,如有需要请自行解决。

比较完整的代码:

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Namespace;
import org.dom4j.Node;

/**
 * WordReaderUtils - extracts plain text from Word documents.
 * <p>
 * Dispatches on the file extension: {@code .doc} files are read with HWPF,
 * {@code .docx} files with XWPF. For both formats the text of text boxes is
 * emitted in addition to the paragraph text.
 * 
 * @author 500d Team
 * @version 1.0
 */
public class WordReaderUtils {

	private static final String WORD_2003 = "doc";
	private static final String WORD_2007 = "docx";
	/**
	 * Namespace prefix whose declaration on a run's XML marks the run as
	 * containing a text box. NOTE(review): this constant was referenced but
	 * never declared; "mc" is assumed because the XPath in
	 * {@link #getXMLContent(String)} needs that prefix to be bound - confirm
	 * against the original project.
	 */
	private static final String NAMESPANCE_OF_TEXTBOX_IN_TABLE = "mc";
	private static final Logger logger = Logger.getLogger(WordReaderUtils.class);
	public static final String NEW_LINE = "\r\n";
	
	/**
	 * Reads the text of a Word file, choosing the reader by file extension
	 * (case-insensitive).
	 *
	 * @param filePath path of a .doc or .docx file
	 * @return {@code null} when the path is empty, does not denote an existing
	 *         regular file, or has no extension; otherwise the result of
	 *         {@code Crossover.handle} applied to the extracted text (which is
	 *         {@code null} for unsupported extensions or read failures)
	 */
	public static String read(String filePath) {
		if (StringUtils.isEmpty(filePath)) {
			return null;
		}
		// isFile() is false for paths that do not exist, so one check suffices.
		if (!new File(filePath).isFile()) {
			return null;
		}
		String extension = FilenameUtils.getExtension(filePath);
		if (StringUtils.isEmpty(extension)) {
			return null;
		}
		String content = null;
		if (WORD_2003.equalsIgnoreCase(extension)) {
			content = read2003(filePath);
		} else if (WORD_2007.equalsIgnoreCase(extension)) {
			content = read2007(filePath);
		}
		return Crossover.handle(content);
	}
	
	/**
	 * Reads the text of a Word 2003 (.doc) file.
	 * <p>
	 * Text-box paragraphs are emitted first, followed by the paragraphs of the
	 * main range; empty paragraphs are skipped and every emitted paragraph is
	 * terminated with {@link #NEW_LINE}.
	 *
	 * @param filePath path of the .doc file
	 * @return the extracted text, or {@code null} if an {@link IOException}
	 *         (including a missing file) occurred; the error is logged
	 */
	private static String read2003(String filePath) {
		InputStream inputStream = null;
		StringBuilder content = new StringBuilder();
		try {
			inputStream = new FileInputStream(new File(filePath));
			HWPFDocument document = new HWPFDocument(inputStream);
			// Fetch each range once instead of rebuilding it on every iteration.
			Range textBoxes = document.getMainTextboxRange(); // text boxes
			for (int i = 0, count = textBoxes.numParagraphs(); i < count; i++) {
				String text = textBoxes.getParagraph(i).text();
				if (StringUtils.isNotEmpty(text)) {
					content.append(text).append(NEW_LINE);
				}
			}
			Range body = document.getRange(); // everything that is not a text box
			for (int i = 0, count = body.numParagraphs(); i < count; i++) {
				// Note from the original author: without trim() the output
				// contains garbled characters.
				String text = StringUtils.trimToEmpty(body.getParagraph(i).text());
				if (text.length() > 0) {
					content.append(text).append(NEW_LINE);
				}
			}
			return content.toString();
		} catch (IOException e) {
			// FileNotFoundException is an IOException and was handled identically.
			logger.error("解析word错误,文件地址:" + filePath, e);
		} finally {
			IOUtils.closeQuietly(inputStream);
		}
		return null;
	}
	
	/**
	 * Reads the text of a Word 2007+ (.docx) file.
	 * <p>
	 * The output is assembled in this order: text boxes anchored in body
	 * paragraphs, text boxes anchored in table cells, body paragraph text, and
	 * finally table cell text. Every paragraph is terminated with
	 * {@link #NEW_LINE}.
	 *
	 * @param filePath path of the .docx file
	 * @return the extracted text, or {@code null} if an {@link IOException}
	 *         occurred while reading the file (the error is logged)
	 */
	private static String read2007(String filePath) {
		InputStream inputStream = null;
		StringBuilder content = new StringBuilder();
		try {
			inputStream = new FileInputStream(new File(filePath));
			XWPFDocument document = new XWPFDocument(inputStream);
			List<XWPFParagraph> bodyParagraphs = document.getParagraphs();
			List<XWPFParagraph> cellParagraphs = collectCellParagraphs(document);
			// Text boxes first (outside tables, then inside tables) ...
			appendTextBoxContent(content, bodyParagraphs);
			appendTextBoxContent(content, cellParagraphs);
			// ... then the plain paragraph text of the body and of the table cells.
			appendParagraphText(content, bodyParagraphs);
			appendParagraphText(content, cellParagraphs);
			return content.toString();
		} catch (IOException e) {
			logger.error("解析word错误,文件地址:" + filePath, e);
		} finally {
			IOUtils.closeQuietly(inputStream);
		}
		return null;
	}
	
	/**
	 * Collects the paragraphs of every cell of every top-level table, in
	 * table / row / cell order.
	 */
	private static List<XWPFParagraph> collectCellParagraphs(XWPFDocument document) {
		List<XWPFParagraph> paragraphs = new ArrayList<XWPFParagraph>();
		for (XWPFTable table : document.getTables()) {
			for (XWPFTableRow row : table.getRows()) {
				for (XWPFTableCell cell : row.getTableCells()) {
					paragraphs.addAll(cell.getParagraphs());
				}
			}
		}
		return paragraphs;
	}
	
	/**
	 * Appends the text-box text found in the runs of the given paragraphs.
	 * Runs without a text box contribute nothing, so no blank lines are added.
	 */
	private static void appendTextBoxContent(StringBuilder content, List<XWPFParagraph> paragraphs) {
		for (XWPFParagraph paragraph : paragraphs) {
			for (XWPFRun run : paragraph.getRuns()) {
				// getXMLContent already terminates each text-box paragraph with NEW_LINE.
				String boxText = getXMLContent(run.getCTR().newCursor().xmlText());
				if (StringUtils.isNotEmpty(boxText)) {
					content.append(boxText);
				}
			}
		}
	}
	
	/** Appends the text of each paragraph followed by NEW_LINE. */
	private static void appendParagraphText(StringBuilder content, List<XWPFParagraph> paragraphs) {
		for (XWPFParagraph paragraph : paragraphs) {
			content.append(paragraph.getText()).append(NEW_LINE);
		}
	}
	
	/**
	 * Extracts text-box text from the XML of a single run. (The same result
	 * could also be obtained by walking cursor.getDomNode() recursively.)
	 * <p>
	 * Text is only collected when the root element declares the
	 * {@link #NAMESPANCE_OF_TEXTBOX_IN_TABLE} prefix; each paragraph found
	 * under an mc:Fallback element is followed by {@link #NEW_LINE}.
	 *
	 * @param xml XML text of the run
	 * @return the collected text; empty when the prefix is not declared or
	 *         when the XML cannot be parsed (the parse error is logged)
	 */
	private static String getXMLContent(String xml) {
		StringBuilder content = new StringBuilder();
		try {
			Document document = DocumentHelper.parseText(xml);
			// The XPath below is only evaluated when the root declares the
			// text-box prefix.
			boolean hasTextBox = false;
			List<?> namespaces = document.getRootElement().declaredNamespaces();
			for (Object object : namespaces) {
				if (NAMESPANCE_OF_TEXTBOX_IN_TABLE.equals(((Namespace) object).getPrefix())) {
					hasTextBox = true;
					break;
				}
			}
			if (!hasTextBox) {
				return "";
			}
			for (Object paragraph : document.selectNodes("//mc:Fallback//w:p")) {
				for (Object textNode : ((Node) paragraph).selectNodes(".//w:t")) {
					String text = ((Node) textNode).getText();
					if (StringUtils.isNotEmpty(text)) {
						content.append(text);
					}
				}
				content.append(NEW_LINE);
			}
		} catch (DocumentException e) {
			logger.error("XML转化错误,内容:" + xml, e);
		}
		return content.toString();
	}
	
	/**
	 * Manual entry point for trying the reader; does nothing unless one of
	 * the sample calls below is enabled.
	 */
	public static void main(String[] args) throws Exception {
//		System.out.println(read("e://company/test.doc"));
//		System.out.println(read("e://company/test.docx"));
	}
	
}