<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.1</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.1</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.7</version>
</dependency>
以上是需要添加的 Maven 依赖。
下面直接上代码:
/**
 * Parses the tables of a Word file, dispatching on the file's real container
 * format: OLE2 (binary .doc) or OOXML (.docx).
 *
 * <p>The format is taken from the file's magic bytes, not from its name.
 * {@code FileMagic.valueOf(File)} probes the file on its own, so the stream
 * handed to the format-specific parser has not been read from and no second
 * "sniffing" stream is needed.
 *
 * @param filePath path of the Word file to parse
 * @return whatever {@code analysisOLE2} / {@code analysisOOXML} returns for the
 *         file, or {@code null} when the file does not exist, is neither OLE2
 *         nor OOXML, or an exception occurs while reading it
 */
private static List<List<EvaluationBaseInfo>> analysisWordTable(String filePath) {
    File file = new File(filePath);
    if (!file.exists()) {
        return null;
    }
    // Held outside the try so that a failure while closing the stream is
    // reported but does not discard an already-parsed result.
    List<List<EvaluationBaseInfo>> result = null;
    // try-with-resources guarantees the stream is closed on every path.
    try (FileInputStream fileInputStream = new FileInputStream(file)) {
        FileMagic fileMagic = FileMagic.valueOf(file);
        if (fileMagic == FileMagic.OLE2) {
            result = analysisOLE2(fileInputStream);
        } else if (fileMagic == FileMagic.OOXML) {
            result = analysisOOXML(fileInputStream);
        }
        // Any other format falls through and yields null.
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result;
}
/**
 * Extracts table data from a binary Word (.doc, OLE2) document.
 *
 * <p>For every table, row 0 and column 0 are skipped (presumably a header row
 * and a leading label/serial column, as in {@code analysisOOXML} - confirm).
 * Each remaining row becomes one {@link EvaluationBaseInfo}; the text of every
 * paragraph in the remaining cells is passed to
 * {@code EvaluationBaseInfo.covert} together with a running 1-based index.
 *
 * @param fileInputStream stream positioned at the start of the .doc file
 * @return one list per table, each holding one entry per processed row, or
 *         {@code null} if any exception occurs while reading the document
 */
private static List<List<EvaluationBaseInfo>> analysisOLE2(FileInputStream fileInputStream) {
    // Held outside the try so that a failure while closing the document is
    // reported but does not discard an already-parsed result.
    List<List<EvaluationBaseInfo>> result = null;
    // try-with-resources closes the document and then the file system, and
    // still closes the file system if closing the document throws.
    try (POIFSFileSystem pfs = new POIFSFileSystem(fileInputStream);
         HWPFDocument hwpf = new HWPFDocument(pfs)) {
        Range range = hwpf.getRange(); // whole-document range
        TableIterator it = new TableIterator(range);
        List<List<EvaluationBaseInfo>> evaluationBaseInfoList = new ArrayList<>();
        while (it.hasNext()) {
            Table tb = it.next();
            List<EvaluationBaseInfo> evaluationBaseInfos = new ArrayList<>();
            // Rows are 0-based; start at 1 to skip the first row.
            for (int i = 1; i < tb.numRows(); i++) {
                TableRow tr = tb.getRow(i);
                EvaluationBaseInfo evaluationBaseInfo = new EvaluationBaseInfo();
                // NOTE(review): index advances once per paragraph, not once per
                // cell, so a cell with several paragraphs shifts the index of
                // every following cell. analysisOOXML uses the column index
                // instead - confirm which mapping covert() expects.
                int index = 1;
                // Cells are 0-based; start at 1 to skip the first column.
                for (int j = 1; j < tr.numCells(); j++) {
                    TableCell td = tr.getCell(j);
                    for (int k = 0; k < td.numParagraphs(); k++) {
                        Paragraph para = td.getParagraph(k);
                        String text = para.text();
                        EvaluationBaseInfo.covert(text, index, evaluationBaseInfo);
                        index++;
                    }
                }
                // Match the row index against the dictionary to obtain the
                // data type, as analysisOOXML and analysisPdfTable do.
                CountyScienceDict countyScienceDict = CountyScienceDict.getOne(i);
                if (null != countyScienceDict) {
                    evaluationBaseInfo.setDataType(countyScienceDict.getType());
                }
                evaluationBaseInfos.add(evaluationBaseInfo);
            }
            evaluationBaseInfoList.add(evaluationBaseInfos);
        }
        result = evaluationBaseInfoList;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result;
}
/**
 * Extracts table data from an OOXML Word (.docx) document.
 *
 * <p>For every table, row 0 (the header) and column 0 are skipped. Each
 * remaining row becomes one {@link EvaluationBaseInfo}: every non-blank cell
 * text is passed to {@code EvaluationBaseInfo.covert} with its column index,
 * and the row index is looked up in {@code CountyScienceDict} to set the
 * data type when a dictionary entry exists.
 *
 * @param fileInputStream stream positioned at the start of the .docx file
 * @return one list per table, each holding one entry per processed row, or
 *         {@code null} if any exception occurs while reading the document
 */
private static List<List<EvaluationBaseInfo>> analysisOOXML(FileInputStream fileInputStream) {
    List<List<EvaluationBaseInfo>> parsedTables = new ArrayList<>();
    XWPFDocument document = null;
    try {
        document = new XWPFDocument(fileInputStream);
        for (XWPFTable currentTable : document.getTables()) {
            List<XWPFTableRow> tableRows = currentTable.getRows();
            List<EvaluationBaseInfo> parsedRows = new ArrayList<>();
            // Start at index 1 to skip the header row.
            int rowIdx = 1;
            while (rowIdx < tableRows.size()) {
                List<XWPFTableCell> rowCells = tableRows.get(rowIdx).getTableCells();
                EvaluationBaseInfo rowInfo = new EvaluationBaseInfo();
                // Populate the fields from the cells, ignoring blank ones.
                int colIdx = 1;
                while (colIdx < rowCells.size()) {
                    String cellText = rowCells.get(colIdx).getText();
                    if (!StringUtils.isBlank(cellText)) {
                        EvaluationBaseInfo.covert(cellText, colIdx, rowInfo);
                    }
                    colIdx++;
                }
                // Match the row index against the dictionary to obtain the
                // data_type used by the county_science table.
                CountyScienceDict dictEntry = CountyScienceDict.getOne(rowIdx);
                if (dictEntry != null) {
                    rowInfo.setDataType(dictEntry.getType());
                }
                parsedRows.add(rowInfo);
                rowIdx++;
            }
            parsedTables.add(parsedRows);
        }
        return parsedTables;
    } catch (Exception ex) {
        ex.printStackTrace();
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeEx) {
                closeEx.printStackTrace();
            }
        }
    }
    return null;
}
/**
 * Extracts table-like rows from the text of a PDF file.
 *
 * <p>Pages are read one at a time until one contains the marker pattern; if
 * no page matches, the text of the last page is used. Every line of that text
 * that starts with a digit is split on single spaces and its tokens are fed
 * to {@code EvaluationBaseInfo.covert}. The running count of such lines is
 * looked up in {@code CountyScienceDict} to set the data type when a
 * dictionary entry exists.
 *
 * @param filePath path of the PDF file to parse
 * @return a list holding a single list of parsed rows; the outer list is
 *         empty when an exception occurs before parsing completes
 */
private static List<List<EvaluationBaseInfo>> analysisPdfTable(String filePath) {
    // Always returned, so every failure path yields a non-null list.
    List<List<EvaluationBaseInfo>> evaluationBaseInfoList = new ArrayList<>();
    // try-with-resources closes the stream even when parsing fails before a
    // document has been produced.
    try (FileInputStream is = new FileInputStream(filePath)) {
        PDFParser parser = new PDFParser(new RandomAccessBuffer(is));
        parser.parse();
        try (PDDocument pdDocument = parser.getPDDocument()) {
            PDFTextStripper stripper = new PDFTextStripper();
            // false keeps the text in content-stream order (the default).
            stripper.setSortByPosition(false);
            // NOTE(review): "XXXXXX" looks like a placeholder for the text
            // that identifies the target page - confirm before use.
            Pattern pageMarker = Pattern.compile("XXXXXX");
            String pageContent = "";
            int pageCount = pdDocument.getNumberOfPages();
            for (int page = 1; page <= pageCount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pageContent = stripper.getText(pdDocument);
                if (pageMarker.matcher(pageContent).find()) {
                    break;
                }
            }
            // Accept both LF and CRLF so that no '\r' is left attached to the
            // last token of a line.
            String[] lines = pageContent.split("\\r?\\n");
            Pattern leadingDigits = Pattern.compile("^\\d+");
            int rowNumber = 0;
            List<EvaluationBaseInfo> evaluationBaseInfos = new ArrayList<>();
            for (String line : lines) {
                if (!leadingDigits.matcher(line).find()) {
                    continue;
                }
                String[] row = line.split(" ");
                EvaluationBaseInfo evaluationBaseInfo = new EvaluationBaseInfo();
                for (int i = 0; i < row.length; i++) {
                    // Field indexes start at 2 for the first token.
                    // NOTE(review): the Word parsers start at 1 - confirm
                    // this offset is intended.
                    EvaluationBaseInfo.covert(row[i], i + 2, evaluationBaseInfo);
                }
                // Match the 1-based row count against the dictionary to obtain
                // the data_type used by the county_science table.
                rowNumber++;
                CountyScienceDict countyScienceDict = CountyScienceDict.getOne(rowNumber);
                if (null != countyScienceDict) {
                    evaluationBaseInfo.setDataType(countyScienceDict.getType());
                }
                evaluationBaseInfos.add(evaluationBaseInfo);
            }
            evaluationBaseInfoList.add(evaluationBaseInfos);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println(filePath);
    }
    return evaluationBaseInfoList;
}