Java 利用 POI scratchpad 5.2.1 将 Word 文档(doc 格式)转换成 HTML 格式(含文档里面的图片)

2023-05-16

一、POM文件

 <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>5.2.1</version>
 </dependency>

二、转换具体代码

package org.zhao.component;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;

/**
 * Converts a legacy Word {@code .doc} file (read through POI's
 * {@link HWPFDocument}) into an HTML page and exports the pictures embedded
 * in the document.
 *
 * <p>For an input {@code <parent>/<name>.doc} the converter produces:
 * <ul>
 *   <li>{@code <parent>/<name>/<name>.html} - the converted page;</li>
 *   <li>{@code <parent>/<name>/image/} - one file per embedded picture
 *       (only created when the document contains pictures).</li>
 * </ul>
 *
 * @author Administrator
 * @date 2022-03-18
 */
public class WordComponent {

    /**
     * Returns {@code filename} without its last extension.
     *
     * <p>Only the final {@code .xxx} suffix is removed, regardless of its
     * case, and a name without any dot is returned unchanged.
     *
     * @param filename simple file name (no directory part)
     * @return the name up to, but not including, the last {@code '.'}
     */
    private static String stripExtension(String filename) {
        int dot = filename.lastIndexOf('.');
        return dot < 0 ? filename : filename.substring(0, dot);
    }

    /**
     * Creates the directory {@code dirPath} (including missing parents) when
     * nothing exists at that path yet, and reports the outcome on stdout.
     *
     * @param dirPath path of the directory to create
     */
    private static void createFileDir(String dirPath) {
        File dir = new File(dirPath);
        if (!dir.exists()) {
            boolean created = dir.mkdirs();
            System.out.println(dirPath + "不存在,创建文件夹->" + created);
        }
    }

    /**
     * Converts the Word document at {@code docFilePath} to HTML.
     *
     * <p>If the file does not exist a message is printed to stderr and the
     * method returns without doing anything. A failure to write an individual
     * picture is reported on stderr and does not abort the conversion.
     *
     * @param docFilePath path of the {@code .doc} file to convert
     * @throws Exception if the document cannot be read or converted, or the
     *                   HTML file cannot be written
     */
    public static void docToHtml(String docFilePath) throws Exception {
        File file = new File(docFilePath);
        if (!file.exists()) {
            System.err.println(docFilePath + "->文件不存在");
            return;
        }

        // Output directory is named after the document; fall back to a
        // timestamp when that name is unusable (empty or overly long).
        String dirName = stripExtension(file.getName());
        if (dirName.isEmpty() || dirName.length() > 100) {
            dirName = String.valueOf(System.currentTimeMillis());
        }

        // Let File join parent and child so a separator is always inserted;
        // a null parent (relative input path) resolves to just dirName.
        File htmlDir = new File(file.getParentFile(), dirName);
        String htmlDirPath = htmlDir.getPath();
        createFileDir(htmlDirPath);

        // Directory that receives the pictures embedded in the document.
        String imagePath = htmlDirPath + "/image/";

        Document htmlDocument;
        // Both the raw stream and the parsed document hold resources, so
        // close them as soon as the DOM has been built.
        try (FileInputStream in = new FileInputStream(file);
             HWPFDocument wordDocument = new HWPFDocument(in)) {
            WordToHtmlConverter converter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // The returned string becomes the src attribute of each <img>.
            converter.setPicturesManager(
                    (content, pictureType, suggestedName, widthInches, heightInches) ->
                            imagePath + suggestedName);
            converter.processDocument(wordDocument);
            exportPictures(wordDocument, imagePath);
            htmlDocument = converter.getDocument();
        }

        writeHtml(htmlDocument, new File(htmlDirPath + "/" + dirName + ".html"));
        System.out.println("转换成功");
    }

    /**
     * Writes every picture of {@code wordDocument} into {@code imagePath},
     * creating that directory first when there is at least one picture.
     *
     * @param wordDocument the parsed Word document
     * @param imagePath    target directory path, ending with a separator
     */
    private static void exportPictures(HWPFDocument wordDocument, String imagePath) {
        List<Picture> allPictures = wordDocument.getPicturesTable().getAllPictures();
        if (CollectionUtils.isEmpty(allPictures)) {
            return;
        }
        createFileDir(imagePath);
        for (Picture picture : allPictures) {
            String target = imagePath + picture.suggestFullFileName();
            // try-with-resources so the stream is closed even on failure.
            try (FileOutputStream out = new FileOutputStream(target)) {
                picture.writeImageContent(out);
            } catch (IOException e) {
                // Best effort: one broken picture must not stop the export.
                System.err.println(target + "->图片写入失败");
                e.printStackTrace();
            }
        }
    }

    /**
     * Serializes {@code htmlDocument} to {@code target} as indented UTF-8 HTML.
     *
     * @param htmlDocument DOM produced by the converter
     * @param target       file to write
     * @throws Exception if the transformer cannot be created or the
     *                   serialization fails
     */
    private static void writeHtml(Document htmlDocument, File target) throws Exception {
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(target));
    }

    /**
     * Demo entry point: converts a sample document at a fixed location.
     *
     * @param args unused
     * @throws Exception if the conversion fails
     */
    public static void main(String[] args) throws Exception {
        docToHtml("D:/我是一篇简历.doc");
    }
}

 三、Word内容

 

四、转换后内容

 

        代码块

<html>
    <head>
        <META http-equiv="Content-Type" content="text/html; charset=utf-8">
        <style type="text/css">.b1{white-space-collapsing:preserve;}
.b2{margin: 1.0in 0.7875in 0.7875in 1.0in;}
.s1{font-weight:bold;color:black;}
.s2{color:black;}
.s3{font-weight:bold;}
.p1{text-align:center;hyphenate:auto;font-family:华文新魏;font-size:36pt;}
.p2{text-align:center;hyphenate:auto;font-family:宋体;font-size:12pt;}
.p3{text-align:center;hyphenate:auto;font-family:楷体_GB2312;font-size:12pt;}
.p4{text-align:center;hyphenate:auto;font-family:Times New Roman;font-size:12pt;}
.p5{text-indent:0.16666667in;text-align:justify;hyphenate:auto;font-family:楷体_GB2312;font-size:12pt;}
.p6{text-align:justify;hyphenate:auto;font-family:楷体_GB2312;font-size:14pt;}
.p7{text-align:justify;hyphenate:auto;font-family:宋体;font-size:12pt;}
.p8{text-align:justify;hyphenate:auto;font-family:Times New Roman;font-size:12pt;}
.p9{text-indent:0.16666667in;text-align:justify;hyphenate:auto;font-family:Times New Roman;font-size:12pt;}
.p10{text-indent:0.3125in;text-align:justify;hyphenate:auto;font-family:Times New Roman;font-size:12pt;}
.p11{text-align:justify;hyphenate:auto;font-family:宋体;font-size:11pt;}
.p12{text-align:justify;hyphenate:auto;font-family:Times New Roman;font-size:10pt;}
.td1{width:0.75in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:1.500pt solid black;border-right:thin solid black;border-top:1.500pt solid black;}
.td2{width:1.0in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:thin solid black;border-right:thin solid black;border-top:1.500pt solid black;}
.td3{width:0.875in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:thin solid black;border-right:thin solid black;border-top:1.500pt solid black;}
.td4{width:1.25in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:thin solid black;border-right:thin solid black;border-top:1.500pt solid black;}
.td5{width:0.625in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:thin solid black;border-right:thin solid black;border-top:1.500pt solid black;}
.td6{width:1.125in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:thin solid black;border-right:1.0pt solid black;border-top:1.500pt solid black;}
.td7{width:1.125in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:1.0pt solid black;border-right:1.500pt solid black;border-top:1.500pt solid black;}
.td8{width:0.75in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:1.500pt solid black;border-right:thin solid black;border-top:thin solid black;}
.td9{width:1.0in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:thin solid black;border-right:thin solid black;border-top:thin solid black;}
.td10{width:0.875in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:thin solid black;border-right:thin solid black;}
.td11{width:1.25in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:thin solid black;border-right:thin solid black;border-top:thin solid black;}
.td12{width:0.625in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:thin solid black;border-right:thin solid black;}
.td13{width:1.125in;padding-start:0.0in;padding-end:0.0in;border-bottom:thin solid black;border-left:thin solid black;border-right:1.0pt solid black;border-top:thin solid black;}
.td14{width:0.75in;padding-start:0.0in;padding-end:0.0in;border-bottom:1.0pt solid black;border-left:1.500pt solid black;border-right:thin solid black;border-top:thin solid black;}
.td15{width:1.0in;padding-start:0.0in;padding-end:0.0in;border-bottom:1.0pt solid black;border-left:thin solid black;border-right:thin solid black;border-top:thin solid black;}
.td16{width:0.875in;padding-start:0.0in;padding-end:0.0in;border-bottom:1.0pt solid black;border-left:thin solid black;border-right:thin solid black;border-top:thin solid black;}
.td17{width:3.0in;padding-start:0.0in;padding-end:0.0in;border-bottom:1.0pt solid black;border-left:thin solid black;border-right:1.0pt solid black;border-top:thin solid black;}
.td18{width:5.625in;padding-start:0.0in;padding-end:0.0in;border-bottom:1.0pt solid black;border-left:1.500pt solid black;border-right:1.0pt solid black;border-top:1.0pt solid black;}
.td19{width:0.75in;padding-start:0.0in;padding-end:0.0in;border-bottom:1.0pt solid black;border-left:1.500pt solid black;border-right:thin solid black;border-top:1.0pt solid black;}
.td20{width:6.0in;padding-start:0.0in;padding-end:0.0in;border-bottom:1.0pt solid black;border-left:thin solid black;border-right:1.500pt solid black;border-top:1.0pt solid black;}
.td21{width:6.75in;padding-start:0.0in;padding-end:0.0in;border-bottom:1.0pt solid black;border-left:1.500pt solid black;border-right:1.500pt solid black;border-top:1.0pt solid black;}
.td22{width:0.75in;padding-start:0.0in;padding-end:0.0in;border-bottom:1.500pt solid black;border-left:1.500pt solid black;border-right:thin solid black;border-top:1.0pt solid black;}
.td23{width:6.0in;padding-start:0.0in;padding-end:0.0in;border-bottom:1.500pt solid black;border-left:thin solid black;border-right:1.500pt solid black;border-top:1.0pt solid black;}
.r1{height:0.39375in;}
.r2{height:0.11180556in;}
.r3{height:0.90555555in;}
.r4{height:0.110416666in;keep-together:always;}
.r5{height:0.90555555in;keep-together:always;}
.r6{height:0.07847222in;keep-together:always;}
.r7{height:0.1388889in;}
.r8{height:0.16666667in;keep-together:always;}
.r9{height:1.18125in;keep-together:always;}
.r10{height:1.1416667in;}
.t1{table-layout:fixed;border-collapse:collapse;border-spacing:0;}
</style>
        <title>个人简历表格</title>
        <meta content="User" name="author">
    </head>
    <body class="b1 b2">
        <p class="p1">
            <span class="s1">个人简历表格</span>
        </p>
        <table class="t1">
            <tbody>
                <tr class="r1">
                    <td class="td1">
                        <p class="p2">
                            <span class="s1">姓 名</span>
                        </p>
                    </td><td class="td2">
                        <p class="p3"></p>
                    </td><td class="td3">
                        <p class="p2">
                            <span class="s1">出生年月</span>
                        </p>
                    </td><td class="td4">
                        <p class="p3"></p>
                    </td><td class="td5">
                        <p class="p4">
                            <span class="s1">民族</span>
                        </p>
                    </td><td class="td6">
                        <p class="p3"></p>
                    </td><td class="td7" rowspan="2">
                        <p class="p4">
                            <img src="D:\%E6%88%91%E6%98%AF%E4%B8%80%E7%AF%87%E7%AE%80%E5%8E%86/image/0.png" style="width:0.9677889in;height:0.9677889in;vertical-align:text-bottom;">
                        </p>
                    </td>
                </tr>
                <tr class="r1">
                    <td class="td8">
                        <p class="p2">
                            <span class="s1">籍 贯</span>
                        </p>
                    </td><td class="td9">
                        <p class="p3"></p>
                    </td><td class="td10">
                        <p class="p2">
                            <span class="s1">毕业时间</span>
                        </p>
                    </td><td class="td11">
                        <p class="p3"></p>
                    </td><td class="td12">
                        <p class="p4">
                            <span class="s1">学历</span>
                        </p>
                    </td><td class="td13">
                        <p class="p5">
                            <span class="s2">    </span>
                        </p>
                    </td>
                </tr>
                <tr class="r1">
                    <td class="td14">
                        <p class="p2">
                            <span class="s1">性 别</span>
                        </p>
                    </td><td class="td15">
                        <p class="p3"></p>
                    </td><td class="td16">
                        <p class="p2">
                            <span class="s1">专 业</span>
                        </p>
                    </td><td class="td17" colspan="3">
                        <p class="p3"></p>
                    </td>
                </tr>
                <tr class="r2">
                    <td class="td18" colspan="6">
                        <p class="p4"></p>
                    </td>
                </tr>
                <tr class="r3">
                    <td class="td19">
                        <p class="p4">
                            <span class="s1">求</span>
                        </p>
                        <p class="p4">
                            <span class="s1">职</span>
                        </p>
                        <p class="p4">
                            <span class="s1">意</span>
                        </p>
                        <p class="p4">
                            <span class="s1">向</span>
                        </p>
                    </td><td class="td20" colspan="6">
                        <p class="p6"></p>
                    </td>
                </tr>
                <tr class="r4">
                    <td class="td21" colspan="7">
                        <p class="p7"></p>
                    </td>
                </tr>
                <tr class="r5">
                    <td class="td19">
                        <p class="p2">
                            <span class="s3">兴</span>
                        </p>
                        <p class="p2">
                            <span class="s3">趣</span>
                        </p>
                        <p class="p2">
                            <span class="s3">爱</span>
                        </p>
                        <p class="p2">
                            <span class="s3">好</span>
                        </p>
                    </td><td class="td20" colspan="6">
                        <p class="p8"></p>
                    </td>
                </tr>
                <tr class="r6">
                    <td class="td21" colspan="7">
                        <p class="p9"></p>
                    </td>
                </tr>
                <tr class="r5">
                    <td class="td19">
                        <p class="p4">
                            <span class="s3">社会</span>
                        </p>
                        <p class="p4">
                            <span class="s3">实践</span>
                        </p>
                        <p class="p4">
                            <span class="s3">经验</span>
                        </p>
                    </td><td class="td20" colspan="6">
                        <p class="p7"></p>
                    </td>
                </tr>
                <tr class="r7">
                    <td class="td21" colspan="7">
                        <p class="p10"></p>
                    </td>
                </tr>
                <tr class="r3">
                    <td class="td19">
                        <p class="p2">
                            <span class="s3">在校期间担任过何种职务</span>
                        </p>
                    </td><td class="td20" colspan="6">
                        <p class="p6"></p>
                    </td>
                </tr>
                <tr class="r8">
                    <td class="td21" colspan="7">
                        <p class="p9"></p>
                    </td>
                </tr>
                <tr class="r9">
                    <td class="td19">
                        <p class="p4">
                            <span class="s1">计</span>
                        </p>
                        <p class="p4">
                            <span class="s1">算</span>
                        </p>
                        <p class="p4">
                            <span class="s1">机</span>
                        </p>
                        <p class="p4">
                            <span class="s1">水</span>
                        </p>
                        <p class="p4">
                            <span class="s1">平</span>
                        </p>
                    </td><td class="td20" colspan="6">
                        <p class="p6"></p>
                    </td>
                </tr>
                <tr class="r7">
                    <td class="td21" colspan="7">
                        <p class="p7"></p>
                    </td>
                </tr>
                <tr class="r10">
                    <td class="td22">
                        <p class="p4">
                            <span class="s1">自</span>
                        </p>
                        <p class="p4">
                            <span class="s1">我</span>
                        </p>
                        <p class="p4">
                            <span class="s1">评</span>
                        </p>
                        <p class="p4">
                            <span class="s1">价</span>
                        </p>
                    </td><td class="td23" colspan="6">
                        <p class="p11"></p>
                    </td>
                </tr>
            </tbody>
        </table>
        <p class="p12"></p>
    </body>
</html>

 

 

 

本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)

JAVA利用POI scratchpad 5.2.1 将Word文档doc格式转换成HTML 格式 含文档里面图片 的相关文章

随机推荐