我通过搜索发现了这个问题,所以我认为它值得回答。我在这里找到了 BrightTide 的答案:https://github.com/galkahana/HummusJS/issues/71#issuecomment-275956347
简单来说,有一个非常强大的 Hummus 包,它底层使用 C++ 编写的库(当然是跨平台的)。我认为可以把 GitHub 评论中给出的答案封装成如下函数:
var hummus = require('hummus');
/**
 * Converts a string into a plain array of its UTF-8 byte values.
 *
 * Uses Buffer.from() rather than the deprecated `new Buffer()` constructor:
 * the old constructor allocates a buffer of the given size when it is passed
 * a number, whereas Buffer.from() rejects non-string input with a TypeError.
 *
 * @param {string} str - input string
 * @returns {number[]} one entry (0-255) per UTF-8 encoded byte of `str`
 * @throws {TypeError} if `str` is a number (or another unsupported type)
 */
function strToByteArray(str) {
  var buffer = Buffer.from(str, 'utf8');
  // Copy into a regular Array so callers receive plain numbers, not a Buffer.
  return Array.from(buffer);
}
/**
 * Rewrites the content stream of one page of a PDF, applying a text
 * replacement, and saves the result as a new file.
 *
 * The page's `Contents` stream is read in full, decoded to a string, passed
 * through String.prototype.replace(findText, replaceText), and written back
 * as an unfiltered stream under the same object ID.
 *
 * NOTE(review): assumes the page's `Contents` entry is a single indirect
 * reference to one stream (it must expose getObjectID()) — confirm for PDFs
 * whose pages use an array of content streams.
 *
 * @param {string} sourceFile - path of the PDF to modify
 * @param {string} targetFile - path the modified PDF is written to
 * @param {number} pageNumber - page index handed to the parser's parsePage()
 * @param {string|RegExp} findText - pattern for String.prototype.replace
 * @param {string|Function} replaceText - replacement for String.prototype.replace
 */
function replaceText(sourceFile, targetFile, pageNumber, findText, replaceText) {
  var writer = hummus.createWriterToModify(sourceFile, {
    modifiedFilePath: targetFile
  });
  var sourceParser = writer.createPDFCopyingContextForModifiedFile().getSourceDocumentParser();
  var pageObject = sourceParser.parsePage(pageNumber);
  var textObjectId = pageObject.getDictionary().toJSObject().Contents.getObjectID();
  var textStream = sourceParser.queryDictionaryObject(pageObject.getDictionary(), 'Contents');

  // Read the original block of text data, 10000 bytes at a time.
  var data = [];
  var readStream = sourceParser.startReadingFromStream(textStream);
  while (readStream.notEnded()) {
    Array.prototype.push.apply(data, readStream.read(10000));
  }

  // Buffer.from() replaces the deprecated `new Buffer()` constructor.
  var string = Buffer.from(data).toString().replace(findText, replaceText);

  // Create and write our new text object, overwriting the original object ID.
  var objectsContext = writer.getObjectsContext();
  objectsContext.startModifiedIndirectObject(textObjectId);
  var stream = objectsContext.startUnfilteredPDFStream();
  stream.getWriteStream().write(strToByteArray(string));
  objectsContext.endPDFStream(stream);
  objectsContext.endIndirectObject();

  writer.end();
}
// replaceText('source.pdf', 'output.pdf', 0, /REPLACEME/g, 'My New Custom Text');
更新:
编写示例时使用的版本是 1.0.83,最近情况可能会发生变化。
更新2:
最近我在另一个使用不同字体的 PDF 文件上遇到了问题。由于某种原因,文本被拆分成了小块,例如字符串 QWERTYUIOPASDFGHJKLZXCVBNM1234567890- 被表示为 -286(Q)9(WER)24(T)-8(YUIOP)116(ASDF)19(GHJKLZX)15(CVBNM1234567890-)。
除了编写正则表达式之外,我不知道还能做什么。所以不再使用这一行:
var string = new Buffer(data).toString().replace(findText, replaceText);
我现在改用类似下面这样的代码:
// Decode the raw bytes collected in `data` into a string.
var string = Buffer.from(data).toString();
var characters = REPLACE_ME;
var match = [];
for (var a = 0; a < characters.length; a++) {
  // Escape regex metacharacters so search text containing e.g. '.', '*' or
  // '(' is matched literally instead of being parsed as regex syntax.
  var literal = characters[a].replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // Every character may be preceded by an optional signed number and an
  // optional '(' and followed by an optional ')', which covers text that was
  // split into chunks such as -286(Q)9(WER)24(T).
  match.push('(-?[0-9]+)?(\\()?' + literal + '(\\))?');
}
string = string.replace(new RegExp(match.join('')), function(m, m1) {
  // m1 is the optional number captured in front of the first character. It is
  // undefined when the match does not start with a number, so fall back to an
  // empty string rather than emitting the text "undefined".
  var prefix = m1 === undefined ? '' : m1;
  return prefix + '( ' + REPLACE_WITH_THIS + ')';
});