我正在尝试创建代码以使用 iText 版本 7.19 导出 PDF 中的图像。我在使用 Flate 编码图像时遇到一些问题。我使用的 Microsoft 免费书籍中的所有 Flate 编码图像作为示例(请参阅迁移到 Microsoft Visual Studio 2010 https://blogs.msdn.microsoft.com/microsoft_press/2010/09/13/free-ebook-moving-to-microsoft-visual-studio-2010/)总是呈粉红色,并且根据我尝试复制字节的方式,它们可能会出现扭曲。
如果我尝试一次复制所有图像字节(请参阅下面代码中的 SaveFlateEncodedImage2 方法),它们会像这样扭曲:
如果我尝试逐行复制它们(请参阅下面代码中的 SaveFlateEncodedImage 方法),它们会像这个一样呈粉红色
这是我用来导出它们的代码:
using iText.Kernel;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Filters;
using System;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Runtime.InteropServices;
namespace ITextPdfStuff
{
public class MyPdfImageExtractor
{
private readonly string _pdfFileName;
public MyPdfImageExtractor(string pdfFileName)
{
_pdfFileName = pdfFileName;
}
public void ExtractToDirectory(string directoryName)
{
using (var reader = new PdfReader(_pdfFileName))
{
// Avoid iText.Kernel.Crypto.BadPasswordException: https://stackoverflow.com/a/48065052/97803
reader.SetUnethicalReading(true);
using (var pdfDoc = new PdfDocument(reader))
{
ExtractImagesOnAllPages(pdfDoc, directoryName);
}
}
}
private void ExtractImagesOnAllPages(PdfDocument pdfDoc, string directoryName)
{
Console.WriteLine($"Number of pdf {pdfDoc.GetNumberOfPdfObjects()} objects");
// Extract objects https://itextpdf.com/en/resources/examples/itext-7/extracting-objects-pdf
for (int objNumber = 1; objNumber <= pdfDoc.GetNumberOfPdfObjects(); objNumber++)
{
PdfObject currentObject = pdfDoc.GetPdfObject(objNumber);
if (currentObject != null && currentObject.IsStream())
{
try
{
ExtractImagesOneImage(currentObject as PdfStream, Path.Combine(directoryName, $"image{objNumber}.png"));
}
catch (Exception ex)
{
Console.WriteLine($"Object number {objNumber} is NOT an image! -- error: {ex.Message}");
}
}
}
}
private void ExtractImagesOneImage(PdfStream someStream, string fileName)
{
var pdfDict = (PdfDictionary)someStream;
string subType = pdfDict.Get(PdfName.Subtype)?.ToString() ?? string.Empty;
bool isImage = subType == "/Image";
if (isImage == false)
return;
bool decoded = false;
string filter = pdfDict.Get(PdfName.Filter).ToString();
if (filter == "/FlateDecode")
{
SaveFlateEncodedImage(fileName, pdfDict, someStream.GetBytes(false));
}
else
{
byte[] imgData;
try
{
imgData = someStream.GetBytes(decoded);
}
catch (PdfException ex)
{
imgData = someStream.GetBytes(!decoded);
}
SaveNormalImage(fileName, imgData);
}
}
private void SaveNormalImage(string fileName, byte[] imgData)
{
using (var memStream = new System.IO.MemoryStream(imgData))
using (var image = System.Drawing.Image.FromStream(memStream))
{
image.Save(fileName, ImageFormat.Png);
Console.WriteLine($"{Path.GetFileName(fileName)}");
}
}
private void SaveFlateEncodedImage(string fileName, PdfDictionary pdfDict, byte[] imgData)
{
int width = int.Parse(pdfDict.Get(PdfName.Width).ToString());
int height = int.Parse(pdfDict.Get(PdfName.Height).ToString());
int bpp = int.Parse(pdfDict.Get(PdfName.BitsPerComponent).ToString());
// Example that helped: https://stackoverflow.com/a/8517377/97803
PixelFormat pixelFormat;
switch (bpp)
{
case 1:
pixelFormat = PixelFormat.Format1bppIndexed;
break;
case 8:
pixelFormat = PixelFormat.Format8bppIndexed;
break;
case 24:
pixelFormat = PixelFormat.Format24bppRgb;
break;
default:
throw new Exception("Unknown pixel format " + bpp);
}
// .NET docs https://api.itextpdf.com/iText7/dotnet/7.1.9/classi_text_1_1_kernel_1_1_pdf_1_1_filters_1_1_flate_decode_strict_filter.html
// Java docs have more detail: https://api.itextpdf.com/iText7/java/7.1.7/com/itextpdf/kernel/pdf/filters/FlateDecodeFilter.html
imgData = FlateDecodeStrictFilter.FlateDecode(imgData, true);
// byte[] streamBytes = FlateDecodeStrictFilter.DecodePredictor(imgData, pdfDict);
// Copy the image one row at a time
using (var bmp = new Bitmap(width, height, pixelFormat))
{
BitmapData bmpData = bmp.LockBits(new Rectangle(0, 0, width, height), ImageLockMode.WriteOnly, pixelFormat);
int length = (int)Math.Ceiling(width * bpp / 8.0);
for (int i = 0; i < height; i++)
{
int offset = i * length;
int scanOffset = i * bmpData.Stride;
Marshal.Copy(imgData, offset, new IntPtr(bmpData.Scan0.ToInt64() + scanOffset), length);
}
bmp.UnlockBits(bmpData);
bmp.Save(fileName, ImageFormat.Png);
}
Console.WriteLine($"FlateDecode! {Path.GetFileName(fileName)}");
}
/// <summary>This method distorts the image badly</summary>
private void SaveFlateEncodedImage2(string fileName, PdfDictionary pdfDict, byte[] imgData)
{
int width = int.Parse(pdfDict.Get(PdfName.Width).ToString());
int height = int.Parse(pdfDict.Get(PdfName.Height).ToString());
int bpp = int.Parse(pdfDict.Get(PdfName.BitsPerComponent).ToString());
// Example that helped: https://stackoverflow.com/a/8517377/97803
PixelFormat pixelFormat;
switch (bpp)
{
case 1:
pixelFormat = PixelFormat.Format1bppIndexed;
break;
case 8:
pixelFormat = PixelFormat.Format8bppIndexed;
break;
case 24:
pixelFormat = PixelFormat.Format24bppRgb;
break;
default:
throw new Exception("Unknown pixel format " + bpp);
}
// .NET docs https://api.itextpdf.com/iText7/dotnet/7.1.9/classi_text_1_1_kernel_1_1_pdf_1_1_filters_1_1_flate_decode_strict_filter.html
// Java docs have more detail: https://api.itextpdf.com/iText7/java/7.1.7/com/itextpdf/kernel/pdf/filters/FlateDecodeFilter.html
imgData = FlateDecodeStrictFilter.FlateDecode(imgData, true);
// byte[] streamBytes = FlateDecodeStrictFilter.DecodePredictor(imgData, pdfDict);
// Copy the entire image in one go
using (var bmp = new Bitmap(width, height, pixelFormat))
{
BitmapData bmpData = bmp.LockBits(new Rectangle(0, 0, width, height), ImageLockMode.WriteOnly, pixelFormat);
Marshal.Copy(imgData, 0, bmpData.Scan0, imgData.Length);
bmp.UnlockBits(bmpData);
bmp.Save(fileName, ImageFormat.Png);
}
Console.WriteLine($"FlateDecode! {Path.GetFileName(fileName)}");
}
}
}
该代码可以在 .NET Core 控制台应用程序中实例化和调用,如下所示:
string existingFileName = @"c:\temp\ReallyLongBook1.pdf";
var imageExtractor = new MyPdfImageExtractor(existingFileName);
imageExtractor.ExtractToDirectory(@"c:\temp\images");
我正在通过此代码运行以下免费的 Microsoft 书籍:迁移到 Microsoft Visual Studio 2010 https://blogs.msdn.microsoft.com/microsoft_press/2010/09/13/free-ebook-moving-to-microsoft-visual-studio-2010/
有问题的图像位于第 10 页,它是黑白的(不是粉红色的)。
我不是 PDF 专家,几天来我一直在研究这段代码,现在我挑选了一些示例来尝试将其拼凑在一起。任何能让我摆脱粉红色图像的帮助,将不胜感激。
-------2020年2月4日更新------
这是经过 MKL 建议更改后的修订版本。他的更改提取了比我更多的图像,并生成了我上面提到的书中出现的看起来正确的图像:
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
using iText.Kernel.Pdf.Xobject;
using System;
using System.Collections.Generic;
using System.IO;
namespace ITextPdfStuff
{
public class MyPdfImageExtractor
{
private readonly string _pdfFileName;
public MyPdfImageExtractor(string pdfFileName)
{
_pdfFileName = pdfFileName;
}
public void ExtractToDirectory(string directoryName)
{
using (var reader = new PdfReader(_pdfFileName))
{
// Avoid iText.Kernel.Crypto.BadPasswordException: https://stackoverflow.com/a/48065052/97803
reader.SetUnethicalReading(true);
using (var pdfDoc = new PdfDocument(reader))
{
ExtractImagesOnAllPages(pdfDoc, directoryName);
}
}
}
private void ExtractImagesOnAllPages(PdfDocument pdfDoc, string directoryName)
{
Console.WriteLine($"Number of pdf {pdfDoc.GetNumberOfPdfObjects()} objects");
IEventListener strategy = new ImageRenderListener(Path.Combine(directoryName, @"image{0}.{1}"));
PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
for (var i = 1; i <= pdfDoc.GetNumberOfPages(); i++)
{
parser.ProcessPageContent(pdfDoc.GetPage(i));
}
}
}
public class ImageRenderListener : IEventListener
{
public ImageRenderListener(string format)
{
this.format = format;
}
public void EventOccurred(IEventData data, EventType type)
{
if (data is ImageRenderInfo imageData)
{
try
{
PdfImageXObject imageObject = imageData.GetImage();
if (imageObject == null)
{
Console.WriteLine("Image could not be read.");
}
else
{
File.WriteAllBytes(string.Format(format, index++, imageObject.IdentifyImageFileExtension()), imageObject.GetImageBytes());
}
}
catch (Exception ex)
{
Console.WriteLine("Image could not be read: {0}.", ex.Message);
}
}
}
public ICollection<EventType> GetSupportedEvents()
{
return null;
}
string format;
int index = 0;
}
}