30

I'm trying to extract images from a PDF using PDFBox. The example PDF is here.

But I'm getting only blank images.

The code I'm trying:

/**
 * Program entry point: creates an extractor and runs it once.
 * An I/O failure is reported on standard output rather than rethrown.
 */
public static void main(String[] args) {
    final PDFImageExtract extractor = new PDFImageExtract();
    try {
        extractor.read_pdf();
    } catch (IOException failure) {
        // Same text the original produced via string concatenation.
        System.out.println(String.valueOf(failure));
    }
}

 /**
  * Loads the hard-coded PDF and writes every image listed in each page's
  * resources to the hard-coded output location, numbering the files from 1.
  *
  * @throws IOException if the document cannot be loaded, an image cannot be
  *                     written, or the document cannot be closed
  */
 void read_pdf() throws IOException {
    // Let a load failure propagate: the original printed it and then
    // dereferenced the still-null document, producing a NullPointerException.
    PDDocument document = PDDocument.load("C:\\Users\\Pradyut\\Documents\\MCS-034.pdf");
    try {
        List pages = document.getDocumentCatalog().getAllPages();
        int i = 1;

        for (Object entry : pages) {
            PDPage page = (PDPage) entry;
            PDResources resources = page.getResources();
            if (resources == null) {
                // Page without a resource dictionary: nothing to extract.
                continue;
            }
            Map pageImages = resources.getImages();
            if (pageImages == null) {
                continue;
            }
            for (Object value : pageImages.values()) {
                PDXObjectImage image = (PDXObjectImage) value;
                image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
                i++;
            }
        }
    } finally {
        // Always release the document, even when extraction fails part-way.
        document.close();
    }
}

Thanks

Pradyut Bhattacharya
  • 4,881
  • 13
  • 46
  • 78

8 Answers8

31

Here is code using PDFBox 2.0.1 that will get a list of all images from the PDF. This is different than the other code in that it will recurse through the document instead of trying to get the images from the top level.

/**
 * Gathers the images found under every page of the given document.
 * Each page's resources are handed to {@code getImagesFromResources}, and the
 * per-page results are concatenated in page order.
 *
 * @param document the PDF to scan
 * @return all images found, in the order they were encountered
 * @throws IOException if reading an XObject or decoding an image fails
 */
public List<RenderedImage> getImagesFromPDF(PDDocument document) throws IOException {
    final List<RenderedImage> collected = new ArrayList<>();
    for (final PDPage currentPage : document.getPages()) {
        final PDResources pageResources = currentPage.getResources();
        final List<RenderedImage> pageImages = getImagesFromResources(pageResources);
        collected.addAll(pageImages);
    }
    return collected;
}

/**
 * Recursively collects the images referenced by a resource dictionary.
 * Image XObjects are decoded and added directly; form XObjects are descended
 * into so that images nested inside forms are found as well. Other XObject
 * kinds are ignored.
 *
 * @param resources the resources to scan; may be {@code null} (a page or form
 *                  without a resource dictionary), in which case the result is empty
 * @return the images found, possibly empty, never {@code null}
 * @throws IOException if an XObject cannot be read or an image cannot be decoded
 */
private List<RenderedImage> getImagesFromResources(PDResources resources) throws IOException {
    List<RenderedImage> images = new ArrayList<>();

    // Pages and form XObjects are not guaranteed to carry resources; without
    // this guard the recursion fails with a NullPointerException on such input.
    if (resources == null) {
        return images;
    }

    for (COSName xObjectName : resources.getXObjectNames()) {
        PDXObject xObject = resources.getXObject(xObjectName);

        if (xObject instanceof PDFormXObject) {
            images.addAll(getImagesFromResources(((PDFormXObject) xObject).getResources()));
        } else if (xObject instanceof PDImageXObject) {
            images.add(((PDImageXObject) xObject).getImage());
        }
    }

    return images;
}
Matt
  • 482
  • 4
  • 7
  • This should be marked as the correct answer. This worked for me on Java11 with PDFBox 2.0.12 If anybody has trouble with jpeg2000 errors using this, then take a look at https://github.com/jai-imageio/jai-imageio-jpeg2000 – ThetaSinner Nov 04 '18 at 21:35
  • @Matt You are only considering the cases where the `xObject` is a `PDFormXObject` or a `PDImageXObject`. But according to [the Javadoc](https://pdfbox.apache.org/docs/2.0.13/javadocs/org/apache/pdfbox/pdmodel/graphics/PDXObject.html) it could also be a `PDPostScriptXObject`. Couldn't a `PDPostScriptXObject` still contain further images? – Joe7 Nov 06 '20 at 20:28
11

The GetImagesFromPDF Java class below gets all images in the 04-Request-Headers.pdf file and saves them into the destination folder PDFCopy.

import java.io.File;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;

/**
 * Command-line example that writes every image listed in the page resources of
 * a fixed PDF into a fixed destination folder. Output files are named after the
 * source file with a {@code _cover_<n>} suffix, where {@code n} counts from 1.
 */
@SuppressWarnings({ "unchecked", "rawtypes", "deprecation" })
public class GetImagesFromPDF {
    /**
     * Runs the extraction. Any failure is printed as a stack trace; a missing
     * source file is reported on standard error.
     */
    public static void main(String[] args) {
        try {
            String sourceDir = "C:/PDFCopy/04-Request-Headers.pdf";// Paste pdf files in PDFCopy folder to read
            String destinationDir = "C:/PDFCopy/";
            File oldFile = new File(sourceDir);
            if (!oldFile.exists()) {
                System.err.println("File not exists");
                return;
            }

            PDDocument document = PDDocument.load(sourceDir);
            try {
                List<PDPage> list = document.getDocumentCatalog().getAllPages();

                String fileName = oldFile.getName().replace(".pdf", "_cover");
                int totalImages = 1;
                for (PDPage page : list) {
                    PDResources pdResources = page.getResources();
                    if (pdResources == null) {
                        // Page without a resource dictionary: nothing to extract.
                        continue;
                    }

                    Map pageImages = pdResources.getImages();
                    if (pageImages == null) {
                        continue;
                    }

                    for (Object value : pageImages.values()) {
                        PDXObjectImage pdxObjectImage = (PDXObjectImage) value;
                        pdxObjectImage.write2file(destinationDir + fileName + "_" + totalImages);
                        totalImages++;
                    }
                }
            } finally {
                // The original never closed the document; release it even on failure.
                document.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

UdayKiran Pulipati
  • 6,053
  • 7
  • 60
  • 84
9

For PDFBox 2.0.1, pudaykiran's answer must be slightly modified since some APIs have been changed.

/**
 * Writes every top-level image XObject of each page of a fixed PDF to a PNG
 * file in {@code D:/Temp/}, using the current {@link System#nanoTime()} value
 * as the file name. Images nested inside form XObjects are not visited.
 *
 * @throws Exception if the document cannot be loaded or an image cannot be written
 */
public static void testPDFBoxExtractImages() throws Exception {
    // try-with-resources closes the document, which the original left open.
    try (PDDocument document = PDDocument.load(new File("D:/Temp/Test.pdf"))) {
        for (PDPage page : document.getPages()) {
            PDResources pdResources = page.getResources();
            if (pdResources == null) {
                // Page without a resource dictionary: nothing to extract.
                continue;
            }
            for (COSName c : pdResources.getXObjectNames()) {
                PDXObject o = pdResources.getXObject(c);
                if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                    File file = new File("D:/Temp/" + System.nanoTime() + ".png");
                    ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject)o).getImage(), "png", file);
                }
            }
        }
    }
}
PerseusBC
  • 131
  • 1
  • 4
2

Just add the .jpeg to the end of your path:

// Same call as in the question, but with ".jpeg" appended to the target path.
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i + ".jpeg");

That works for me.

t0mm13b
  • 32,846
  • 7
  • 71
  • 106
Jorge
  • 21
  • 2
2

You can use PDPage.convertToImage() function which can convert the PDF page into a BufferedImage. Next you can use the BufferedImage to create an Image.

Use the following reference for further detail:

And do not forget to look for PDPage.convertToImage() function in PDPage class.

рüффп
  • 4,475
  • 34
  • 62
  • 99
Abhay Pai
  • 316
  • 4
  • 17
1

For anyone who just wants to copy and paste, here is ready-to-use code:

import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.UUID;

/**
 * Extracts the images drawn by a PDF's page content streams and writes each one
 * as a PNG with a random UUID file name into the configured output directory.
 * Works by intercepting the {@code Do} (draw XObject) operator while the
 * stream engine processes each page.
 */
public class ExtractImagesUseCase extends PDFStreamEngine{
    private final String filePath;
    private final String outputDir;

    /**
     * @param filePath  path of the PDF file to read
     * @param outputDir directory that receives the extracted PNG files
     */
    public ExtractImagesUseCase(String filePath,
                                String outputDir){
        this.filePath = filePath;
        this.outputDir = outputDir;
    }

    /**
     * Loads the PDF and processes every page. An {@link IOException} is printed
     * as a stack trace rather than propagated.
     */
    public void execute(){
        File file = new File(filePath);

        // try-with-resources closes the document, which the original left open.
        try(PDDocument document = PDDocument.load(file)){
            for(PDPage page : document.getPages()){
                processPage(page);
            }
        }catch(IOException e){
            e.printStackTrace();
        }
    }

    /**
     * Handles the {@code Do} operator: image XObjects are written to disk and
     * form XObjects are processed recursively via {@code showForm}. Every other
     * operator is delegated to the superclass.
     *
     * @throws IOException if an XObject cannot be read or an image cannot be written
     */
    @Override
    protected void processOperator(Operator operator, List<COSBase> operands) throws IOException{
        String operation = operator.getName();

        if("Do".equals(operation)){
            // A malformed "Do" without a name operand cannot reference an XObject;
            // skip it instead of failing with an index or class-cast exception.
            if(operands.isEmpty() || !(operands.get(0) instanceof COSName)){
                return;
            }
            COSName objectName = (COSName) operands.get(0);
            PDXObject pdxObject = getResources().getXObject(objectName);

            if(pdxObject instanceof PDImageXObject){
                // Image
                PDImageXObject image = (PDImageXObject) pdxObject;
                BufferedImage bImage = image.getImage();

                // File
                String randomName = UUID.randomUUID().toString();
                File outputFile = new File(outputDir,randomName + ".png");

                // Write image to file
                ImageIO.write(bImage, "PNG", outputFile);

            }else if(pdxObject instanceof PDFormXObject){
                PDFormXObject form = (PDFormXObject) pdxObject;
                showForm(form);
            }
        }

        else super.processOperator(operator, operands);
    }
}

Demo

public class ExtractImageDemo{
    public static void main(String[] args){
        String filePath = "C:\\Users\\John\\Downloads\\Documents\\sample-file.pdf";
        String outputDir = "C:\\Users\\John\\Downloads\\Documents\\Output";

        ExtractImagesUseCase useCase = new ExtractImagesUseCase(
                filePath,
                outputDir
        );
        useCase.execute();
    }
}
NM Naufaldo
  • 300
  • 3
  • 11
0

Instead of calling

// The question's original call, which passes the target path without any extension.
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);

You can use the ImageIO.write() static method to write the RGB image out in whatever format you need. Here I've used PNG:

// Encode the image's RGB rendering as PNG; the ".png" suffix is added to the path explicitly.
final String pngPath = "C:\\Users\\Pradyut\\Documents\\image" + i + ".png";
final File outputFile = new File(pngPath);
ImageIO.write(image.getRGBImage(), "png", outputFile);
nickb
  • 56,839
  • 11
  • 91
  • 130
0

This is a kotlin version of @Matt's answer.

/**
 * Applies [block] to every image reachable from these resources and returns the
 * results. Image XObjects are passed to [block] directly; form XObjects are
 * descended into recursively; any other XObject kind contributes nothing.
 */
fun <R> PDResources.onImageResources(block: (RenderedImage) -> (R)): List<R> =
        this.xObjectNames.flatMap {
            when (val xObject = this.getXObject(it)) {
                // A form may have no resources of its own; treat that as "no images"
                // instead of failing with a NullPointerException.
                is PDFormXObject -> xObject.resources?.onImageResources(block).orEmpty()
                is PDImageXObject -> listOf(block(xObject.image))
                else -> emptyList()
            }
        }

You can use it on PDPage Resources like this:

// Writes each image to its own temp file and yields the file's path.
// Note: createTempFile uses the suffix verbatim, so include the leading dot
// (e.g. ".jpeg") if a real file extension is wanted.
page.resources.onImageResources { image ->
    Files.createTempFile("image", "xxx").also { path ->
        // Fixed: the original referenced undefined `it`/`file` and built the
        // exception without throwing it.
        if (!ImageIO.write(image, "xxx", path.toFile()))
            throw IllegalStateException("Couldn't write image to file")
    }
}

Where "xxx" is the format you need (like "jpeg")

d0x
  • 9,378
  • 14
  • 58
  • 93