Handling PDF in Java With Apache PDFBox

Apache PDFBox is a Java library for working with PDF documents. In this post, I will show how to use Apache PDFBox to handle PDF files. The code examples in this post are based on pdfbox v2.0.29.

<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.29</version>
</dependency>

Extract Text

Extract all page text

String inputFilePath = "your/pdf/filepath";
// Load the PDF document; try-with-resources closes it even if extraction throws
try (PDDocument document = PDDocument.load(new File(inputFilePath))) {
    // Create PDFTextStripper instance
    PDFTextStripper pdfStripper = new PDFTextStripper();
    // Extract text from the PDF
    String text = pdfStripper.getText(document);
    // Print extracted text
    System.out.println(text);
}

Extract page by page

String inputFilePath = "your/pdf/filepath";
// Load the PDF document; try-with-resources closes it even if extraction throws
try (PDDocument document = PDDocument.load(new File(inputFilePath))) {
    // One stripper instance is reused; only its page window changes per iteration
    PDFTextStripper stripper = new PDFTextStripper();
    int pageCount = document.getNumberOfPages();
    // Page numbers passed to the stripper start at 1; start == end selects a single page
    for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
        stripper.setStartPage(pageNumber);
        stripper.setEndPage(pageNumber);

        String text = stripper.getText(document);
        System.out.println("Page " + pageNumber + ":");
        System.out.println(text);
    }
}

Split and Merge

Split

/**
 * Splits a PDF with a default {@code Splitter} and saves every resulting document as
 * {@code <outputDir>/<inputName>_split_<n>.pdf}, where {@code n} starts at 1.
 * (Presumably one output file per page; confirm against the Splitter defaults.)
 *
 * @param inputFilePath path of the PDF to split
 * @param outputDir     directory that receives the split files; it is not created here
 * @throws IOException if the source cannot be loaded or a split file cannot be saved
 */
private static void splitPdf(String inputFilePath, String outputDir) throws IOException {
    File file = new File(inputFilePath);
    // Strip only a trailing ".pdf" (any letter case), not every ".pdf" inside the name
    String baseName = file.getName().replaceFirst("(?i)[.]pdf$", "");
    // The source document stays open until all split documents have been saved
    try (PDDocument document = PDDocument.load(file)) {
        List<PDDocument> splitDocuments = new Splitter().split(document);
        try {
            int i = 1;
            for (PDDocument splitDocument : splitDocuments) {
                String outputFilePath = outputDir + File.separator + baseName + "_split_" + i + ".pdf";
                splitDocument.save(outputFilePath);
                i++;
            }
        } finally {
            // Close every split document, including those not yet saved when a save fails
            for (PDDocument splitDocument : splitDocuments) {
                splitDocument.close();
            }
        }
    }
    System.out.println("PDF split successfully!");
}

Merge PDF files

/**
 * Merges several PDF files into one output file.
 *
 * @param inputFilePaths paths of the PDFs to merge; they are added as sources in list order
 * @param outputFilePath path of the merged PDF to write
 * @throws IOException if a source cannot be read or the merged file cannot be written
 */
private static void mergePdfFiles(List<String> inputFilePaths, String outputFilePath) throws IOException {
    PDFMergerUtility merger = new PDFMergerUtility();
    // Add as many files as you need
    for (String inputFilePath : inputFilePaths) {
        merger.addSource(new File(inputFilePath));
    }
    merger.setDestinationFileName(outputFilePath);
    // The no-arg mergeDocuments() is deprecated in PDFBox 2.0.x; state the memory policy explicitly.
    // Fully qualified so the snippet needs no extra import.
    merger.mergeDocuments(org.apache.pdfbox.io.MemoryUsageSetting.setupMainMemoryOnly());
    System.out.println("PDF files merged successfully!");
}

Insert and remove pages

Insert pages

/**
 * Inserts a new blank page into a PDF and saves the result to a separate file.
 *
 * @param sourceFile path of the PDF to read
 * @param targetFile path of the modified PDF to write
 * @param pageIndex  zero-based position of the new page; a value equal to the current
 *                   page count appends the page at the end
 * @throws IllegalArgumentException if {@code pageIndex} is negative or greater than the page count
 * @throws IOException              if the source cannot be loaded or the target cannot be saved
 */
public static void insertPage(String sourceFile, String targetFile, int pageIndex) throws IOException {
    // try-with-resources closes the document even when validation or saving fails
    try (PDDocument sourceDoc = PDDocument.load(new File(sourceFile))) {
        int sourcePageCount = sourceDoc.getNumberOfPages();
        // Validate the requested page index
        if (pageIndex < 0 || pageIndex > sourcePageCount) {
            throw new IllegalArgumentException("Invalid page index");
        }
        // Create a new blank page
        PDPage newPage = new PDPage();
        if (pageIndex == sourcePageCount) {
            // No existing page to insert before, so append
            sourceDoc.getPages().add(newPage);
        } else {
            sourceDoc.getPages().insertBefore(newPage, sourceDoc.getPages().get(pageIndex));
        }
        // Save the modified PDF document to the target file
        sourceDoc.save(targetFile);
    }
}

Remove pages

/**
 * Removes one page from a PDF via the document's page tree and saves the result.
 *
 * @param inputFilePath  path of the PDF to read
 * @param outputFilePath path of the modified PDF to write
 * @param pageIndex      zero-based index of the page to remove
 * @throws IllegalArgumentException if {@code pageIndex} is outside {@code [0, pageCount)}
 * @throws IOException              if the source cannot be loaded or the output cannot be saved
 */
private static void removePage(String inputFilePath, String outputFilePath, int pageIndex) throws IOException {
    // try-with-resources closes the document even when validation or saving fails
    try (PDDocument sourceDoc = PDDocument.load(new File(inputFilePath))) {
        int sourcePageCount = sourceDoc.getNumberOfPages();
        // Validate the requested page index
        if (pageIndex < 0 || pageIndex >= sourcePageCount) {
            throw new IllegalArgumentException("Invalid page index");
        }
        sourceDoc.getPages().remove(pageIndex);
        sourceDoc.save(outputFilePath);
    }
}
/**
 * Removes one page from a PDF by splitting the source and re-assembling every part
 * except the one at {@code pageIndex} into a new document.
 * Takes the first page of each split part, so it assumes the Splitter yields one
 * page per part (TODO confirm against the Splitter defaults).
 *
 * @param inputFilePath  path of the PDF to read
 * @param outputFilePath path of the re-assembled PDF to write
 * @param pageIndex      zero-based index of the page to leave out
 * @throws IllegalArgumentException if {@code pageIndex} is outside {@code [0, pageCount)}
 * @throws IOException              if the source cannot be loaded or the output cannot be saved
 */
private static void removePage2(String inputFilePath, String outputFilePath, int pageIndex) throws IOException {
    try (PDDocument sourceDoc = PDDocument.load(new File(inputFilePath))) {
        int sourcePageCount = sourceDoc.getNumberOfPages();
        // Validate the requested page index
        if (pageIndex < 0 || pageIndex >= sourcePageCount) {
            throw new IllegalArgumentException("Invalid page index");
        }
        List<PDDocument> pages = new Splitter().split(sourceDoc);
        // The output is closed first (end of inner try), then the split parts, then the source
        try (PDDocument outputDocument = new PDDocument()) {
            for (int i = 0; i < pages.size(); i++) {
                if (i != pageIndex) {
                    outputDocument.addPage(pages.get(i).getPage(0));
                }
            }
            outputDocument.save(outputFilePath);
        } finally {
            // Close every split part, including the skipped one, only after the output is saved
            for (PDDocument page : pages) {
                page.close();
            }
        }
    }
}

Encryption

Encrypt

/**
 * Password-protects a PDF with printing and content extraction disabled,
 * using a 256-bit encryption key, and saves it to a new file.
 *
 * @param inputFilePath  path of the unprotected PDF to read
 * @param outputFilePath path of the protected PDF to write
 * @param password       value used for both the owner and the user password
 * @throws IOException if the source cannot be loaded or the output cannot be saved
 */
public static void encryptPdf(String inputFilePath, String outputFilePath, String password) throws IOException {
    // try-with-resources closes the document even when protecting or saving fails
    try (PDDocument doc = PDDocument.load(new File(inputFilePath))) {
        AccessPermission ap = new AccessPermission();
        // Disable printing
        ap.setCanPrint(false);
        // Disable copying
        ap.setCanExtractContent(false);
        // Disable other things if needed...

        // First argument: owner password (to open the file with all permissions)
        // Second argument: user password (to open the file but with restricted permissions)
        // NOTE(review): both are the same value here, so the restrictions above are
        // presumably not enforceable against anyone who knows the password; confirm intent.
        StandardProtectionPolicy spp = new StandardProtectionPolicy(password, password, ap);
        // Define the length of the encryption key.
        // Possible values are 40, 128 or 256.
        int keyLength = 256;
        spp.setEncryptionKeyLength(keyLength);

        // Apply protection
        doc.protect(spp);

        doc.save(outputFilePath);
    }
}

Update password

/**
 * Opens a password-protected PDF with its current password and saves a copy protected
 * by a new password, with printing and content extraction disabled and a 256-bit key.
 *
 * @param inputFilePath  path of the protected PDF to read
 * @param outputFilePath path of the re-protected PDF to write
 * @param oldPassword    password used to load the input document
 * @param newPassword    value used for both the owner and the user password of the output
 * @throws IOException if the source cannot be loaded or the output cannot be saved
 */
public static void updatePdfPassword(String inputFilePath, String outputFilePath,
                                     String oldPassword, String newPassword) throws IOException {
    // try-with-resources closes the document even when protecting or saving fails
    try (PDDocument doc = PDDocument.load(new File(inputFilePath), oldPassword)) {
        AccessPermission ap = new AccessPermission();
        // Disable printing
        ap.setCanPrint(false);
        // Disable copying
        ap.setCanExtractContent(false);
        // Disable other things if needed...

        // First argument: owner password (to open the file with all permissions)
        // Second argument: user password (to open the file but with restricted permissions)
        StandardProtectionPolicy spp = new StandardProtectionPolicy(newPassword, newPassword, ap);
        // Define the length of the encryption key.
        // Possible values are 40, 128 or 256.
        int keyLength = 256;
        spp.setEncryptionKeyLength(keyLength);

        // Apply protection
        doc.protect(spp);

        doc.save(outputFilePath);
    }
}

Remove password

/**
 * Opens a password-protected PDF and saves a copy with all security removed.
 *
 * @param inputFilePath  path of the protected PDF to read
 * @param outputFilePath path of the unprotected PDF to write
 * @param password       password used to load the input document
 * @throws IOException if the source cannot be loaded or the output cannot be saved
 */
public static void removePdfPassword(String inputFilePath, String outputFilePath,
                                     String password) throws IOException {
    // try-with-resources closes the document even when saving fails
    try (PDDocument doc = PDDocument.load(new File(inputFilePath), password)) {
        // Ask PDFBox to drop all security when the document is saved
        doc.setAllSecurityToBeRemoved(true);
        // Save the unprotected PDF document
        doc.save(outputFilePath);
    }
}

Convert to Image

PDF to Image

/**
 * Renders every page of a PDF at 300 DPI and writes each one as
 * {@code <imageFileDir>/<pdfName>_<pageNumber>.png}, with page numbers starting at 1.
 *
 * @param pdfFilePath  path of the PDF to render
 * @param imageFileDir directory that receives the PNG files; it is not created here
 * @throws IOException if the PDF cannot be loaded, a page cannot be rendered,
 *                     or an image cannot be written
 */
public static void pdfToImage(String pdfFilePath, String imageFileDir) throws IOException {
    File file = new File(pdfFilePath);
    // Strip only a trailing ".pdf" (any letter case), not every ".pdf" inside the name
    String baseName = file.getName().replaceFirst("(?i)[.]pdf$", "");
    // try-with-resources closes the document even when rendering or writing fails
    try (PDDocument document = PDDocument.load(file)) {
        // Create PDFRenderer object to render each page as an image
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        int pageCount = document.getNumberOfPages();
        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
            // Render the page as an image
            // 100 DPI: general-quality
            // 300 DPI: high-quality
            // 600 DPI: pristine-quality
            BufferedImage image = pdfRenderer.renderImageWithDPI(pageIndex, 300);
            String imageFilePath = imageFileDir + File.separator + baseName + "_" + (pageIndex + 1) + ".png";
            // ImageIO.write reports a missing writer by returning false rather than throwing
            if (!ImageIO.write(image, "PNG", new File(imageFilePath))) {
                throw new IOException("No PNG image writer available for " + imageFilePath);
            }
        }
    }
}

Image to PDF

/**
 * Creates a one-page PDF containing the given image, anchored at the page origin (x=0, y=0).
 * The image keeps its aspect ratio and is shrunk when needed so that it fits inside the
 * page's media box in both directions; it is never enlarged.
 *
 * @param imagePath path of the image file to embed
 * @param pdfPath   path of the PDF to write
 * @throws IOException if the image cannot be read or the PDF cannot be saved
 */
private static void imageToPdf(String imagePath, String pdfPath) throws IOException {
    try (PDDocument doc = new PDDocument()) {
        PDPage page = new PDPage();
        doc.addPage(page);
        // createFromFile is the easiest way with an image file
        // if you already have the image in a BufferedImage,
        // call LosslessFactory.createFromImage() instead
        PDImageXObject pdImage = PDImageXObject.createFromFile(imagePath, doc);

        // Fit against the real page size instead of a hard-coded width, and limit the
        // height too so a tall image does not run off the top of the page
        float pageWidth = page.getMediaBox().getWidth();
        float pageHeight = page.getMediaBox().getHeight();
        float widthRatio = pageWidth / pdImage.getWidth();
        float heightRatio = pageHeight / pdImage.getHeight();
        // Cap at 1 so small images are drawn at their own size rather than enlarged
        float scale = Math.min(1f, Math.min(widthRatio, heightRatio));
        // Float arithmetic avoids the truncation of integer division
        float drawWidth = pdImage.getWidth() * scale;
        float drawHeight = pdImage.getHeight() * scale;

        try (PDPageContentStream contents = new PDPageContentStream(doc, page)) {
            contents.drawImage(pdImage, 0, 0, drawWidth, drawHeight);
        }
        doc.save(pdfPath);
    }
}

Create PDFs

String outputFilePath = "output/pdf/filepath";

// try-with-resources closes the document even if drawing or saving throws
try (PDDocument document = new PDDocument()) {
    PDPage page = new PDPage(PDRectangle.A4);
    document.addPage(page);
    // Create content stream to draw on the page; it must be closed before the document is saved
    try (PDPageContentStream contentStream = new PDPageContentStream(document, page)) {
        contentStream.setFont(PDType1Font.HELVETICA, 12);
        // Insert text
        contentStream.beginText();
        contentStream.newLineAtOffset(100, 700);
        contentStream.showText("Hello, World!");
        contentStream.endText();
        // Load the image
        String imageFilePath = "C:\\Users\\Taogen\\Pictures\\icon.jpg";
        PDImageXObject image = PDImageXObject.createFromFile(imageFilePath, document);
        // Set the scale and position of the image on the page
        float scale = 0.5f; // adjust the scale as needed
        float x = 100; // x-coordinate of the image
        float y = 500; // y-coordinate of the image
        // Draw the image on the page
        contentStream.drawImage(image, x, y, image.getWidth() * scale, image.getHeight() * scale);
    }
    document.save(outputFilePath);
}

Compress (TODO)

Watermark (TODO)